Friday, December 26, 2008

Levenshtein distance - Code Snippets - Wikibooks

Levenshtein distance - Code Snippets - Wikibooks

VBA (with Damerau extension)
This VBA version uses recursion and a maximum allowed distance. The analysis to identify quasi-duplicated strings or spelling mistakes can be narrowed down to a corridor in the result matrix. The Damerau extension has also been added to the Levenshtein algorithm.


Function damerau_levenshtein(s1 As String, s2 As String, Optional limit As Variant, Optional result As Variant) As Integer
'This function returns the Levenshtein distance capped by the limit parameter.
'Usage : e.g. damerau_levenshtein("Thibault","Gorisse") to get the exact distance
' or damerau_levenshtein("correctly written words","corectly writen words",4) to identify similar spellings

Dim diagonal As Integer
Dim horizontal As Integer
Dim vertical As Integer
Dim swap As Integer
Dim final As Integer

'set a maximum limit if not set
If IsMissing(limit) Then
limit = Len(s1) + Len(s2)
End If

'create the result matrix to store intermediary results
If IsMissing(result) Then
Dim i, j As Integer ' pointeur
ReDim result(Len(s1), Len(s2)) As Integer
End If

'Start of the strings analysis
If result(Len(s1), Len(s2)) <>= limit Then
final = limit
Else
If Len(s1) = 0 Or Len(s2) = 0 Then
'End of recursivity
final = Len(s1) + Len(s2)
Else

'Core of levenshtein algorithm
If Mid(s1, 1, 1) = Mid(s2, 1, 1) Then
final = damerau_levenshtein(Mid(s1, 2), Mid(s2, 2), limit, result)
Else

If Mid(s1, 1, 1) = Mid(s2, 2, 1) And Mid(s1, 2, 1) = Mid(s2, 1, 1) Then
'Damerau extension counting swapped letters
swap = damerau_levenshtein(Mid(s1, 3), Mid(s2, 3), limit - 1, result)
final = 1 + swap
Else
'The function minimum is implemented via the limit parameter.
'The diagonal search usually reaches the limit the quickest.
diagonal = damerau_levenshtein(Mid(s1, 2), Mid(s2, 2), limit - 1, result)
horizontal = damerau_levenshtein(Mid(s1, 2), s2, diagonal, result)
vertical = damerau_levenshtein(s1, Mid(s2, 2), horizontal, result)
final = 1 + vertical
End If
End If

End If
End If
Else
'retrieve intermediate result
final = result(Len(s1), Len(s2)) - 1
End If

'returns the distance capped by the limit
If final < limit Then
damerau_levenshtein = final
'store intermediate result
result(Len(s1), Len(s2)) = final + 1
Else
damerau_levenshtein = limit
End If

End Function

No comments: