Levenshtein distance

From Lazarus wiki
Jump to navigationJump to search
The printable version is no longer supported and may have rendering errors. Please update your browser bookmarks and please use the default browser print function instead.

English (en)

The Levenshtein distance is a string metric for measuring the difference between two sequences. Informally, the Levenshtein distance between two words is the minimum number of single-character edits (insertion, deletion, substitution) required to change one word into the other.

This works on both ASCII and UTF-8 encoding. This implementations use two-dimensional dynamic array to store the distances of prefixes of the words compared.


uses
  Classes, SysUtils, Math, LazUTF8;

function LevenshteinDistance(const s1 : string; s2 : string) : integer;
function LevenshteinDistanceText(const s1, s2: string): Integer;

implementation

{------------------------------------------------------------------------------
  Name:    LevenshteinDistance
  Params: s1, s2 - UTF8 encoded strings
  Returns: Minimum number of single-character edits.
  Compare 2 UTF8 encoded strings, case sensitive.
------------------------------------------------------------------------------}

function LevenshteinDistance(const s1 : string; s2 : string) : integer;
var
  length1, length2, i, j ,
  value1, value2, value3 : integer;
  matrix : array of array of integer;
begin
  length1 := UTF8Length( s1 );
  length2 := UTF8Length( s2 );
  SetLength (matrix, length1 + 1, length2 + 1);
  for i := 0 to length1 do matrix [i, 0] := i;
  for j := 0 to length2 do matrix [0, j] := j;
  for i := 1 to length1 do
    for j := 1 to length2 do
      begin
        if UTF8Copy( s1, i, 1) = UTF8Copy( s2, j, 1 )
          then matrix[i,j] := matrix[i-1,j-1]
          else  begin
            value1 := matrix [i-1, j] + 1;
            value2 := matrix [i, j-1] + 1;
            value3 := matrix[i-1, j-1] + 1;
            matrix [i, j] := min( value1, min( value2, value3 ));
          end;
      end;
  result := matrix [length1, length2];
end;

{------------------------------------------------------------------------------
  Name:    LevenshteinDistanceText
  Params: s1, s2 - UTF8 encoded strings
  Returns: Minimum number of single-character edits.
  Compare 2 UTF8 encoded strings, case insensitive.
------------------------------------------------------------------------------}
function LevenshteinDistanceText(const s1, s2: string): integer;
var
  s1lower, s2lower: string;
begin
  s1lower := UTF8LowerCase( s1 );
  s2lower := UTF8LowerCase( s2 );
  result := LevenshteinDistance( s1lower, s2lower );
end;

end.