1e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng#include "cache.h" 2e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng#include "levenshtein.h" 3e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng 4e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng/* 5e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng * This function implements the Damerau-Levenshtein algorithm to 6e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng * calculate a distance between strings. 7e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng * 8e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng * Basically, it says how many letters need to be swapped, substituted, 9e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng * deleted from, or added to string1, at least, to get string2. 10e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng * 11e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng * The idea is to build a distance matrix for the substrings of both 12e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng * strings. To avoid a large space complexity, only the last three rows 13e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng * are kept in memory (if swaps had the same or higher cost as one deletion 14e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng * plus one insertion, only two rows would be needed). 15e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng * 16e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng * At any stage, "i + 1" denotes the length of the current substring of 17e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng * string1 that the distance is calculated for. 18e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng * 19e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng * row2 holds the current row, row1 the previous row (i.e. for the substring 20e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng * of string1 of length "i"), and row0 the row before that. 21e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng * 22e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng * In other words, at the start of the big loop, row2[j + 1] contains the 23e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng * Damerau-Levenshtein distance between the substring of string1 of length 24e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng * "i" and the substring of string2 of length "j + 1". 25e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng * 26e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng * All the big loop does is determine the partial minimum-cost paths. 27e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng * 28e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng * It does so by calculating the costs of the path ending in characters 29e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng * i (in string1) and j (in string2), respectively, given that the last 30e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng * operation is a substition, a swap, a deletion, or an insertion. 31e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng * 32e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng * This implementation allows the costs to be weighted: 33e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng * 34e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng * - w (as in "sWap") 35e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng * - s (as in "Substitution") 36e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng * - a (for insertion, AKA "Add") 37e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng * - d (as in "Deletion") 38e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng * 39e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng * Note that this algorithm calculates a distance _iff_ d == a. 40e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng */ 41e6817ec1d8ab31fc7b01906e305f848542df6413Ben Chengint levenshtein(const char *string1, const char *string2, 42e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng int w, int s, int a, int d) 43e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng{ 44e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng int len1 = strlen(string1), len2 = strlen(string2); 45e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng int *row0 = malloc(sizeof(int) * (len2 + 1)); 46e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng int *row1 = malloc(sizeof(int) * (len2 + 1)); 47e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng int *row2 = malloc(sizeof(int) * (len2 + 1)); 48e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng int i, j; 49e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng 50e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng for (j = 0; j <= len2; j++) 51e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng row1[j] = j * a; 52e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng for (i = 0; i < len1; i++) { 53e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng int *dummy; 54e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng 55e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng row2[0] = (i + 1) * d; 56e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng for (j = 0; j < len2; j++) { 57e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng /* substitution */ 58e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng row2[j + 1] = row1[j] + s * (string1[i] != string2[j]); 59e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng /* swap */ 60e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng if (i > 0 && j > 0 && string1[i - 1] == string2[j] && 61e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng string1[i] == string2[j - 1] && 62e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng row2[j + 1] > row0[j - 1] + w) 63e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng row2[j + 1] = row0[j - 1] + w; 64e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng /* deletion */ 65e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng if (row2[j + 1] > row1[j + 1] + d) 66e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng row2[j + 1] = row1[j + 1] + d; 67e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng /* insertion */ 68e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng if (row2[j + 1] > row2[j] + a) 69e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng row2[j + 1] = row2[j] + a; 70e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng } 71e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng 72e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng dummy = row0; 73e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng row0 = row1; 74e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng row1 = row2; 75e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng row2 = dummy; 76e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng } 77e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng 78e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng i = row1[len2]; 79e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng free(row0); 80e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng free(row1); 81e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng free(row2); 82e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng 83e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng return i; 84e6817ec1d8ab31fc7b01906e305f848542df6413Ben Cheng} 85