1e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng#include "cache.h" 2e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng#include "levenshtein.h" 3e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng 4e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng/* 5e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng * This function implements the Damerau-Levenshtein algorithm to 6e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng * calculate a distance between strings. 7e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng * 8e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng * Basically, it says how many letters need to be swapped, substituted, 9e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng * deleted from, or added to string1, at least, to get string2. 10e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng * 11e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng * The idea is to build a distance matrix for the substrings of both 12e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng * strings. To avoid a large space complexity, only the last three rows 13e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng * are kept in memory (if swaps had the same or higher cost as one deletion 14e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng * plus one insertion, only two rows would be needed). 15e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng * 16e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng * At any stage, "i + 1" denotes the length of the current substring of 17e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng * string1 that the distance is calculated for. 18e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng * 19e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng * row2 holds the current row, row1 the previous row (i.e. for the substring 20e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng * of string1 of length "i"), and row0 the row before that. 21e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng * 22e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng * In other words, at the start of the big loop, row2[j + 1] contains the 23e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng * Damerau-Levenshtein distance between the substring of string1 of length 24e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng * "i" and the substring of string2 of length "j + 1". 25e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng * 26e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng * All the big loop does is determine the partial minimum-cost paths. 27e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng * 28e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng * It does so by calculating the costs of the path ending in characters 29e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng * i (in string1) and j (in string2), respectively, given that the last 30e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng * operation is a substition, a swap, a deletion, or an insertion. 31e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng * 32e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng * This implementation allows the costs to be weighted: 33e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng * 34e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng * - w (as in "sWap") 35e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng * - s (as in "Substitution") 36e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng * - a (for insertion, AKA "Add") 37e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng * - d (as in "Deletion") 38e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng * 39e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng * Note that this algorithm calculates a distance _iff_ d == a. 40e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng */ 41e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Chengint levenshtein(const char *string1, const char *string2, 42e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng int w, int s, int a, int d) 43e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng{ 44e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng int len1 = strlen(string1), len2 = strlen(string2); 45e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng int *row0 = malloc(sizeof(int) * (len2 + 1)); 46e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng int *row1 = malloc(sizeof(int) * (len2 + 1)); 47e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng int *row2 = malloc(sizeof(int) * (len2 + 1)); 48e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng int i, j; 49e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng 50e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng for (j = 0; j <= len2; j++) 51e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng row1[j] = j * a; 52e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng for (i = 0; i < len1; i++) { 53e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng int *dummy; 54e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng 55e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng row2[0] = (i + 1) * d; 56e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng for (j = 0; j < len2; j++) { 57e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng /* substitution */ 58e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng row2[j + 1] = row1[j] + s * (string1[i] != string2[j]); 59e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng /* swap */ 60e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng if (i > 0 && j > 0 && string1[i - 1] == string2[j] && 61e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng string1[i] == string2[j - 1] && 62e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng row2[j + 1] > row0[j - 1] + w) 63e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng row2[j + 1] = row0[j - 1] + w; 64e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng /* deletion */ 65e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng if (row2[j + 1] > row1[j + 1] + d) 66e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng row2[j + 1] = row1[j + 1] + d; 67e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng /* insertion */ 68e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng if (row2[j + 1] > row2[j] + a) 69e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng row2[j + 1] = row2[j] + a; 70e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng } 71e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng 72e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng dummy = row0; 73e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng row0 = row1; 74e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng row1 = row2; 75e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng row2 = dummy; 76e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng } 77e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng 78e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng i = row1[len2]; 79e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng free(row0); 80e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng free(row1); 81e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng free(row2); 82e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng 83e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng return i; 84e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng} 85