101d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain//===-- llvm/ADT/edit_distance.h - Array edit distance function --- C++ -*-===// 201d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain// 301d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain// The LLVM Compiler Infrastructure 401d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain// 501d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain// This file is distributed under the University of Illinois Open Source 601d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain// License. See LICENSE.TXT for details. 701d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain// 801d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain//===----------------------------------------------------------------------===// 901d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain// 1001d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain// This file defines a Levenshtein distance function that works for any two 1101d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain// sequences, with each element of each sequence being analogous to a character 1201d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain// in a string. 1301d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain// 1401d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain//===----------------------------------------------------------------------===// 1501d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain 1601d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain#ifndef LLVM_ADT_EDIT_DISTANCE_H 1701d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain#define LLVM_ADT_EDIT_DISTANCE_H 1801d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain 1901d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain#include "llvm/ADT/ArrayRef.h" 2001d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain#include "llvm/ADT/OwningPtr.h" 2101d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain#include <algorithm> 2201d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain 2301d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrainnamespace llvm { 2401d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain 2501d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain/// \brief Determine the edit distance between two sequences. 2601d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain/// 2701d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain/// \param FromArray the first sequence to compare. 2801d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain/// 2901d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain/// \param ToArray the second sequence to compare. 3001d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain/// 3101d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain/// \param AllowReplacements whether to allow element replacements (change one 3201d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain/// element into another) as a single operation, rather than as two operations 3301d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain/// (an insertion and a removal). 3401d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain/// 3501d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain/// \param MaxEditDistance If non-zero, the maximum edit distance that this 3601d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain/// routine is allowed to compute. If the edit distance will exceed that 3701d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain/// maximum, returns \c MaxEditDistance+1. 3801d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain/// 3901d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain/// \returns the minimum number of element insertions, removals, or (if 4001d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain/// \p AllowReplacements is \c true) replacements needed to transform one of 4101d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain/// the given sequences into the other. If zero, the sequences are identical. 4201d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhraintemplate<typename T> 4301d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrainunsigned ComputeEditDistance(ArrayRef<T> FromArray, ArrayRef<T> ToArray, 4401d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain bool AllowReplacements = true, 4501d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain unsigned MaxEditDistance = 0) { 4601d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain // The algorithm implemented below is the "classic" 4701d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain // dynamic-programming algorithm for computing the Levenshtein 4801d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain // distance, which is described here: 4901d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain // 5001d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain // http://en.wikipedia.org/wiki/Levenshtein_distance 5101d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain // 5201d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain // Although the algorithm is typically described using an m x n 5301d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain // array, only two rows are used at a time, so this implemenation 5401d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain // just keeps two separate vectors for those two rows. 5501d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain typename ArrayRef<T>::size_type m = FromArray.size(); 5601d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain typename ArrayRef<T>::size_type n = ToArray.size(); 5701d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain 5801d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain const unsigned SmallBufferSize = 64; 5901d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain unsigned SmallBuffer[SmallBufferSize]; 6001d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain llvm::OwningArrayPtr<unsigned> Allocated; 6101d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain unsigned *Previous = SmallBuffer; 6201d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain if (2*(n + 1) > SmallBufferSize) { 6301d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain Previous = new unsigned [2*(n+1)]; 6401d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain Allocated.reset(Previous); 6501d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain } 6601d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain unsigned *Current = Previous + (n + 1); 6701d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain 6801d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain for (unsigned i = 0; i <= n; ++i) 6901d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain Previous[i] = i; 7001d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain 7101d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain for (typename ArrayRef<T>::size_type y = 1; y <= m; ++y) { 7201d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain Current[0] = y; 7301d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain unsigned BestThisRow = Current[0]; 7401d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain 7501d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain for (typename ArrayRef<T>::size_type x = 1; x <= n; ++x) { 7601d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain if (AllowReplacements) { 7701d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain Current[x] = std::min( 7801d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain Previous[x-1] + (FromArray[y-1] == ToArray[x-1] ? 0u : 1u), 7901d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain std::min(Current[x-1], Previous[x])+1); 8001d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain } 8101d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain else { 8201d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain if (FromArray[y-1] == ToArray[x-1]) Current[x] = Previous[x-1]; 8301d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain else Current[x] = std::min(Current[x-1], Previous[x]) + 1; 8401d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain } 8501d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain BestThisRow = std::min(BestThisRow, Current[x]); 8601d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain } 8701d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain 8801d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain if (MaxEditDistance && BestThisRow > MaxEditDistance) 8901d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain return MaxEditDistance + 1; 9001d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain 9101d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain unsigned *tmp = Current; 9201d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain Current = Previous; 9301d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain Previous = tmp; 9401d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain } 9501d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain 9601d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain unsigned Result = Previous[n]; 9701d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain return Result; 9801d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain} 9901d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain 10001d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain} // End llvm namespace 10101d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain 10201d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain#endif 103