101d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain//===-- llvm/ADT/edit_distance.h - Array edit distance function --- C++ -*-===//
201d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain//
301d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain//                     The LLVM Compiler Infrastructure
401d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain//
501d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain// This file is distributed under the University of Illinois Open Source
601d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain// License. See LICENSE.TXT for details.
701d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain//
801d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain//===----------------------------------------------------------------------===//
901d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain//
1001d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain// This file defines a Levenshtein distance function that works for any two
1101d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain// sequences, with each element of each sequence being analogous to a character
1201d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain// in a string.
1301d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain//
1401d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain//===----------------------------------------------------------------------===//
1501d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain
1601d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain#ifndef LLVM_ADT_EDIT_DISTANCE_H
1701d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain#define LLVM_ADT_EDIT_DISTANCE_H
1801d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain
1901d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain#include "llvm/ADT/ArrayRef.h"
2001d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain#include "llvm/ADT/OwningPtr.h"
2101d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain#include <algorithm>
2201d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain
2301d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrainnamespace llvm {
2401d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain
2501d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain/// \brief Determine the edit distance between two sequences.
2601d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain///
2701d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain/// \param FromArray the first sequence to compare.
2801d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain///
2901d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain/// \param ToArray the second sequence to compare.
3001d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain///
3101d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain/// \param AllowReplacements whether to allow element replacements (change one
3201d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain/// element into another) as a single operation, rather than as two operations
3301d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain/// (an insertion and a removal).
3401d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain///
3501d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain/// \param MaxEditDistance If non-zero, the maximum edit distance that this
3601d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain/// routine is allowed to compute. If the edit distance will exceed that
3701d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain/// maximum, returns \c MaxEditDistance+1.
3801d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain///
3901d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain/// \returns the minimum number of element insertions, removals, or (if
4001d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain/// \p AllowReplacements is \c true) replacements needed to transform one of
4101d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain/// the given sequences into the other. If zero, the sequences are identical.
4201d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhraintemplate<typename T>
4301d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrainunsigned ComputeEditDistance(ArrayRef<T> FromArray, ArrayRef<T> ToArray,
4401d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain                             bool AllowReplacements = true,
4501d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain                             unsigned MaxEditDistance = 0) {
4601d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain  // The algorithm implemented below is the "classic"
4701d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain  // dynamic-programming algorithm for computing the Levenshtein
4801d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain  // distance, which is described here:
4901d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain  //
5001d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain  //   http://en.wikipedia.org/wiki/Levenshtein_distance
5101d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain  //
5201d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain  // Although the algorithm is typically described using an m x n
5301d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain  // array, only two rows are used at a time, so this implemenation
5401d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain  // just keeps two separate vectors for those two rows.
5501d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain  typename ArrayRef<T>::size_type m = FromArray.size();
5601d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain  typename ArrayRef<T>::size_type n = ToArray.size();
5701d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain
5801d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain  const unsigned SmallBufferSize = 64;
5901d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain  unsigned SmallBuffer[SmallBufferSize];
6001d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain  llvm::OwningArrayPtr<unsigned> Allocated;
6101d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain  unsigned *Previous = SmallBuffer;
6201d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain  if (2*(n + 1) > SmallBufferSize) {
6301d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain    Previous = new unsigned [2*(n+1)];
6401d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain    Allocated.reset(Previous);
6501d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain  }
6601d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain  unsigned *Current = Previous + (n + 1);
6701d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain
6801d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain  for (unsigned i = 0; i <= n; ++i)
6901d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain    Previous[i] = i;
7001d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain
7101d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain  for (typename ArrayRef<T>::size_type y = 1; y <= m; ++y) {
7201d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain    Current[0] = y;
7301d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain    unsigned BestThisRow = Current[0];
7401d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain
7501d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain    for (typename ArrayRef<T>::size_type x = 1; x <= n; ++x) {
7601d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain      if (AllowReplacements) {
7701d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain        Current[x] = std::min(
7801d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain            Previous[x-1] + (FromArray[y-1] == ToArray[x-1] ? 0u : 1u),
7901d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain            std::min(Current[x-1], Previous[x])+1);
8001d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain      }
8101d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain      else {
8201d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain        if (FromArray[y-1] == ToArray[x-1]) Current[x] = Previous[x-1];
8301d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain        else Current[x] = std::min(Current[x-1], Previous[x]) + 1;
8401d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain      }
8501d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain      BestThisRow = std::min(BestThisRow, Current[x]);
8601d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain    }
8701d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain
8801d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain    if (MaxEditDistance && BestThisRow > MaxEditDistance)
8901d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain      return MaxEditDistance + 1;
9001d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain
9101d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain    unsigned *tmp = Current;
9201d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain    Current = Previous;
9301d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain    Previous = tmp;
9401d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain  }
9501d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain
9601d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain  unsigned Result = Previous[n];
9701d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain  return Result;
9801d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain}
9901d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain
10001d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain} // End llvm namespace
10101d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain
10201d53ec176a6be4585df1f43af11151988ca4b35Kaelyn Uhrain#endif
103