16f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/* 26f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ********************************************************************** 36f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Copyright (C) 1999-2011, International Business Machines 46f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Corporation and others. All Rights Reserved. 56f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ********************************************************************** 66f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Date Name Description 76f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 11/17/99 aliu Creation. 86f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ********************************************************************** 96f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/utypes.h" 126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#if !UCONFIG_NO_TRANSLITERATION 146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/rep.h" 166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/unifilt.h" 176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/uniset.h" 186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/utf16.h" 196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "rbt_rule.h" 206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "rbt_data.h" 216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "cmemory.h" 226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "strmatch.h" 236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "strrepl.h" 246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "util.h" 256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "putilimp.h" 266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgstatic const UChar FORWARD_OP[] = {32,62,32,0}; // " > " 286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgU_NAMESPACE_BEGIN 306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/** 326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Construct a new rule with the given input, output text, and other 336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * attributes. A cursor position may be specified for the output text. 346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @param input input string, including key and optional ante and 356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * post context 366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @param anteContextPos offset into input to end of ante context, or -1 if 376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * none. Must be <= input.length() if not -1. 386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @param postContextPos offset into input to start of post context, or -1 396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * if none. Must be <= input.length() if not -1, and must be >= 406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * anteContextPos. 416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @param output output string 426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @param cursorPosition offset into output at which cursor is located, or -1 if 436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * none. If less than zero, then the cursor is placed after the 446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * <code>output</code>; that is, -1 is equivalent to 456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * <code>output.length()</code>. If greater than 466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * <code>output.length()</code> then an exception is thrown. 476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @param segs array of UnicodeFunctors corresponding to input pattern 486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * segments, or null if there are none. The array itself is adopted, 496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * but the pointers within it are not. 506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @param segsCount number of elements in segs[] 516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @param anchorStart TRUE if the the rule is anchored on the left to 526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * the context start 536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @param anchorEnd TRUE if the rule is anchored on the right to the 546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * context limit 556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgTransliterationRule::TransliterationRule(const UnicodeString& input, 576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t anteContextPos, int32_t postContextPos, 586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const UnicodeString& outputStr, 596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t cursorPosition, int32_t cursorOffset, 606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeFunctor** segs, 616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t segsCount, 626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UBool anchorStart, UBool anchorEnd, 636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const TransliterationRuleData* theData, 646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UErrorCode& status) : 656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UMemory(), 666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org segments(0), 676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org data(theData) { 686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (U_FAILURE(status)) { 706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return; 716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Do range checks only when warranted to save time 736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (anteContextPos < 0) { 746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org anteContextLength = 0; 756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (anteContextPos > input.length()) { 776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // throw new IllegalArgumentException("Invalid ante context"); 786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org status = U_ILLEGAL_ARGUMENT_ERROR; 796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return; 806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org anteContextLength = anteContextPos; 826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (postContextPos < 0) { 846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org keyLength = input.length() - anteContextLength; 856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (postContextPos < anteContextLength || 876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org postContextPos > input.length()) { 886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // throw new IllegalArgumentException("Invalid post context"); 896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org status = U_ILLEGAL_ARGUMENT_ERROR; 906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return; 916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org keyLength = postContextPos - anteContextLength; 936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (cursorPosition < 0) { 956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org cursorPosition = outputStr.length(); 966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else if (cursorPosition > outputStr.length()) { 976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // throw new IllegalArgumentException("Invalid cursor position"); 986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org status = U_ILLEGAL_ARGUMENT_ERROR; 996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return; 1006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // We don't validate the segments array. The caller must 1026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // guarantee that the segments are well-formed (that is, that 1036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // all $n references in the output refer to indices of this 1046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // array, and that no array elements are null). 1056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org this->segments = segs; 1066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org this->segmentsCount = segsCount; 1076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pattern = input; 1096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org flags = 0; 1106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (anchorStart) { 1116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org flags |= ANCHOR_START; 1126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (anchorEnd) { 1146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org flags |= ANCHOR_END; 1156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org anteContext = NULL; 1186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (anteContextLength > 0) { 1196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org anteContext = new StringMatcher(pattern, 0, anteContextLength, 1206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org FALSE, *data); 1216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /* test for NULL */ 1226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (anteContext == 0) { 1236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org status = U_MEMORY_ALLOCATION_ERROR; 1246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return; 1256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org key = NULL; 1296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (keyLength > 0) { 1306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org key = new StringMatcher(pattern, anteContextLength, anteContextLength + keyLength, 1316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org FALSE, *data); 1326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /* test for NULL */ 1336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (key == 0) { 1346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org status = U_MEMORY_ALLOCATION_ERROR; 1356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return; 1366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t postContextLength = pattern.length() - keyLength - anteContextLength; 1406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org postContext = NULL; 1416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (postContextLength > 0) { 1426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org postContext = new StringMatcher(pattern, anteContextLength + keyLength, pattern.length(), 1436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org FALSE, *data); 1446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /* test for NULL */ 1456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (postContext == 0) { 1466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org status = U_MEMORY_ALLOCATION_ERROR; 1476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return; 1486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org this->output = new StringReplacer(outputStr, cursorPosition + cursorOffset, data); 1526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /* test for NULL */ 1536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (this->output == 0) { 1546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org status = U_MEMORY_ALLOCATION_ERROR; 1556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return; 1566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 1586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/** 1606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Copy constructor. 1616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 1626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgTransliterationRule::TransliterationRule(TransliterationRule& other) : 1636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UMemory(other), 1646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org anteContext(NULL), 1656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org key(NULL), 1666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org postContext(NULL), 1676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pattern(other.pattern), 1686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org anteContextLength(other.anteContextLength), 1696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org keyLength(other.keyLength), 1706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org flags(other.flags), 1716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org data(other.data) { 1726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org segments = NULL; 1746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org segmentsCount = 0; 1756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (other.segmentsCount > 0) { 1766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org segments = (UnicodeFunctor **)uprv_malloc(other.segmentsCount * sizeof(UnicodeFunctor *)); 1776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uprv_memcpy(segments, other.segments, other.segmentsCount*sizeof(segments[0])); 1786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (other.anteContext != NULL) { 1816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org anteContext = (StringMatcher*) other.anteContext->clone(); 1826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (other.key != NULL) { 1846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org key = (StringMatcher*) other.key->clone(); 1856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (other.postContext != NULL) { 1876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org postContext = (StringMatcher*) other.postContext->clone(); 1886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org output = other.output->clone(); 1906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 1916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgTransliterationRule::~TransliterationRule() { 1936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uprv_free(segments); 1946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org delete anteContext; 1956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org delete key; 1966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org delete postContext; 1976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org delete output; 1986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 1996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/** 2016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Return the preceding context length. This method is needed to 2026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * support the <code>Transliterator</code> method 2036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * <code>getMaximumContextLength()</code>. Internally, this is 2046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * implemented as the anteContextLength, optionally plus one if 2056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * there is a start anchor. The one character anchor gap is 2066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * needed to make repeated incremental transliteration with 2076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * anchors work. 2086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 2096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgint32_t TransliterationRule::getContextLength(void) const { 2106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return anteContextLength + ((flags & ANCHOR_START) ? 1 : 0); 2116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 2126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/** 2146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Internal method. Returns 8-bit index value for this rule. 2156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * This is the low byte of the first character of the key, 2166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * unless the first character of the key is a set. If it's a 2176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * set, or otherwise can match multiple keys, the index value is -1. 2186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 2196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgint16_t TransliterationRule::getIndexValue() const { 2206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (anteContextLength == pattern.length()) { 2216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // A pattern with just ante context {such as foo)>bar} can 2226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // match any key. 2236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return -1; 2246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UChar32 c = pattern.char32At(anteContextLength); 2266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return (int16_t)(data->lookupMatcher(c) == NULL ? (c & 0xFF) : -1); 2276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 2286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/** 2306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Internal method. Returns true if this rule matches the given 2316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * index value. The index value is an 8-bit integer, 0..255, 2326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * representing the low byte of the first character of the key. 2336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * It matches this rule if it matches the first character of the 2346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * key, or if the first character of the key is a set, and the set 2356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * contains any character with a low byte equal to the index 2366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * value. If the rule contains only ante context, as in foo)>bar, 2376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * then it will match any key. 2386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 2396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgUBool TransliterationRule::matchesIndexValue(uint8_t v) const { 2406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Delegate to the key, or if there is none, to the postContext. 2416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // If there is neither then we match any key; return true. 2426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeMatcher *m = (key != NULL) ? key : postContext; 2436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return (m != NULL) ? m->matchesIndexValue(v) : TRUE; 2446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 2456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/** 2476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Return true if this rule masks another rule. If r1 masks r2 then 2486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * r1 matches any input string that r2 matches. If r1 masks r2 and r2 masks 2496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * r1 then r1 == r2. Examples: "a>x" masks "ab>y". "a>x" masks "a[b]>y". 2506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * "[c]a>x" masks "[dc]a>y". 2516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 2526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgUBool TransliterationRule::masks(const TransliterationRule& r2) const { 2536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /* Rule r1 masks rule r2 if the string formed of the 2546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * antecontext, key, and postcontext overlaps in the following 2556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * way: 2566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 2576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * r1: aakkkpppp 2586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * r2: aaakkkkkpppp 2596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * ^ 2606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 2616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * The strings must be aligned at the first character of the 2626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * key. The length of r1 to the left of the alignment point 2636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * must be <= the length of r2 to the left; ditto for the 2646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * right. The characters of r1 must equal (or be a superset 2656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * of) the corresponding characters of r2. The superset 2666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * operation should be performed to check for UnicodeSet 2676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * masking. 2686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 2696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Anchors: Two patterns that differ only in anchors only 2706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * mask one another if they are exactly equal, and r2 has 2716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * all the anchors r1 has (optionally, plus some). Here Y 2726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * means the row masks the column, N means it doesn't. 2736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 2746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * ab ^ab ab$ ^ab$ 2756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * ab Y Y Y Y 2766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * ^ab N Y N Y 2776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * ab$ N N Y Y 2786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * ^ab$ N N N Y 2796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 2806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Post context: {a}b masks ab, but not vice versa, since {a}b 2816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * matches everything ab matches, and {a}b matches {|a|}b but ab 2826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * does not. Pre context is different (a{b} does not align with 2836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * ab). 2846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 2856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /* LIMITATION of the current mask algorithm: Some rule 2876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * maskings are currently not detected. For example, 2886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * "{Lu}]a>x" masks "A]a>y". This can be added later. TODO 2896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 2906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t len = pattern.length(); 2926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t left = anteContextLength; 2936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t left2 = r2.anteContextLength; 2946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t right = len - left; 2956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t right2 = r2.pattern.length() - left2; 2966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t cachedCompare = r2.pattern.compare(left2 - left, len, pattern); 2976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // TODO Clean this up -- some logic might be combinable with the 2996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // next statement. 3006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Test for anchor masking 3026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (left == left2 && right == right2 && 3036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org keyLength <= r2.keyLength && 3046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0 == cachedCompare) { 3056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // The following boolean logic implements the table above 3066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return (flags == r2.flags) || 3076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org (!(flags & ANCHOR_START) && !(flags & ANCHOR_END)) || 3086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ((r2.flags & ANCHOR_START) && (r2.flags & ANCHOR_END)); 3096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return left <= left2 && 3126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org (right < right2 || 3136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org (right == right2 && keyLength <= r2.keyLength)) && 3146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org (0 == cachedCompare); 3156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 3166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgstatic inline int32_t posBefore(const Replaceable& str, int32_t pos) { 3186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return (pos > 0) ? 3196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pos - U16_LENGTH(str.char32At(pos-1)) : 3206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pos - 1; 3216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 3226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgstatic inline int32_t posAfter(const Replaceable& str, int32_t pos) { 3246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return (pos >= 0 && pos < str.length()) ? 3256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pos + U16_LENGTH(str.char32At(pos)) : 3266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pos + 1; 3276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 3286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/** 3306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Attempt a match and replacement at the given position. Return 3316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * the degree of match between this rule and the given text. The 3326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * degree of match may be mismatch, a partial match, or a full 3336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * match. A mismatch means at least one character of the text 3346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * does not match the context or key. A partial match means some 3356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * context and key characters match, but the text is not long 3366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * enough to match all of them. A full match means all context 3376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * and key characters match. 3386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 3396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * If a full match is obtained, perform a replacement, update pos, 3406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * and return U_MATCH. Otherwise both text and pos are unchanged. 3416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 3426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @param text the text 3436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @param pos the position indices 3446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @param incremental if TRUE, test for partial matches that may 3456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * be completed by additional text inserted at pos.limit. 3466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @return one of <code>U_MISMATCH</code>, 3476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * <code>U_PARTIAL_MATCH</code>, or <code>U_MATCH</code>. If 3486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * incremental is FALSE then U_PARTIAL_MATCH will not be returned. 3496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 3506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgUMatchDegree TransliterationRule::matchAndReplace(Replaceable& text, 3516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UTransPosition& pos, 3526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UBool incremental) const { 3536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Matching and replacing are done in one method because the 3546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // replacement operation needs information obtained during the 3556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // match. Another way to do this is to have the match method 3566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // create a match result struct with relevant offsets, and to pass 3576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // this into the replace method. 3586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // ============================ MATCH =========================== 3606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Reset segment match data 3626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (segments != NULL) { 3636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for (int32_t i=0; i<segmentsCount; ++i) { 3646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ((StringMatcher*) segments[i])->resetMatch(); 3656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// int32_t lenDelta, keyLimit; 3696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t keyLimit; 3706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // ------------------------ Ante Context ------------------------ 3726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // A mismatch in the ante context, or with the start anchor, 3746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // is an outright U_MISMATCH regardless of whether we are 3756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // incremental or not. 3766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t oText; // offset into 'text' 3776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// int32_t newStart = 0; 3786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t minOText; 3796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Note (1): We process text in 16-bit code units, rather than 3816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // 32-bit code points. This works because stand-ins are 3826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // always in the BMP and because we are doing a literal match 3836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // operation, which can be done 16-bits at a time. 3846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t anteLimit = posBefore(text, pos.contextStart); 3866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UMatchDegree match; 3886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Start reverse match at char before pos.start 3906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org oText = posBefore(text, pos.start); 3916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (anteContext != NULL) { 3936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org match = anteContext->matches(text, oText, anteLimit, FALSE); 3946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (match != U_MATCH) { 3956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return U_MISMATCH; 3966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org minOText = posAfter(text, oText); 4006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // ------------------------ Start Anchor ------------------------ 4026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (((flags & ANCHOR_START) != 0) && oText != anteLimit) { 4046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return U_MISMATCH; 4056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // -------------------- Key and Post Context -------------------- 4086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org oText = pos.start; 4106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (key != NULL) { 4126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org match = key->matches(text, oText, pos.limit, incremental); 4136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (match != U_MATCH) { 4146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return match; 4156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org keyLimit = oText; 4196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (postContext != NULL) { 4216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (incremental && keyLimit == pos.limit) { 4226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // The key matches just before pos.limit, and there is 4236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // a postContext. Since we are in incremental mode, 4246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // we must assume more characters may be inserted at 4256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // pos.limit -- this is a partial match. 4266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return U_PARTIAL_MATCH; 4276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org match = postContext->matches(text, oText, pos.contextLimit, incremental); 4306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (match != U_MATCH) { 4316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return match; 4326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // ------------------------- Stop Anchor ------------------------ 4366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (((flags & ANCHOR_END)) != 0) { 4386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (oText != pos.contextLimit) { 4396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return U_MISMATCH; 4406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (incremental) { 4426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return U_PARTIAL_MATCH; 4436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // =========================== REPLACE ========================== 4476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // We have a full match. The key is between pos.start and 4496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // keyLimit. 4506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t newStart; 4526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t newLength = output->toReplacer()->replace(text, pos.start, keyLimit, newStart); 4536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t lenDelta = newLength - (keyLimit - pos.start); 4546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org oText += lenDelta; 4566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pos.limit += lenDelta; 4576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pos.contextLimit += lenDelta; 4586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Restrict new value of start to [minOText, min(oText, pos.limit)]. 4596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pos.start = uprv_max(minOText, uprv_min(uprv_min(oText, pos.limit), newStart)); 4606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return U_MATCH; 4616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 4626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/** 4646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Create a source string that represents this rule. Append it to the 4656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * given string. 4666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 4676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgUnicodeString& TransliterationRule::toRule(UnicodeString& rule, 4686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UBool escapeUnprintable) const { 4696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Accumulate special characters (and non-specials following them) 4716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // into quoteBuf. Append quoteBuf, within single quotes, when 4726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // a non-quoted element must be inserted. 4736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeString str, quoteBuf; 4746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Do not emit the braces '{' '}' around the pattern if there 4766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // is neither anteContext nor postContext. 4776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UBool emitBraces = 4786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org (anteContext != NULL) || (postContext != NULL); 4796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Emit start anchor 4816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if ((flags & ANCHOR_START) != 0) { 4826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org rule.append((UChar)94/*^*/); 4836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Emit the input pattern 4866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ICU_Utility::appendToRule(rule, anteContext, escapeUnprintable, quoteBuf); 4876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (emitBraces) { 4896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ICU_Utility::appendToRule(rule, (UChar) 0x007B /*{*/, TRUE, escapeUnprintable, quoteBuf); 4906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ICU_Utility::appendToRule(rule, key, escapeUnprintable, quoteBuf); 4936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (emitBraces) { 4956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ICU_Utility::appendToRule(rule, (UChar) 0x007D /*}*/, TRUE, escapeUnprintable, quoteBuf); 4966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ICU_Utility::appendToRule(rule, postContext, escapeUnprintable, quoteBuf); 4996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Emit end anchor 5016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if ((flags & ANCHOR_END) != 0) { 5026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org rule.append((UChar)36/*$*/); 5036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ICU_Utility::appendToRule(rule, UnicodeString(TRUE, FORWARD_OP, 3), TRUE, escapeUnprintable, quoteBuf); 5066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Emit the output pattern 5086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ICU_Utility::appendToRule(rule, output->toReplacer()->toReplacerPattern(str, escapeUnprintable), 5106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org TRUE, escapeUnprintable, quoteBuf); 5116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ICU_Utility::appendToRule(rule, (UChar) 0x003B /*;*/, TRUE, escapeUnprintable, quoteBuf); 5136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return rule; 5156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 5166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgvoid TransliterationRule::setData(const TransliterationRuleData* d) { 5186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org data = d; 5196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (anteContext != NULL) anteContext->setData(d); 5206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (postContext != NULL) postContext->setData(d); 5216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (key != NULL) key->setData(d); 5226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // assert(output != NULL); 5236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org output->setData(d); 5246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Don't have to do segments since they are in the context or key 5256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 5266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/** 5286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Union the set of all characters that may be modified by this rule 5296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * into the given set. 5306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 5316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgvoid TransliterationRule::addSourceSetTo(UnicodeSet& toUnionTo) const { 5326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t limit = anteContextLength + keyLength; 5336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for (int32_t i=anteContextLength; i<limit; ) { 5346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UChar32 ch = pattern.char32At(i); 5356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org i += U16_LENGTH(ch); 5366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const UnicodeMatcher* matcher = data->lookupMatcher(ch); 5376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (matcher == NULL) { 5386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org toUnionTo.add(ch); 5396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 5406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org matcher->addMatchSetTo(toUnionTo); 5416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 5446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/** 5466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Union the set of all characters that may be emitted by this rule 5476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * into the given set. 5486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 5496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgvoid TransliterationRule::addTargetSetTo(UnicodeSet& toUnionTo) const { 5506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org output->toReplacer()->addReplacementSetTo(toUnionTo); 5516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 5526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgU_NAMESPACE_END 5546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#endif /* #if !UCONFIG_NO_TRANSLITERATION */ 5566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org//eof 558