1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4 **********************************************************************
5 *   Copyright (C) 1999-2011, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 **********************************************************************
8 *   Date        Name        Description
9 *   11/17/99    aliu        Creation.
10 **********************************************************************
11 */
12
13#include "unicode/utypes.h"
14
15#if !UCONFIG_NO_TRANSLITERATION
16
17#include "unicode/rep.h"
18#include "unicode/unifilt.h"
19#include "unicode/uniset.h"
20#include "unicode/utf16.h"
21#include "rbt_rule.h"
22#include "rbt_data.h"
23#include "cmemory.h"
24#include "strmatch.h"
25#include "strrepl.h"
26#include "util.h"
27#include "putilimp.h"
28
29static const UChar FORWARD_OP[] = {32,62,32,0}; // " > "
30
31U_NAMESPACE_BEGIN
32
33/**
34 * Construct a new rule with the given input, output text, and other
35 * attributes.  A cursor position may be specified for the output text.
36 * @param input input string, including key and optional ante and
37 * post context
38 * @param anteContextPos offset into input to end of ante context, or -1 if
39 * none.  Must be <= input.length() if not -1.
40 * @param postContextPos offset into input to start of post context, or -1
41 * if none.  Must be <= input.length() if not -1, and must be >=
42 * anteContextPos.
43 * @param output output string
44 * @param cursorPosition offset into output at which cursor is located, or -1 if
45 * none.  If less than zero, then the cursor is placed after the
46 * <code>output</code>; that is, -1 is equivalent to
47 * <code>output.length()</code>.  If greater than
48 * <code>output.length()</code> then an exception is thrown.
49 * @param segs array of UnicodeFunctors corresponding to input pattern
50 * segments, or null if there are none.  The array itself is adopted,
51 * but the pointers within it are not.
52 * @param segsCount number of elements in segs[]
53 * @param anchorStart TRUE if the the rule is anchored on the left to
54 * the context start
55 * @param anchorEnd TRUE if the rule is anchored on the right to the
56 * context limit
57 */
58TransliterationRule::TransliterationRule(const UnicodeString& input,
59                                         int32_t anteContextPos, int32_t postContextPos,
60                                         const UnicodeString& outputStr,
61                                         int32_t cursorPosition, int32_t cursorOffset,
62                                         UnicodeFunctor** segs,
63                                         int32_t segsCount,
64                                         UBool anchorStart, UBool anchorEnd,
65                                         const TransliterationRuleData* theData,
66                                         UErrorCode& status) :
67    UMemory(),
68    segments(0),
69    data(theData) {
70
71    if (U_FAILURE(status)) {
72        return;
73    }
74    // Do range checks only when warranted to save time
75    if (anteContextPos < 0) {
76        anteContextLength = 0;
77    } else {
78        if (anteContextPos > input.length()) {
79            // throw new IllegalArgumentException("Invalid ante context");
80            status = U_ILLEGAL_ARGUMENT_ERROR;
81            return;
82        }
83        anteContextLength = anteContextPos;
84    }
85    if (postContextPos < 0) {
86        keyLength = input.length() - anteContextLength;
87    } else {
88        if (postContextPos < anteContextLength ||
89            postContextPos > input.length()) {
90            // throw new IllegalArgumentException("Invalid post context");
91            status = U_ILLEGAL_ARGUMENT_ERROR;
92            return;
93        }
94        keyLength = postContextPos - anteContextLength;
95    }
96    if (cursorPosition < 0) {
97        cursorPosition = outputStr.length();
98    } else if (cursorPosition > outputStr.length()) {
99        // throw new IllegalArgumentException("Invalid cursor position");
100        status = U_ILLEGAL_ARGUMENT_ERROR;
101        return;
102    }
103    // We don't validate the segments array.  The caller must
104    // guarantee that the segments are well-formed (that is, that
105    // all $n references in the output refer to indices of this
106    // array, and that no array elements are null).
107    this->segments = segs;
108    this->segmentsCount = segsCount;
109
110    pattern = input;
111    flags = 0;
112    if (anchorStart) {
113        flags |= ANCHOR_START;
114    }
115    if (anchorEnd) {
116        flags |= ANCHOR_END;
117    }
118
119    anteContext = NULL;
120    if (anteContextLength > 0) {
121        anteContext = new StringMatcher(pattern, 0, anteContextLength,
122                                        FALSE, *data);
123        /* test for NULL */
124        if (anteContext == 0) {
125            status = U_MEMORY_ALLOCATION_ERROR;
126            return;
127        }
128    }
129
130    key = NULL;
131    if (keyLength > 0) {
132        key = new StringMatcher(pattern, anteContextLength, anteContextLength + keyLength,
133                                FALSE, *data);
134        /* test for NULL */
135        if (key == 0) {
136            status = U_MEMORY_ALLOCATION_ERROR;
137            return;
138        }
139    }
140
141    int32_t postContextLength = pattern.length() - keyLength - anteContextLength;
142    postContext = NULL;
143    if (postContextLength > 0) {
144        postContext = new StringMatcher(pattern, anteContextLength + keyLength, pattern.length(),
145                                        FALSE, *data);
146        /* test for NULL */
147        if (postContext == 0) {
148            status = U_MEMORY_ALLOCATION_ERROR;
149            return;
150        }
151    }
152
153    this->output = new StringReplacer(outputStr, cursorPosition + cursorOffset, data);
154    /* test for NULL */
155    if (this->output == 0) {
156        status = U_MEMORY_ALLOCATION_ERROR;
157        return;
158    }
159}
160
161/**
162 * Copy constructor.
163 */
164TransliterationRule::TransliterationRule(TransliterationRule& other) :
165    UMemory(other),
166    anteContext(NULL),
167    key(NULL),
168    postContext(NULL),
169    pattern(other.pattern),
170    anteContextLength(other.anteContextLength),
171    keyLength(other.keyLength),
172    flags(other.flags),
173    data(other.data) {
174
175    segments = NULL;
176    segmentsCount = 0;
177    if (other.segmentsCount > 0) {
178        segments = (UnicodeFunctor **)uprv_malloc(other.segmentsCount * sizeof(UnicodeFunctor *));
179        uprv_memcpy(segments, other.segments, (size_t)other.segmentsCount*sizeof(segments[0]));
180    }
181
182    if (other.anteContext != NULL) {
183        anteContext = (StringMatcher*) other.anteContext->clone();
184    }
185    if (other.key != NULL) {
186        key = (StringMatcher*) other.key->clone();
187    }
188    if (other.postContext != NULL) {
189        postContext = (StringMatcher*) other.postContext->clone();
190    }
191    output = other.output->clone();
192}
193
194TransliterationRule::~TransliterationRule() {
195    uprv_free(segments);
196    delete anteContext;
197    delete key;
198    delete postContext;
199    delete output;
200}
201
202/**
203 * Return the preceding context length.  This method is needed to
204 * support the <code>Transliterator</code> method
205 * <code>getMaximumContextLength()</code>.  Internally, this is
206 * implemented as the anteContextLength, optionally plus one if
207 * there is a start anchor.  The one character anchor gap is
208 * needed to make repeated incremental transliteration with
209 * anchors work.
210 */
211int32_t TransliterationRule::getContextLength(void) const {
212    return anteContextLength + ((flags & ANCHOR_START) ? 1 : 0);
213}
214
215/**
216 * Internal method.  Returns 8-bit index value for this rule.
217 * This is the low byte of the first character of the key,
218 * unless the first character of the key is a set.  If it's a
219 * set, or otherwise can match multiple keys, the index value is -1.
220 */
221int16_t TransliterationRule::getIndexValue() const {
222    if (anteContextLength == pattern.length()) {
223        // A pattern with just ante context {such as foo)>bar} can
224        // match any key.
225        return -1;
226    }
227    UChar32 c = pattern.char32At(anteContextLength);
228    return (int16_t)(data->lookupMatcher(c) == NULL ? (c & 0xFF) : -1);
229}
230
231/**
232 * Internal method.  Returns true if this rule matches the given
233 * index value.  The index value is an 8-bit integer, 0..255,
234 * representing the low byte of the first character of the key.
235 * It matches this rule if it matches the first character of the
236 * key, or if the first character of the key is a set, and the set
237 * contains any character with a low byte equal to the index
238 * value.  If the rule contains only ante context, as in foo)>bar,
239 * then it will match any key.
240 */
241UBool TransliterationRule::matchesIndexValue(uint8_t v) const {
242    // Delegate to the key, or if there is none, to the postContext.
243    // If there is neither then we match any key; return true.
244    UnicodeMatcher *m = (key != NULL) ? key : postContext;
245    return (m != NULL) ? m->matchesIndexValue(v) : TRUE;
246}
247
248/**
249 * Return true if this rule masks another rule.  If r1 masks r2 then
250 * r1 matches any input string that r2 matches.  If r1 masks r2 and r2 masks
251 * r1 then r1 == r2.  Examples: "a>x" masks "ab>y".  "a>x" masks "a[b]>y".
252 * "[c]a>x" masks "[dc]a>y".
253 */
254UBool TransliterationRule::masks(const TransliterationRule& r2) const {
255    /* Rule r1 masks rule r2 if the string formed of the
256     * antecontext, key, and postcontext overlaps in the following
257     * way:
258     *
259     * r1:      aakkkpppp
260     * r2:     aaakkkkkpppp
261     *            ^
262     *
263     * The strings must be aligned at the first character of the
264     * key.  The length of r1 to the left of the alignment point
265     * must be <= the length of r2 to the left; ditto for the
266     * right.  The characters of r1 must equal (or be a superset
267     * of) the corresponding characters of r2.  The superset
268     * operation should be performed to check for UnicodeSet
269     * masking.
270     *
271     * Anchors:  Two patterns that differ only in anchors only
272     * mask one another if they are exactly equal, and r2 has
273     * all the anchors r1 has (optionally, plus some).  Here Y
274     * means the row masks the column, N means it doesn't.
275     *
276     *         ab   ^ab    ab$  ^ab$
277     *   ab    Y     Y     Y     Y
278     *  ^ab    N     Y     N     Y
279     *   ab$   N     N     Y     Y
280     *  ^ab$   N     N     N     Y
281     *
282     * Post context: {a}b masks ab, but not vice versa, since {a}b
283     * matches everything ab matches, and {a}b matches {|a|}b but ab
284     * does not.  Pre context is different (a{b} does not align with
285     * ab).
286     */
287
288    /* LIMITATION of the current mask algorithm: Some rule
289     * maskings are currently not detected.  For example,
290     * "{Lu}]a>x" masks "A]a>y".  This can be added later. TODO
291     */
292
293    int32_t len = pattern.length();
294    int32_t left = anteContextLength;
295    int32_t left2 = r2.anteContextLength;
296    int32_t right = len - left;
297    int32_t right2 = r2.pattern.length() - left2;
298    int32_t cachedCompare = r2.pattern.compare(left2 - left, len, pattern);
299
300    // TODO Clean this up -- some logic might be combinable with the
301    // next statement.
302
303    // Test for anchor masking
304    if (left == left2 && right == right2 &&
305        keyLength <= r2.keyLength &&
306        0 == cachedCompare) {
307        // The following boolean logic implements the table above
308        return (flags == r2.flags) ||
309            (!(flags & ANCHOR_START) && !(flags & ANCHOR_END)) ||
310            ((r2.flags & ANCHOR_START) && (r2.flags & ANCHOR_END));
311    }
312
313    return left <= left2 &&
314        (right < right2 ||
315         (right == right2 && keyLength <= r2.keyLength)) &&
316         (0 == cachedCompare);
317}
318
319static inline int32_t posBefore(const Replaceable& str, int32_t pos) {
320    return (pos > 0) ?
321        pos - U16_LENGTH(str.char32At(pos-1)) :
322        pos - 1;
323}
324
325static inline int32_t posAfter(const Replaceable& str, int32_t pos) {
326    return (pos >= 0 && pos < str.length()) ?
327        pos + U16_LENGTH(str.char32At(pos)) :
328        pos + 1;
329}
330
331/**
332 * Attempt a match and replacement at the given position.  Return
333 * the degree of match between this rule and the given text.  The
334 * degree of match may be mismatch, a partial match, or a full
335 * match.  A mismatch means at least one character of the text
336 * does not match the context or key.  A partial match means some
337 * context and key characters match, but the text is not long
338 * enough to match all of them.  A full match means all context
339 * and key characters match.
340 *
341 * If a full match is obtained, perform a replacement, update pos,
342 * and return U_MATCH.  Otherwise both text and pos are unchanged.
343 *
344 * @param text the text
345 * @param pos the position indices
346 * @param incremental if TRUE, test for partial matches that may
347 * be completed by additional text inserted at pos.limit.
348 * @return one of <code>U_MISMATCH</code>,
349 * <code>U_PARTIAL_MATCH</code>, or <code>U_MATCH</code>.  If
350 * incremental is FALSE then U_PARTIAL_MATCH will not be returned.
351 */
352UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
353                                                  UTransPosition& pos,
354                                                  UBool incremental) const {
355    // Matching and replacing are done in one method because the
356    // replacement operation needs information obtained during the
357    // match.  Another way to do this is to have the match method
358    // create a match result struct with relevant offsets, and to pass
359    // this into the replace method.
360
361    // ============================ MATCH ===========================
362
363    // Reset segment match data
364    if (segments != NULL) {
365        for (int32_t i=0; i<segmentsCount; ++i) {
366            ((StringMatcher*) segments[i])->resetMatch();
367        }
368    }
369
370//    int32_t lenDelta, keyLimit;
371    int32_t keyLimit;
372
373    // ------------------------ Ante Context ------------------------
374
375    // A mismatch in the ante context, or with the start anchor,
376    // is an outright U_MISMATCH regardless of whether we are
377    // incremental or not.
378    int32_t oText; // offset into 'text'
379//    int32_t newStart = 0;
380    int32_t minOText;
381
382    // Note (1): We process text in 16-bit code units, rather than
383    // 32-bit code points.  This works because stand-ins are
384    // always in the BMP and because we are doing a literal match
385    // operation, which can be done 16-bits at a time.
386
387    int32_t anteLimit = posBefore(text, pos.contextStart);
388
389    UMatchDegree match;
390
391    // Start reverse match at char before pos.start
392    oText = posBefore(text, pos.start);
393
394    if (anteContext != NULL) {
395        match = anteContext->matches(text, oText, anteLimit, FALSE);
396        if (match != U_MATCH) {
397            return U_MISMATCH;
398        }
399    }
400
401    minOText = posAfter(text, oText);
402
403    // ------------------------ Start Anchor ------------------------
404
405    if (((flags & ANCHOR_START) != 0) && oText != anteLimit) {
406        return U_MISMATCH;
407    }
408
409    // -------------------- Key and Post Context --------------------
410
411    oText = pos.start;
412
413    if (key != NULL) {
414        match = key->matches(text, oText, pos.limit, incremental);
415        if (match != U_MATCH) {
416            return match;
417        }
418    }
419
420    keyLimit = oText;
421
422    if (postContext != NULL) {
423        if (incremental && keyLimit == pos.limit) {
424            // The key matches just before pos.limit, and there is
425            // a postContext.  Since we are in incremental mode,
426            // we must assume more characters may be inserted at
427            // pos.limit -- this is a partial match.
428            return U_PARTIAL_MATCH;
429        }
430
431        match = postContext->matches(text, oText, pos.contextLimit, incremental);
432        if (match != U_MATCH) {
433            return match;
434        }
435    }
436
437    // ------------------------- Stop Anchor ------------------------
438
439    if (((flags & ANCHOR_END)) != 0) {
440        if (oText != pos.contextLimit) {
441            return U_MISMATCH;
442        }
443        if (incremental) {
444            return U_PARTIAL_MATCH;
445        }
446    }
447
448    // =========================== REPLACE ==========================
449
450    // We have a full match.  The key is between pos.start and
451    // keyLimit.
452
453    int32_t newStart;
454    int32_t newLength = output->toReplacer()->replace(text, pos.start, keyLimit, newStart);
455    int32_t lenDelta = newLength - (keyLimit - pos.start);
456
457    oText += lenDelta;
458    pos.limit += lenDelta;
459    pos.contextLimit += lenDelta;
460    // Restrict new value of start to [minOText, min(oText, pos.limit)].
461    pos.start = uprv_max(minOText, uprv_min(uprv_min(oText, pos.limit), newStart));
462    return U_MATCH;
463}
464
465/**
466 * Create a source string that represents this rule.  Append it to the
467 * given string.
468 */
469UnicodeString& TransliterationRule::toRule(UnicodeString& rule,
470                                           UBool escapeUnprintable) const {
471
472    // Accumulate special characters (and non-specials following them)
473    // into quoteBuf.  Append quoteBuf, within single quotes, when
474    // a non-quoted element must be inserted.
475    UnicodeString str, quoteBuf;
476
477    // Do not emit the braces '{' '}' around the pattern if there
478    // is neither anteContext nor postContext.
479    UBool emitBraces =
480        (anteContext != NULL) || (postContext != NULL);
481
482    // Emit start anchor
483    if ((flags & ANCHOR_START) != 0) {
484        rule.append((UChar)94/*^*/);
485    }
486
487    // Emit the input pattern
488    ICU_Utility::appendToRule(rule, anteContext, escapeUnprintable, quoteBuf);
489
490    if (emitBraces) {
491        ICU_Utility::appendToRule(rule, (UChar) 0x007B /*{*/, TRUE, escapeUnprintable, quoteBuf);
492    }
493
494    ICU_Utility::appendToRule(rule, key, escapeUnprintable, quoteBuf);
495
496    if (emitBraces) {
497        ICU_Utility::appendToRule(rule, (UChar) 0x007D /*}*/, TRUE, escapeUnprintable, quoteBuf);
498    }
499
500    ICU_Utility::appendToRule(rule, postContext, escapeUnprintable, quoteBuf);
501
502    // Emit end anchor
503    if ((flags & ANCHOR_END) != 0) {
504        rule.append((UChar)36/*$*/);
505    }
506
507    ICU_Utility::appendToRule(rule, UnicodeString(TRUE, FORWARD_OP, 3), TRUE, escapeUnprintable, quoteBuf);
508
509    // Emit the output pattern
510
511    ICU_Utility::appendToRule(rule, output->toReplacer()->toReplacerPattern(str, escapeUnprintable),
512                              TRUE, escapeUnprintable, quoteBuf);
513
514    ICU_Utility::appendToRule(rule, (UChar) 0x003B /*;*/, TRUE, escapeUnprintable, quoteBuf);
515
516    return rule;
517}
518
519void TransliterationRule::setData(const TransliterationRuleData* d) {
520    data = d;
521    if (anteContext != NULL) anteContext->setData(d);
522    if (postContext != NULL) postContext->setData(d);
523    if (key != NULL) key->setData(d);
524    // assert(output != NULL);
525    output->setData(d);
526    // Don't have to do segments since they are in the context or key
527}
528
529/**
530 * Union the set of all characters that may be modified by this rule
531 * into the given set.
532 */
533void TransliterationRule::addSourceSetTo(UnicodeSet& toUnionTo) const {
534    int32_t limit = anteContextLength + keyLength;
535    for (int32_t i=anteContextLength; i<limit; ) {
536        UChar32 ch = pattern.char32At(i);
537        i += U16_LENGTH(ch);
538        const UnicodeMatcher* matcher = data->lookupMatcher(ch);
539        if (matcher == NULL) {
540            toUnionTo.add(ch);
541        } else {
542            matcher->addMatchSetTo(toUnionTo);
543        }
544    }
545}
546
547/**
548 * Union the set of all characters that may be emitted by this rule
549 * into the given set.
550 */
551void TransliterationRule::addTargetSetTo(UnicodeSet& toUnionTo) const {
552    output->toReplacer()->addReplacementSetTo(toUnionTo);
553}
554
555U_NAMESPACE_END
556
557#endif /* #if !UCONFIG_NO_TRANSLITERATION */
558
559//eof
560