1/*
2**********************************************************************
3*   Copyright (c) 2002-2012, International Business Machines Corporation
4*   and others.  All Rights Reserved.
5**********************************************************************
6*   Date        Name        Description
7*   01/21/2002  aliu        Creation.
8**********************************************************************
9*/
10
11#include "unicode/utypes.h"
12
13#if !UCONFIG_NO_TRANSLITERATION
14
15#include "unicode/uniset.h"
16#include "unicode/utf16.h"
17#include "strrepl.h"
18#include "rbt_data.h"
19#include "util.h"
20
21U_NAMESPACE_BEGIN
22
// Out-of-line definition of the UnicodeReplacer destructor; the body is
// intentionally empty.
UnicodeReplacer::~UnicodeReplacer() {}
// Invokes the UOBJECT_DEFINE_RTTI_IMPLEMENTATION macro for StringReplacer
// (presumably supplies the class-ID/RTTI boilerplate -- see the macro's
// definition for the exact members it generates).
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringReplacer)
25
26/**
27 * Construct a StringReplacer that sets the emits the given output
28 * text and sets the cursor to the given position.
29 * @param theOutput text that will replace input text when the
30 * replace() method is called.  May contain stand-in characters
31 * that represent nested replacers.
32 * @param theCursorPos cursor position that will be returned by
33 * the replace() method
34 * @param theData transliterator context object that translates
35 * stand-in characters to UnicodeReplacer objects
36 */
37StringReplacer::StringReplacer(const UnicodeString& theOutput,
38                               int32_t theCursorPos,
39                               const TransliterationRuleData* theData) {
40    output = theOutput;
41    cursorPos = theCursorPos;
42    hasCursor = TRUE;
43    data = theData;
44    isComplex = TRUE;
45}
46
47/**
48 * Construct a StringReplacer that sets the emits the given output
49 * text and does not modify the cursor.
50 * @param theOutput text that will replace input text when the
51 * replace() method is called.  May contain stand-in characters
52 * that represent nested replacers.
53 * @param theData transliterator context object that translates
54 * stand-in characters to UnicodeReplacer objects
55 */
56StringReplacer::StringReplacer(const UnicodeString& theOutput,
57                               const TransliterationRuleData* theData) {
58    output = theOutput;
59    cursorPos = 0;
60    hasCursor = FALSE;
61    data = theData;
62    isComplex = TRUE;
63}
64
65/**
66 * Copy constructor.
67 */
68StringReplacer::StringReplacer(const StringReplacer& other) :
69    UnicodeFunctor(other),
70    UnicodeReplacer(other)
71{
72    output = other.output;
73    cursorPos = other.cursorPos;
74    hasCursor = other.hasCursor;
75    data = other.data;
76    isComplex = other.isComplex;
77}
78
79/**
80 * Destructor
81 */
82StringReplacer::~StringReplacer() {
83}
84
85/**
86 * Implement UnicodeFunctor
87 */
88UnicodeFunctor* StringReplacer::clone() const {
89    return new StringReplacer(*this);
90}
91
92/**
93 * Implement UnicodeFunctor
94 */
95UnicodeReplacer* StringReplacer::toReplacer() const {
96  return const_cast<StringReplacer *>(this);
97}
98
99/**
100 * UnicodeReplacer API
101 */
102int32_t StringReplacer::replace(Replaceable& text,
103                                int32_t start,
104                                int32_t limit,
105                                int32_t& cursor) {
106    int32_t outLen;
107    int32_t newStart = 0;
108
109    // NOTE: It should be possible to _always_ run the complex
110    // processing code; just slower.  If not, then there is a bug
111    // in the complex processing code.
112
113    // Simple (no nested replacers) Processing Code :
114    if (!isComplex) {
115        text.handleReplaceBetween(start, limit, output);
116        outLen = output.length();
117
118        // Setup default cursor position (for cursorPos within output)
119        newStart = cursorPos;
120    }
121
122    // Complex (nested replacers) Processing Code :
123    else {
124        /* When there are segments to be copied, use the Replaceable.copy()
125         * API in order to retain out-of-band data.  Copy everything to the
126         * end of the string, then copy them back over the key.  This preserves
127         * the integrity of indices into the key and surrounding context while
128         * generating the output text.
129         */
130        UnicodeString buf;
131        int32_t oOutput; // offset into 'output'
132        isComplex = FALSE;
133
134        // The temporary buffer starts at tempStart, and extends
135        // to destLimit.  The start of the buffer has a single
136        // character from before the key.  This provides style
137        // data when addition characters are filled into the
138        // temporary buffer.  If there is nothing to the left, use
139        // the non-character U+FFFF, which Replaceable subclasses
140        // should treat specially as a "no-style character."
141        // destStart points to the point after the style context
142        // character, so it is tempStart+1 or tempStart+2.
143        int32_t tempStart = text.length(); // start of temp buffer
144        int32_t destStart = tempStart; // copy new text to here
145        if (start > 0) {
146            int32_t len = U16_LENGTH(text.char32At(start-1));
147            text.copy(start-len, start, tempStart);
148            destStart += len;
149        } else {
150            UnicodeString str((UChar) 0xFFFF);
151            text.handleReplaceBetween(tempStart, tempStart, str);
152            destStart++;
153        }
154        int32_t destLimit = destStart;
155
156        for (oOutput=0; oOutput<output.length(); ) {
157            if (oOutput == cursorPos) {
158                // Record the position of the cursor
159                newStart = destLimit - destStart; // relative to start
160            }
161            UChar32 c = output.char32At(oOutput);
162            UnicodeReplacer* r = data->lookupReplacer(c);
163            if (r == NULL) {
164                // Accumulate straight (non-segment) text.
165                buf.append(c);
166            } else {
167                isComplex = TRUE;
168
169                // Insert any accumulated straight text.
170                if (buf.length() > 0) {
171                    text.handleReplaceBetween(destLimit, destLimit, buf);
172                    destLimit += buf.length();
173                    buf.truncate(0);
174                }
175
176                // Delegate output generation to replacer object
177                int32_t len = r->replace(text, destLimit, destLimit, cursor);
178                destLimit += len;
179            }
180            oOutput += U16_LENGTH(c);
181        }
182        // Insert any accumulated straight text.
183        if (buf.length() > 0) {
184            text.handleReplaceBetween(destLimit, destLimit, buf);
185            destLimit += buf.length();
186        }
187        if (oOutput == cursorPos) {
188            // Record the position of the cursor
189            newStart = destLimit - destStart; // relative to start
190        }
191
192        outLen = destLimit - destStart;
193
194        // Copy new text to start, and delete it
195        text.copy(destStart, destLimit, start);
196        text.handleReplaceBetween(tempStart + outLen, destLimit + outLen, UnicodeString());
197
198        // Delete the old text (the key)
199        text.handleReplaceBetween(start + outLen, limit + outLen, UnicodeString());
200    }
201
202    if (hasCursor) {
203        // Adjust the cursor for positions outside the key.  These
204        // refer to code points rather than code units.  If cursorPos
205        // is within the output string, then use newStart, which has
206        // already been set above.
207        if (cursorPos < 0) {
208            newStart = start;
209            int32_t n = cursorPos;
210            // Outside the output string, cursorPos counts code points
211            while (n < 0 && newStart > 0) {
212                newStart -= U16_LENGTH(text.char32At(newStart-1));
213                ++n;
214            }
215            newStart += n;
216        } else if (cursorPos > output.length()) {
217            newStart = start + outLen;
218            int32_t n = cursorPos - output.length();
219            // Outside the output string, cursorPos counts code points
220            while (n > 0 && newStart < text.length()) {
221                newStart += U16_LENGTH(text.char32At(newStart));
222                --n;
223            }
224            newStart += n;
225        } else {
226            // Cursor is within output string.  It has been set up above
227            // to be relative to start.
228            newStart += start;
229        }
230
231        cursor = newStart;
232    }
233
234    return outLen;
235}
236
237/**
238 * UnicodeReplacer API
239 */
240UnicodeString& StringReplacer::toReplacerPattern(UnicodeString& rule,
241                                                 UBool escapeUnprintable) const {
242    rule.truncate(0);
243    UnicodeString quoteBuf;
244
245    int32_t cursor = cursorPos;
246
247    // Handle a cursor preceding the output
248    if (hasCursor && cursor < 0) {
249        while (cursor++ < 0) {
250            ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf);
251        }
252        // Fall through and append '|' below
253    }
254
255    for (int32_t i=0; i<output.length(); ++i) {
256        if (hasCursor && i == cursor) {
257            ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf);
258        }
259        UChar c = output.charAt(i); // Ok to use 16-bits here
260
261        UnicodeReplacer* r = data->lookupReplacer(c);
262        if (r == NULL) {
263            ICU_Utility::appendToRule(rule, c, FALSE, escapeUnprintable, quoteBuf);
264        } else {
265            UnicodeString buf;
266            r->toReplacerPattern(buf, escapeUnprintable);
267            buf.insert(0, (UChar)0x20);
268            buf.append((UChar)0x20);
269            ICU_Utility::appendToRule(rule, buf,
270                                      TRUE, escapeUnprintable, quoteBuf);
271        }
272    }
273
274    // Handle a cursor after the output.  Use > rather than >= because
275    // if cursor == output.length() it is at the end of the output,
276    // which is the default position, so we need not emit it.
277    if (hasCursor && cursor > output.length()) {
278        cursor -= output.length();
279        while (cursor-- > 0) {
280            ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf);
281        }
282        ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf);
283    }
284    // Flush quoteBuf out to result
285    ICU_Utility::appendToRule(rule, -1,
286                              TRUE, escapeUnprintable, quoteBuf);
287
288    return rule;
289}
290
291/**
292 * Implement UnicodeReplacer
293 */
294void StringReplacer::addReplacementSetTo(UnicodeSet& toUnionTo) const {
295    UChar32 ch;
296    for (int32_t i=0; i<output.length(); i+=U16_LENGTH(ch)) {
297    ch = output.char32At(i);
298    UnicodeReplacer* r = data->lookupReplacer(ch);
299    if (r == NULL) {
300        toUnionTo.add(ch);
301    } else {
302        r->addReplacementSetTo(toUnionTo);
303    }
304    }
305}
306
307/**
308 * UnicodeFunctor API
309 */
310void StringReplacer::setData(const TransliterationRuleData* d) {
311    data = d;
312    int32_t i = 0;
313    while (i<output.length()) {
314        UChar32 c = output.char32At(i);
315        UnicodeFunctor* f = data->lookup(c);
316        if (f != NULL) {
317            f->setData(data);
318        }
319        i += U16_LENGTH(c);
320    }
321}
322
323U_NAMESPACE_END
324
325#endif /* #if !UCONFIG_NO_TRANSLITERATION */
326
327//eof
328