1/*
2**********************************************************************
3*   Copyright (c) 2002-2004, International Business Machines Corporation
4*   and others.  All Rights Reserved.
5**********************************************************************
6*   Date        Name        Description
7*   01/21/2002  aliu        Creation.
8**********************************************************************
9*/
10
11#include "unicode/utypes.h"
12
13#if !UCONFIG_NO_TRANSLITERATION
14
15#include "strrepl.h"
16#include "rbt_data.h"
17#include "util.h"
18#include "unicode/uniset.h"
19
20U_NAMESPACE_BEGIN
21
// Zero-length UChar string (a single terminating 0).  replace() passes it
// to handleReplaceBetween() when a range of text is to be deleted.
static const UChar EMPTY[] = { 0 }; // empty string: ""

// Out-of-line definition of the UnicodeReplacer destructor; the body is
// empty.
UnicodeReplacer::~UnicodeReplacer() {}
// Judging by its name, this macro expands to the ICU run-time type
// identification boilerplate for StringReplacer -- see its definition
// for the exact members it provides.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringReplacer)
26
27/**
28 * Construct a StringReplacer that sets the emits the given output
29 * text and sets the cursor to the given position.
30 * @param theOutput text that will replace input text when the
31 * replace() method is called.  May contain stand-in characters
32 * that represent nested replacers.
33 * @param theCursorPos cursor position that will be returned by
34 * the replace() method
35 * @param theData transliterator context object that translates
36 * stand-in characters to UnicodeReplacer objects
37 */
38StringReplacer::StringReplacer(const UnicodeString& theOutput,
39                               int32_t theCursorPos,
40                               const TransliterationRuleData* theData) {
41    output = theOutput;
42    cursorPos = theCursorPos;
43    hasCursor = TRUE;
44    data = theData;
45    isComplex = TRUE;
46}
47
48/**
49 * Construct a StringReplacer that sets the emits the given output
50 * text and does not modify the cursor.
51 * @param theOutput text that will replace input text when the
52 * replace() method is called.  May contain stand-in characters
53 * that represent nested replacers.
54 * @param theData transliterator context object that translates
55 * stand-in characters to UnicodeReplacer objects
56 */
57StringReplacer::StringReplacer(const UnicodeString& theOutput,
58                               const TransliterationRuleData* theData) {
59    output = theOutput;
60    cursorPos = 0;
61    hasCursor = FALSE;
62    data = theData;
63    isComplex = TRUE;
64}
65
66/**
67 * Copy constructor.
68 */
69StringReplacer::StringReplacer(const StringReplacer& other) :
70    UnicodeFunctor(other),
71    UnicodeReplacer(other)
72{
73    output = other.output;
74    cursorPos = other.cursorPos;
75    hasCursor = other.hasCursor;
76    data = other.data;
77    isComplex = other.isComplex;
78}
79
80/**
81 * Destructor
82 */
83StringReplacer::~StringReplacer() {
84}
85
86/**
87 * Implement UnicodeFunctor
88 */
89UnicodeFunctor* StringReplacer::clone() const {
90    return new StringReplacer(*this);
91}
92
93/**
94 * Implement UnicodeFunctor
95 */
96UnicodeReplacer* StringReplacer::toReplacer() const {
97    return (UnicodeReplacer*) this;
98}
99
/**
 * UnicodeReplacer API.
 * Replaces the range [start, limit) of 'text' with this replacer's
 * output.  Stand-in characters in the output that 'data' maps to a
 * nested UnicodeReplacer are expanded by delegating to that replacer.
 * @param text the text being modified
 * @param start start index of the range to replace (the key)
 * @param limit limit index of the range to replace
 * @param cursor passed through to nested replacers; when this replacer
 * has a cursor it is overwritten with the new cursor position
 * @return the length of the text that replaced [start, limit)
 */
int32_t StringReplacer::replace(Replaceable& text,
                                int32_t start,
                                int32_t limit,
                                int32_t& cursor) {
    int32_t outLen;
    // Cursor position relative to 'start' while the output is being
    // generated; converted to an absolute index at the end.
    int32_t newStart = 0;

    // NOTE: It should be possible to _always_ run the complex
    // processing code; just slower.  If not, then there is a bug
    // in the complex processing code.

    // Simple (no nested replacers) Processing Code :
    if (!isComplex) {
        text.handleReplaceBetween(start, limit, output);
        outLen = output.length();

        // Setup default cursor position (for cursorPos within output)
        newStart = cursorPos;
    }

    // Complex (nested replacers) Processing Code :
    else {
        /* When there are segments to be copied, use the Replaceable.copy()
         * API in order to retain out-of-band data.  Copy everything to the
         * end of the string, then copy them back over the key.  This preserves
         * the integrity of indices into the key and surrounding context while
         * generating the output text.
         */
        UnicodeString buf;
        int32_t oOutput; // offset into 'output'
        // Assume the output is simple until a nested replacer is found
        // below.  If none is found the flag stays FALSE and later calls
        // take the simple path above.
        isComplex = FALSE;

        // The temporary buffer starts at tempStart, and extends
        // to destLimit.  The start of the buffer has a single
        // character from before the key.  This provides style
        // data when additional characters are filled into the
        // temporary buffer.  If there is nothing to the left, use
        // the non-character U+FFFF, which Replaceable subclasses
        // should treat specially as a "no-style character."
        // destStart points to the point after the style context
        // character, so it is tempStart+1 or tempStart+2.
        int32_t tempStart = text.length(); // start of temp buffer
        int32_t destStart = tempStart; // copy new text to here
        if (start > 0) {
            // Copy the whole code point that precedes the key (one or
            // two code units).
            int32_t len = UTF_CHAR_LENGTH(text.char32At(start-1));
            text.copy(start-len, start, tempStart);
            destStart += len;
        } else {
            UnicodeString str((UChar) 0xFFFF);
            text.handleReplaceBetween(tempStart, tempStart, str);
            destStart++;
        }
        int32_t destLimit = destStart;

        // Walk the output one code point at a time.  Plain text is
        // gathered in buf; a stand-in for a nested replacer first flushes
        // buf into the temporary buffer and then lets the replacer
        // generate its own text there.
        for (oOutput=0; oOutput<output.length(); ) {
            if (oOutput == cursorPos) {
                // Record the position of the cursor
                newStart = destLimit - destStart; // relative to start
            }
            UChar32 c = output.char32At(oOutput);
            UnicodeReplacer* r = data->lookupReplacer(c);
            if (r == NULL) {
                // Accumulate straight (non-segment) text.
                buf.append(c);
            } else {
                isComplex = TRUE;

                // Insert any accumulated straight text.
                if (buf.length() > 0) {
                    text.handleReplaceBetween(destLimit, destLimit, buf);
                    destLimit += buf.length();
                    buf.truncate(0);
                }

                // Delegate output generation to replacer object
                int32_t len = r->replace(text, destLimit, destLimit, cursor);
                destLimit += len;
            }
            oOutput += UTF_CHAR_LENGTH(c);
        }
        // Insert any accumulated straight text.
        if (buf.length() > 0) {
            text.handleReplaceBetween(destLimit, destLimit, buf);
            destLimit += buf.length();
        }
        // A cursor sitting exactly at the end of the output is not seen
        // inside the loop, so it is handled here.
        if (oOutput == cursorPos) {
            // Record the position of the cursor
            newStart = destLimit - destStart; // relative to start
        }

        outLen = destLimit - destStart;

        // Copy new text to start, and delete it
        // The copy adds outLen code units at 'start', so the temporary
        // buffer and the old key now sit outLen positions further right;
        // the two deletions below use the shifted indices.
        text.copy(destStart, destLimit, start);
        text.handleReplaceBetween(tempStart + outLen, destLimit + outLen, EMPTY);

        // Delete the old text (the key)
        text.handleReplaceBetween(start + outLen, limit + outLen, EMPTY);
    }

    if (hasCursor) {
        // Adjust the cursor for positions outside the key.  These
        // refer to code points rather than code units.  If cursorPos
        // is within the output string, then use newStart, which has
        // already been set above.
        if (cursorPos < 0) {
            newStart = start;
            int32_t n = cursorPos;
            // Outside the output string, cursorPos counts code points
            while (n < 0 && newStart > 0) {
                newStart -= UTF_CHAR_LENGTH(text.char32At(newStart-1));
                ++n;
            }
            // If the start of the text was reached first, n is still
            // negative; the remainder is applied as-is, which leaves
            // newStart below zero.
            newStart += n;
        } else if (cursorPos > output.length()) {
            newStart = start + outLen;
            int32_t n = cursorPos - output.length();
            // Outside the output string, cursorPos counts code points
            while (n > 0 && newStart < text.length()) {
                newStart += UTF_CHAR_LENGTH(text.char32At(newStart));
                --n;
            }
            // If the end of the text was reached first, n is still
            // positive; the remainder is applied as-is, which leaves
            // newStart past text.length().
            newStart += n;
        } else {
            // Cursor is within output string.  It has been set up above
            // to be relative to start.
            newStart += start;
        }

        cursor = newStart;
    }

    return outLen;
}
237
238/**
239 * UnicodeReplacer API
240 */
241UnicodeString& StringReplacer::toReplacerPattern(UnicodeString& rule,
242                                                 UBool escapeUnprintable) const {
243    rule.truncate(0);
244    UnicodeString quoteBuf;
245
246    int32_t cursor = cursorPos;
247
248    // Handle a cursor preceding the output
249    if (hasCursor && cursor < 0) {
250        while (cursor++ < 0) {
251            ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf);
252        }
253        // Fall through and append '|' below
254    }
255
256    for (int32_t i=0; i<output.length(); ++i) {
257        if (hasCursor && i == cursor) {
258            ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf);
259        }
260        UChar c = output.charAt(i); // Ok to use 16-bits here
261
262        UnicodeReplacer* r = data->lookupReplacer(c);
263        if (r == NULL) {
264            ICU_Utility::appendToRule(rule, c, FALSE, escapeUnprintable, quoteBuf);
265        } else {
266            UnicodeString buf;
267            r->toReplacerPattern(buf, escapeUnprintable);
268            buf.insert(0, (UChar)0x20);
269            buf.append((UChar)0x20);
270            ICU_Utility::appendToRule(rule, buf,
271                                      TRUE, escapeUnprintable, quoteBuf);
272        }
273    }
274
275    // Handle a cursor after the output.  Use > rather than >= because
276    // if cursor == output.length() it is at the end of the output,
277    // which is the default position, so we need not emit it.
278    if (hasCursor && cursor > output.length()) {
279        cursor -= output.length();
280        while (cursor-- > 0) {
281            ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf);
282        }
283        ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf);
284    }
285    // Flush quoteBuf out to result
286    ICU_Utility::appendToRule(rule, -1,
287                              TRUE, escapeUnprintable, quoteBuf);
288
289    return rule;
290}
291
292/**
293 * Implement UnicodeReplacer
294 */
295void StringReplacer::addReplacementSetTo(UnicodeSet& toUnionTo) const {
296    UChar32 ch;
297    for (int32_t i=0; i<output.length(); i+=UTF_CHAR_LENGTH(ch)) {
298    ch = output.char32At(i);
299    UnicodeReplacer* r = data->lookupReplacer(ch);
300    if (r == NULL) {
301        toUnionTo.add(ch);
302    } else {
303        r->addReplacementSetTo(toUnionTo);
304    }
305    }
306}
307
308/**
309 * UnicodeFunctor API
310 */
311void StringReplacer::setData(const TransliterationRuleData* d) {
312    data = d;
313    int32_t i = 0;
314    while (i<output.length()) {
315        UChar32 c = output.char32At(i);
316        UnicodeFunctor* f = data->lookup(c);
317        if (f != NULL) {
318            f->setData(data);
319        }
320        i += UTF_CHAR_LENGTH(c);
321    }
322}
323
324U_NAMESPACE_END
325
326#endif /* #if !UCONFIG_NO_TRANSLITERATION */
327
328//eof
329