1/*
2 **********************************************************************
3 *   Copyright (c) 2001-2008, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 **********************************************************************
6 *   Date        Name        Description
7 *   11/19/2001  aliu        Creation.
8 **********************************************************************
9 */
10
11#include "unicode/utypes.h"
12
13#if !UCONFIG_NO_TRANSLITERATION
14
15#include "unicode/uchar.h"
16#include "unesctrn.h"
17#include "util.h"
18
19#include "cmemory.h"
20
21U_NAMESPACE_BEGIN
22
/**
 * Special character marking the end of the spec[] array.
 */
static const UChar END = 0xFFFF;

// Layout of a spec[] array, as read by handleTransliterate(): a sequence
// of forms terminated by END.  Each form consists of a five-element header
//     prefixLen, suffixLen, radix, minDigits, maxDigits
// followed by prefixLen prefix characters and then suffixLen suffix
// characters.  Characters are stored as numeric code units; the character
// each one stands for is shown in the adjacent /*...*/ comment.

// Unicode: "U+10FFFF" hex, min=4, max=6
static const UChar SPEC_Unicode[] = {
    2, 0, 16, 4, 6, 85/*U*/, 43/*+*/,
    END
};

// Java: "\\uFFFF" hex, min=4, max=4
static const UChar SPEC_Java[] = {
    2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
    END
};

// C: "\\uFFFF" hex, min=4, max=4; "\\U0010FFFF" hex, min=8, max=8
static const UChar SPEC_C[] = {
    2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
    2, 0, 16, 8, 8, 92/*\*/, 85/*U*/,
    END
};

// XML: "&#x10FFFF;" hex, min=1, max=6
static const UChar SPEC_XML[] = {
    3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/,
    END
};

// XML10: "&#1114111;" dec, min=1, max=7 (not really "Hex-Any")
static const UChar SPEC_XML10[] = {
    2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/,
    END
};

// Perl: "\\x{263A}" hex, min=1, max=6
static const UChar SPEC_Perl[] = {
    3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/,
    END
};

// All: Unicode, Java, C, XML, XML10, Perl.  handleTransliterate() tries
// the forms in the order they are listed here.
static const UChar SPEC_Any[] = {
    2, 0, 16, 4, 6, 85/*U*/, 43/*+*/,                      // Unicode
    2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,                     // Java
    2, 0, 16, 8, 8, 92/*\*/, 85/*U*/,                      // C (8-digit form)
    3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/,   // XML
    2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/,             // XML10
    3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/, // Perl
    END
};
75
// Emits the RTTI boilerplate for UnescapeTransliterator.
// NOTE(review): the macro is defined outside this file; presumably it
// supplies the class-ID functions declared for the class -- confirm in
// the header that defines it.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnescapeTransliterator)
77
78static UChar* copySpec(const UChar* spec) {
79    int32_t len = 0;
80    while (spec[len] != END) {
81        ++len;
82    }
83    ++len;
84    UChar *result = (UChar *)uprv_malloc(len*sizeof(UChar));
85    // Check for memory allocation error.
86    if (result != NULL) {
87    	uprv_memcpy(result, spec, len*sizeof(result[0]));
88    }
89    return result;
90}
91
92/**
93 * Factory methods.  Ignore the context.
94 */
95static Transliterator* _createUnicode(const UnicodeString& ID, Transliterator::Token /*context*/) {
96    return new UnescapeTransliterator(ID, SPEC_Unicode);
97}
98static Transliterator* _createJava(const UnicodeString& ID, Transliterator::Token /*context*/) {
99    return new UnescapeTransliterator(ID, SPEC_Java);
100}
101static Transliterator* _createC(const UnicodeString& ID, Transliterator::Token /*context*/) {
102    return new UnescapeTransliterator(ID, SPEC_C);
103}
104static Transliterator* _createXML(const UnicodeString& ID, Transliterator::Token /*context*/) {
105    return new UnescapeTransliterator(ID, SPEC_XML);
106}
107static Transliterator* _createXML10(const UnicodeString& ID, Transliterator::Token /*context*/) {
108    return new UnescapeTransliterator(ID, SPEC_XML10);
109}
110static Transliterator* _createPerl(const UnicodeString& ID, Transliterator::Token /*context*/) {
111    return new UnescapeTransliterator(ID, SPEC_Perl);
112}
113static Transliterator* _createAny(const UnicodeString& ID, Transliterator::Token /*context*/) {
114    return new UnescapeTransliterator(ID, SPEC_Any);
115}
116
117/**
118 * Registers standard variants with the system.  Called by
119 * Transliterator during initialization.
120 */
121void UnescapeTransliterator::registerIDs() {
122    Token t = integerToken(0);
123
124    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Unicode"), _createUnicode, t);
125
126    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Java"), _createJava, t);
127
128    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/C"), _createC, t);
129
130    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML"), _createXML, t);
131
132    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML10"), _createXML10, t);
133
134    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Perl"), _createPerl, t);
135
136    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any"), _createAny, t);
137}
138
139/**
140 * Constructor.  Takes the encoded spec array.
141 */
142UnescapeTransliterator::UnescapeTransliterator(const UnicodeString& newID,
143                                               const UChar *newSpec) :
144    Transliterator(newID, NULL)
145{
146    this->spec = copySpec(newSpec);
147}
148
149/**
150 * Copy constructor.
151 */
152UnescapeTransliterator::UnescapeTransliterator(const UnescapeTransliterator& o) :
153    Transliterator(o) {
154    this->spec = copySpec(o.spec);
155}
156
157UnescapeTransliterator::~UnescapeTransliterator() {
158    uprv_free(spec);
159}
160
161/**
162 * Transliterator API.
163 */
164Transliterator* UnescapeTransliterator::clone() const {
165    return new UnescapeTransliterator(*this);
166}
167
168/**
169 * Implements {@link Transliterator#handleTransliterate}.
170 */
171void UnescapeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
172                                                 UBool isIncremental) const {
173    int32_t start = pos.start;
174    int32_t limit = pos.limit;
175    int32_t i, j, ipat;
176
177    while (start < limit) {
178        // Loop over the forms in spec[].  Exit this loop when we
179        // match one of the specs.  Exit the outer loop if a
180        // partial match is detected and isIncremental is true.
181        for (j=0, ipat=0; spec[ipat] != END; ++j) {
182
183            // Read the header
184            int32_t prefixLen = spec[ipat++];
185            int32_t suffixLen = spec[ipat++];
186            int8_t  radix     = (int8_t) spec[ipat++];
187            int32_t minDigits = spec[ipat++];
188            int32_t maxDigits = spec[ipat++];
189
190            // s is a copy of start that is advanced over the
191            // characters as we parse them.
192            int32_t s = start;
193            UBool match = TRUE;
194
195            for (i=0; i<prefixLen; ++i) {
196                if (s >= limit) {
197                    if (i > 0) {
198                        // We've already matched a character.  This is
199                        // a partial match, so we return if in
200                        // incremental mode.  In non-incremental mode,
201                        // go to the next spec.
202                        if (isIncremental) {
203                            goto exit;
204                        }
205                        match = FALSE;
206                        break;
207                    }
208                }
209                UChar c = text.charAt(s++);
210                if (c != spec[ipat + i]) {
211                    match = FALSE;
212                    break;
213                }
214            }
215
216            if (match) {
217                UChar32 u = 0;
218                int32_t digitCount = 0;
219                for (;;) {
220                    if (s >= limit) {
221                        // Check for partial match in incremental mode.
222                        if (s > start && isIncremental) {
223                            goto exit;
224                        }
225                        break;
226                    }
227                    UChar32 ch = text.char32At(s);
228                    int32_t digit = u_digit(ch, radix);
229                    if (digit < 0) {
230                        break;
231                    }
232                    s += UTF_CHAR_LENGTH(ch);
233                    u = (u * radix) + digit;
234                    if (++digitCount == maxDigits) {
235                        break;
236                    }
237                }
238
239                match = (digitCount >= minDigits);
240
241                if (match) {
242                    for (i=0; i<suffixLen; ++i) {
243                        if (s >= limit) {
244                            // Check for partial match in incremental mode.
245                            if (s > start && isIncremental) {
246                                goto exit;
247                            }
248                            match = FALSE;
249                            break;
250                        }
251                        UChar c = text.charAt(s++);
252                        if (c != spec[ipat + prefixLen + i]) {
253                            match = FALSE;
254                            break;
255                        }
256                    }
257
258                    if (match) {
259                        // At this point, we have a match
260                        UnicodeString str(u);
261                        text.handleReplaceBetween(start, s, str);
262                        limit -= s - start - str.length();
263                        // The following break statement leaves the
264                        // loop that is traversing the forms in
265                        // spec[].  We then parse the next input
266                        // character.
267                        break;
268                    }
269                }
270            }
271
272            ipat += prefixLen + suffixLen;
273        }
274
275        if (start < limit) {
276            start += UTF_CHAR_LENGTH(text.char32At(start));
277        }
278    }
279
280  exit:
281    pos.contextLimit += limit - pos.limit;
282    pos.limit = limit;
283    pos.start = start;
284}
285
286U_NAMESPACE_END
287
288#endif /* #if !UCONFIG_NO_TRANSLITERATION */
289
290//eof
291