1/*
2 **********************************************************************
3 *   Copyright (c) 2001-2011, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 **********************************************************************
6 *   Date        Name        Description
7 *   11/19/2001  aliu        Creation.
8 **********************************************************************
9 */
10
11#include "unicode/utypes.h"
12
13#if !UCONFIG_NO_TRANSLITERATION
14
15#include "unicode/uchar.h"
16#include "unicode/utf16.h"
17#include "unesctrn.h"
18#include "util.h"
19
20#include "cmemory.h"
21
22U_NAMESPACE_BEGIN
23
24/**
25 * Special character marking the end of the spec[] array.
26 */
27static const UChar END = 0xFFFF;
28
29// Unicode: "U+10FFFF" hex, min=4, max=6
30static const UChar SPEC_Unicode[] = {
31    2, 0, 16, 4, 6, 85/*U*/, 43/*+*/,
32    END
33};
34
35// Java: "\\uFFFF" hex, min=4, max=4
36static const UChar SPEC_Java[] = {
37    2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
38    END
39};
40
41// C: "\\uFFFF" hex, min=4, max=4; \\U0010FFFF hex, min=8, max=8
42static const UChar SPEC_C[] = {
43    2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
44    2, 0, 16, 8, 8, 92/*\*/, 85/*U*/,
45    END
46};
47
48// XML: "" hex, min=1, max=6
49static const UChar SPEC_XML[] = {
50    3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/,
51    END
52};
53
54// XML10: "" dec, min=1, max=7 (not really "Hex-Any")
55static const UChar SPEC_XML10[] = {
56    2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/,
57    END
58};
59
60// Perl: "\\x{263A}" hex, min=1, max=6
61static const UChar SPEC_Perl[] = {
62    3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/,
63    END
64};
65
66// All: Java, C, Perl, XML, XML10, Unicode
67static const UChar SPEC_Any[] = {
68    2, 0, 16, 4, 6, 85/*U*/, 43/*+*/,                      // Unicode
69    2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,                     // Java
70    2, 0, 16, 8, 8, 92/*\*/, 85/*U*/,                      // C (surrogates)
71    3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/,   // XML
72    2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/,             // XML10
73    3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/, // Perl
74    END
75};
76
77UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnescapeTransliterator)
78
79static UChar* copySpec(const UChar* spec) {
80    int32_t len = 0;
81    while (spec[len] != END) {
82        ++len;
83    }
84    ++len;
85    UChar *result = (UChar *)uprv_malloc(len*sizeof(UChar));
86    // Check for memory allocation error.
87    if (result != NULL) {
88    	uprv_memcpy(result, spec, len*sizeof(result[0]));
89    }
90    return result;
91}
92
93/**
94 * Factory methods.  Ignore the context.
95 */
96static Transliterator* _createUnicode(const UnicodeString& ID, Transliterator::Token /*context*/) {
97    return new UnescapeTransliterator(ID, SPEC_Unicode);
98}
99static Transliterator* _createJava(const UnicodeString& ID, Transliterator::Token /*context*/) {
100    return new UnescapeTransliterator(ID, SPEC_Java);
101}
102static Transliterator* _createC(const UnicodeString& ID, Transliterator::Token /*context*/) {
103    return new UnescapeTransliterator(ID, SPEC_C);
104}
105static Transliterator* _createXML(const UnicodeString& ID, Transliterator::Token /*context*/) {
106    return new UnescapeTransliterator(ID, SPEC_XML);
107}
108static Transliterator* _createXML10(const UnicodeString& ID, Transliterator::Token /*context*/) {
109    return new UnescapeTransliterator(ID, SPEC_XML10);
110}
111static Transliterator* _createPerl(const UnicodeString& ID, Transliterator::Token /*context*/) {
112    return new UnescapeTransliterator(ID, SPEC_Perl);
113}
114static Transliterator* _createAny(const UnicodeString& ID, Transliterator::Token /*context*/) {
115    return new UnescapeTransliterator(ID, SPEC_Any);
116}
117
118/**
119 * Registers standard variants with the system.  Called by
120 * Transliterator during initialization.
121 */
122void UnescapeTransliterator::registerIDs() {
123    Token t = integerToken(0);
124
125    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Unicode"), _createUnicode, t);
126
127    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Java"), _createJava, t);
128
129    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/C"), _createC, t);
130
131    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML"), _createXML, t);
132
133    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML10"), _createXML10, t);
134
135    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Perl"), _createPerl, t);
136
137    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any"), _createAny, t);
138}
139
140/**
141 * Constructor.  Takes the encoded spec array.
142 */
143UnescapeTransliterator::UnescapeTransliterator(const UnicodeString& newID,
144                                               const UChar *newSpec) :
145    Transliterator(newID, NULL)
146{
147    this->spec = copySpec(newSpec);
148}
149
150/**
151 * Copy constructor.
152 */
153UnescapeTransliterator::UnescapeTransliterator(const UnescapeTransliterator& o) :
154    Transliterator(o) {
155    this->spec = copySpec(o.spec);
156}
157
158UnescapeTransliterator::~UnescapeTransliterator() {
159    uprv_free(spec);
160}
161
162/**
163 * Transliterator API.
164 */
165Transliterator* UnescapeTransliterator::clone() const {
166    return new UnescapeTransliterator(*this);
167}
168
169/**
170 * Implements {@link Transliterator#handleTransliterate}.
171 */
172void UnescapeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
173                                                 UBool isIncremental) const {
174    int32_t start = pos.start;
175    int32_t limit = pos.limit;
176    int32_t i, j, ipat;
177
178    while (start < limit) {
179        // Loop over the forms in spec[].  Exit this loop when we
180        // match one of the specs.  Exit the outer loop if a
181        // partial match is detected and isIncremental is true.
182        for (j=0, ipat=0; spec[ipat] != END; ++j) {
183
184            // Read the header
185            int32_t prefixLen = spec[ipat++];
186            int32_t suffixLen = spec[ipat++];
187            int8_t  radix     = (int8_t) spec[ipat++];
188            int32_t minDigits = spec[ipat++];
189            int32_t maxDigits = spec[ipat++];
190
191            // s is a copy of start that is advanced over the
192            // characters as we parse them.
193            int32_t s = start;
194            UBool match = TRUE;
195
196            for (i=0; i<prefixLen; ++i) {
197                if (s >= limit) {
198                    if (i > 0) {
199                        // We've already matched a character.  This is
200                        // a partial match, so we return if in
201                        // incremental mode.  In non-incremental mode,
202                        // go to the next spec.
203                        if (isIncremental) {
204                            goto exit;
205                        }
206                        match = FALSE;
207                        break;
208                    }
209                }
210                UChar c = text.charAt(s++);
211                if (c != spec[ipat + i]) {
212                    match = FALSE;
213                    break;
214                }
215            }
216
217            if (match) {
218                UChar32 u = 0;
219                int32_t digitCount = 0;
220                for (;;) {
221                    if (s >= limit) {
222                        // Check for partial match in incremental mode.
223                        if (s > start && isIncremental) {
224                            goto exit;
225                        }
226                        break;
227                    }
228                    UChar32 ch = text.char32At(s);
229                    int32_t digit = u_digit(ch, radix);
230                    if (digit < 0) {
231                        break;
232                    }
233                    s += U16_LENGTH(ch);
234                    u = (u * radix) + digit;
235                    if (++digitCount == maxDigits) {
236                        break;
237                    }
238                }
239
240                match = (digitCount >= minDigits);
241
242                if (match) {
243                    for (i=0; i<suffixLen; ++i) {
244                        if (s >= limit) {
245                            // Check for partial match in incremental mode.
246                            if (s > start && isIncremental) {
247                                goto exit;
248                            }
249                            match = FALSE;
250                            break;
251                        }
252                        UChar c = text.charAt(s++);
253                        if (c != spec[ipat + prefixLen + i]) {
254                            match = FALSE;
255                            break;
256                        }
257                    }
258
259                    if (match) {
260                        // At this point, we have a match
261                        UnicodeString str(u);
262                        text.handleReplaceBetween(start, s, str);
263                        limit -= s - start - str.length();
264                        // The following break statement leaves the
265                        // loop that is traversing the forms in
266                        // spec[].  We then parse the next input
267                        // character.
268                        break;
269                    }
270                }
271            }
272
273            ipat += prefixLen + suffixLen;
274        }
275
276        if (start < limit) {
277            start += U16_LENGTH(text.char32At(start));
278        }
279    }
280
281  exit:
282    pos.contextLimit += limit - pos.limit;
283    pos.limit = limit;
284    pos.start = start;
285}
286
287U_NAMESPACE_END
288
289#endif /* #if !UCONFIG_NO_TRANSLITERATION */
290
291//eof
292