1/*
2**********************************************************************
3*   Copyright (C) 2001-2011, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5**********************************************************************
6*   Date        Name        Description
7*   06/07/01    aliu        Creation.
8**********************************************************************
9*/
10
11#include "unicode/utypes.h"
12
13#if !UCONFIG_NO_TRANSLITERATION
14
15#include "unicode/unifilt.h"
16#include "unicode/uchar.h"
17#include "unicode/uniset.h"
18#include "unicode/utf16.h"
19#include "cmemory.h"
20#include "name2uni.h"
21#include "patternprops.h"
22#include "uprops.h"
23#include "uinvchar.h"
24#include "util.h"
25
26U_NAMESPACE_BEGIN
27
28UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NameUnicodeTransliterator)
29
30static const UChar OPEN[] = {92,78,126,123,126,0}; // "\N~{~"
31static const UChar OPEN_DELIM  = 92;  // '\\' first char of OPEN
32static const UChar CLOSE_DELIM = 125; // '}'
33static const UChar SPACE       = 32;  // ' '
34
35U_CDECL_BEGIN
36
37// USetAdder implementation
38// Does not use uset.h to reduce code dependencies
39static void U_CALLCONV
40_set_add(USet *set, UChar32 c) {
41    uset_add(set, c);
42}
43
44// These functions aren't used.
45/*static void U_CALLCONV
46_set_addRange(USet *set, UChar32 start, UChar32 end) {
47    ((UnicodeSet *)set)->add(start, end);
48}
49
50static void U_CALLCONV
51_set_addString(USet *set, const UChar *str, int32_t length) {
52    ((UnicodeSet *)set)->add(UnicodeString((UBool)(length<0), str, length));
53}*/
54
55U_CDECL_END
56
57/**
58 * Constructs a transliterator with the default delimiters '{' and
59 * '}'.
60 */
61NameUnicodeTransliterator::NameUnicodeTransliterator(UnicodeFilter* adoptedFilter) :
62    Transliterator(UNICODE_STRING("Name-Any", 8), adoptedFilter) {
63
64    UnicodeSet *legalPtr = &legal;
65    // Get the legal character set
66    USetAdder sa = {
67        (USet *)legalPtr, // USet* == UnicodeSet*
68        _set_add,
69        NULL, // Don't need _set_addRange
70        NULL, // Don't need _set_addString
71        NULL, // Don't need remove()
72        NULL
73    };
74    uprv_getCharNameCharacters(&sa);
75}
76
77/**
78 * Destructor.
79 */
80NameUnicodeTransliterator::~NameUnicodeTransliterator() {}
81
82/**
83 * Copy constructor.
84 */
85NameUnicodeTransliterator::NameUnicodeTransliterator(const NameUnicodeTransliterator& o) :
86    Transliterator(o), legal(o.legal) {}
87
88/**
89 * Assignment operator.
90 */
91/*NameUnicodeTransliterator& NameUnicodeTransliterator::operator=(
92                             const NameUnicodeTransliterator& o) {
93    Transliterator::operator=(o);
94    // not necessary: the legal sets should all be the same -- legal=o.legal;
95    return *this;
96}*/
97
98/**
99 * Transliterator API.
100 */
101Transliterator* NameUnicodeTransliterator::clone(void) const {
102    return new NameUnicodeTransliterator(*this);
103}
104
105/**
106 * Implements {@link Transliterator#handleTransliterate}.
107 */
108void NameUnicodeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
109                                                    UBool isIncremental) const {
110    // The failure mode, here and below, is to behave like Any-Null,
111    // if either there is no name data (max len == 0) or there is no
112    // memory (malloc() => NULL).
113
114    int32_t maxLen = uprv_getMaxCharNameLength();
115    if (maxLen == 0) {
116        offsets.start = offsets.limit;
117        return;
118    }
119
120    // Accomodate the longest possible name
121    ++maxLen; // allow for temporary trailing space
122    char* cbuf = (char*) uprv_malloc(maxLen);
123    if (cbuf == NULL) {
124        offsets.start = offsets.limit;
125        return;
126    }
127
128    UnicodeString openPat(TRUE, OPEN, -1);
129    UnicodeString str, name;
130
131    int32_t cursor = offsets.start;
132    int32_t limit = offsets.limit;
133
134    // Modes:
135    // 0 - looking for open delimiter
136    // 1 - after open delimiter
137    int32_t mode = 0;
138    int32_t openPos = -1; // open delim candidate pos
139
140    UChar32 c;
141    while (cursor < limit) {
142        c = text.char32At(cursor);
143
144        switch (mode) {
145        case 0: // looking for open delimiter
146            if (c == OPEN_DELIM) { // quick check first
147                openPos = cursor;
148                int32_t i =
149                    ICU_Utility::parsePattern(openPat, text, cursor, limit);
150                if (i >= 0 && i < limit) {
151                    mode = 1;
152                    name.truncate(0);
153                    cursor = i;
154                    continue; // *** reprocess char32At(cursor)
155                }
156            }
157            break;
158
159        case 1: // after open delimiter
160            // Look for legal chars.  If \s+ is found, convert it
161            // to a single space.  If closeDelimiter is found, exit
162            // the loop.  If any other character is found, exit the
163            // loop.  If the limit is reached, exit the loop.
164
165            // Convert \s+ => SPACE.  This assumes there are no
166            // runs of >1 space characters in names.
167            if (PatternProps::isWhiteSpace(c)) {
168                // Ignore leading whitespace
169                if (name.length() > 0 &&
170                    name.charAt(name.length()-1) != SPACE) {
171                    name.append(SPACE);
172                    // If we are too long then abort.  maxLen includes
173                    // temporary trailing space, so use '>'.
174                    if (name.length() > maxLen) {
175                        mode = 0;
176                    }
177                }
178                break;
179            }
180
181            if (c == CLOSE_DELIM) {
182                int32_t len = name.length();
183
184                // Delete trailing space, if any
185                if (len > 0 &&
186                    name.charAt(len-1) == SPACE) {
187                    --len;
188                }
189
190                if (uprv_isInvariantUString(name.getBuffer(), len)) {
191                    name.extract(0, len, cbuf, maxLen, US_INV);
192
193                    UErrorCode status = U_ZERO_ERROR;
194                    c = u_charFromName(U_EXTENDED_CHAR_NAME, cbuf, &status);
195                    if (U_SUCCESS(status)) {
196                        // Lookup succeeded
197
198                        // assert(U16_LENGTH(CLOSE_DELIM) == 1);
199                        cursor++; // advance over CLOSE_DELIM
200
201                        str.truncate(0);
202                        str.append(c);
203                        text.handleReplaceBetween(openPos, cursor, str);
204
205                        // Adjust indices for the change in the length of
206                        // the string.  Do not assume that str.length() ==
207                        // 1, in case of surrogates.
208                        int32_t delta = cursor - openPos - str.length();
209                        cursor -= delta;
210                        limit -= delta;
211                        // assert(cursor == openPos + str.length());
212                    }
213                }
214                // If the lookup failed, we leave things as-is and
215                // still switch to mode 0 and continue.
216                mode = 0;
217                openPos = -1; // close off candidate
218                continue; // *** reprocess char32At(cursor)
219            }
220
221            // Check if c is a legal char.  We assume here that
222            // legal.contains(OPEN_DELIM) is FALSE, so when we abort a
223            // name, we don't have to go back to openPos+1.
224            if (legal.contains(c)) {
225                name.append(c);
226                // If we go past the longest possible name then abort.
227                // maxLen includes temporary trailing space, so use '>='.
228                if (name.length() >= maxLen) {
229                    mode = 0;
230                }
231            }
232
233            // Invalid character
234            else {
235                --cursor; // Backup and reprocess this character
236                mode = 0;
237            }
238
239            break;
240        }
241
242        cursor += U16_LENGTH(c);
243    }
244
245    offsets.contextLimit += limit - offsets.limit;
246    offsets.limit = limit;
247    // In incremental mode, only advance the cursor up to the last
248    // open delimiter candidate.
249    offsets.start = (isIncremental && openPos >= 0) ? openPos : cursor;
250
251    uprv_free(cbuf);
252}
253
254U_NAMESPACE_END
255
256#endif /* #if !UCONFIG_NO_TRANSLITERATION */
257