1/*
2**********************************************************************
3*   Copyright (C) 2001-2008, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5**********************************************************************
6*   Date        Name        Description
7*   06/07/01    aliu        Creation.
8**********************************************************************
9*/
10
11#include "unicode/utypes.h"
12
13#if !UCONFIG_NO_TRANSLITERATION
14
15#include "unicode/unifilt.h"
16#include "unicode/uchar.h"
17#include "unicode/uniset.h"
18#include "name2uni.h"
19#include "cmemory.h"
20#include "uprops.h"
21#include "uinvchar.h"
22#include "util.h"
23
24U_NAMESPACE_BEGIN
25
26UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NameUnicodeTransliterator)
27
28static const UChar OPEN[] = {92,78,126,123,126,0}; // "\N~{~"
29static const UChar OPEN_DELIM  = 92;  // '\\' first char of OPEN
30static const UChar CLOSE_DELIM = 125; // '}'
31static const UChar SPACE       = 32;  // ' '
32
33U_CDECL_BEGIN
34
35// USetAdder implementation
36// Does not use uset.h to reduce code dependencies
37static void U_CALLCONV
38_set_add(USet *set, UChar32 c) {
39    uset_add(set, c);
40}
41
42// These functions aren't used.
43/*static void U_CALLCONV
44_set_addRange(USet *set, UChar32 start, UChar32 end) {
45    ((UnicodeSet *)set)->add(start, end);
46}
47
48static void U_CALLCONV
49_set_addString(USet *set, const UChar *str, int32_t length) {
50    ((UnicodeSet *)set)->add(UnicodeString((UBool)(length<0), str, length));
51}*/
52
53U_CDECL_END
54
55/**
56 * Constructs a transliterator with the default delimiters '{' and
57 * '}'.
58 */
59NameUnicodeTransliterator::NameUnicodeTransliterator(UnicodeFilter* adoptedFilter) :
60    Transliterator(UNICODE_STRING("Name-Any", 8), adoptedFilter) {
61
62    UnicodeSet *legalPtr = &legal;
63    // Get the legal character set
64    USetAdder sa = {
65        (USet *)legalPtr, // USet* == UnicodeSet*
66        _set_add,
67        NULL, // Don't need _set_addRange
68        NULL, // Don't need _set_addString
69        NULL, // Don't need remove()
70        NULL
71    };
72    uprv_getCharNameCharacters(&sa);
73}
74
75/**
76 * Destructor.
77 */
78NameUnicodeTransliterator::~NameUnicodeTransliterator() {}
79
80/**
81 * Copy constructor.
82 */
83NameUnicodeTransliterator::NameUnicodeTransliterator(const NameUnicodeTransliterator& o) :
84    Transliterator(o), legal(o.legal) {}
85
86/**
87 * Assignment operator.
88 */
89/*NameUnicodeTransliterator& NameUnicodeTransliterator::operator=(
90                             const NameUnicodeTransliterator& o) {
91    Transliterator::operator=(o);
92    // not necessary: the legal sets should all be the same -- legal=o.legal;
93    return *this;
94}*/
95
96/**
97 * Transliterator API.
98 */
99Transliterator* NameUnicodeTransliterator::clone(void) const {
100    return new NameUnicodeTransliterator(*this);
101}
102
103/**
104 * Implements {@link Transliterator#handleTransliterate}.
105 */
106void NameUnicodeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
107                                                    UBool isIncremental) const {
108    // The failure mode, here and below, is to behave like Any-Null,
109    // if either there is no name data (max len == 0) or there is no
110    // memory (malloc() => NULL).
111
112    int32_t maxLen = uprv_getMaxCharNameLength();
113    if (maxLen == 0) {
114        offsets.start = offsets.limit;
115        return;
116    }
117
118    // Accomodate the longest possible name
119    ++maxLen; // allow for temporary trailing space
120    char* cbuf = (char*) uprv_malloc(maxLen);
121    if (cbuf == NULL) {
122        offsets.start = offsets.limit;
123        return;
124    }
125
126    UnicodeString openPat(TRUE, OPEN, -1);
127    UnicodeString str, name;
128
129    int32_t cursor = offsets.start;
130    int32_t limit = offsets.limit;
131
132    // Modes:
133    // 0 - looking for open delimiter
134    // 1 - after open delimiter
135    int32_t mode = 0;
136    int32_t openPos = -1; // open delim candidate pos
137
138    UChar32 c;
139    while (cursor < limit) {
140        c = text.char32At(cursor);
141
142        switch (mode) {
143        case 0: // looking for open delimiter
144            if (c == OPEN_DELIM) { // quick check first
145                openPos = cursor;
146                int32_t i =
147                    ICU_Utility::parsePattern(openPat, text, cursor, limit);
148                if (i >= 0 && i < limit) {
149                    mode = 1;
150                    name.truncate(0);
151                    cursor = i;
152                    continue; // *** reprocess char32At(cursor)
153                }
154            }
155            break;
156
157        case 1: // after open delimiter
158            // Look for legal chars.  If \s+ is found, convert it
159            // to a single space.  If closeDelimiter is found, exit
160            // the loop.  If any other character is found, exit the
161            // loop.  If the limit is reached, exit the loop.
162
163            // Convert \s+ => SPACE.  This assumes there are no
164            // runs of >1 space characters in names.
165            if (uprv_isRuleWhiteSpace(c)) {
166                // Ignore leading whitespace
167                if (name.length() > 0 &&
168                    name.charAt(name.length()-1) != SPACE) {
169                    name.append(SPACE);
170                    // If we are too long then abort.  maxLen includes
171                    // temporary trailing space, so use '>'.
172                    if (name.length() > maxLen) {
173                        mode = 0;
174                    }
175                }
176                break;
177            }
178
179            if (c == CLOSE_DELIM) {
180                int32_t len = name.length();
181
182                // Delete trailing space, if any
183                if (len > 0 &&
184                    name.charAt(len-1) == SPACE) {
185                    --len;
186                }
187
188                if (uprv_isInvariantUString(name.getBuffer(), len)) {
189                    name.extract(0, len, cbuf, maxLen, US_INV);
190
191                    UErrorCode status = U_ZERO_ERROR;
192                    c = u_charFromName(U_EXTENDED_CHAR_NAME, cbuf, &status);
193                    if (U_SUCCESS(status)) {
194                        // Lookup succeeded
195
196                        // assert(UTF_CHAR_LENGTH(CLOSE_DELIM) == 1);
197                        cursor++; // advance over CLOSE_DELIM
198
199                        str.truncate(0);
200                        str.append(c);
201                        text.handleReplaceBetween(openPos, cursor, str);
202
203                        // Adjust indices for the change in the length of
204                        // the string.  Do not assume that str.length() ==
205                        // 1, in case of surrogates.
206                        int32_t delta = cursor - openPos - str.length();
207                        cursor -= delta;
208                        limit -= delta;
209                        // assert(cursor == openPos + str.length());
210                    }
211                }
212                // If the lookup failed, we leave things as-is and
213                // still switch to mode 0 and continue.
214                mode = 0;
215                openPos = -1; // close off candidate
216                continue; // *** reprocess char32At(cursor)
217            }
218
219            // Check if c is a legal char.  We assume here that
220            // legal.contains(OPEN_DELIM) is FALSE, so when we abort a
221            // name, we don't have to go back to openPos+1.
222            if (legal.contains(c)) {
223                name.append(c);
224                // If we go past the longest possible name then abort.
225                // maxLen includes temporary trailing space, so use '>='.
226                if (name.length() >= maxLen) {
227                    mode = 0;
228                }
229            }
230
231            // Invalid character
232            else {
233                --cursor; // Backup and reprocess this character
234                mode = 0;
235            }
236
237            break;
238        }
239
240        cursor += UTF_CHAR_LENGTH(c);
241    }
242
243    offsets.contextLimit += limit - offsets.limit;
244    offsets.limit = limit;
245    // In incremental mode, only advance the cursor up to the last
246    // open delimiter candidate.
247    offsets.start = (isIncremental && openPos >= 0) ? openPos : cursor;
248
249    uprv_free(cbuf);
250}
251
252U_NAMESPACE_END
253
254#endif /* #if !UCONFIG_NO_TRANSLITERATION */
255