name2uni.cpp revision ac04d0bbe12b3ef54518635711412f178cb4d16
1/*
2**********************************************************************
3*   Copyright (C) 2001-2007, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5**********************************************************************
6*   Date        Name        Description
7*   06/07/01    aliu        Creation.
8**********************************************************************
9*/
10
11#include "unicode/utypes.h"
12
13#if !UCONFIG_NO_TRANSLITERATION
14
15#include "unicode/unifilt.h"
16#include "unicode/uchar.h"
17#include "unicode/uniset.h"
18#include "name2uni.h"
19#include "cmemory.h"
20#include "uprops.h"
21#include "uinvchar.h"
22#include "util.h"
23
24U_NAMESPACE_BEGIN
25
26UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NameUnicodeTransliterator)
27
28static const UChar OPEN[] = {92,78,126,123,126,0}; // "\N~{~"
29static const UChar OPEN_DELIM  = 92;  // '\\' first char of OPEN
30static const UChar CLOSE_DELIM = 125; // '}'
31static const UChar SPACE       = 32;  // ' '
32
33U_CDECL_BEGIN
34
// USetAdder "add" callback: inserts the single code point c into set
// by forwarding to uset_add().  This is the only callback that the
// NameUnicodeTransliterator constructor installs in its USetAdder.
static void U_CALLCONV
_set_add(USet *set, UChar32 c) {
    uset_add(set, c);
}
41
42// These functions aren't used.
43/*static void U_CALLCONV
44_set_addRange(USet *set, UChar32 start, UChar32 end) {
45    ((UnicodeSet *)set)->add(start, end);
46}
47
48static void U_CALLCONV
49_set_addString(USet *set, const UChar *str, int32_t length) {
50    ((UnicodeSet *)set)->add(UnicodeString((UBool)(length<0), str, length));
51}*/
52
53U_CDECL_END
54
55/**
56 * Constructs a transliterator with the default delimiters '{' and
57 * '}'.
58 */
59NameUnicodeTransliterator::NameUnicodeTransliterator(UnicodeFilter* adoptedFilter) :
60    Transliterator(UNICODE_STRING("Name-Any", 8), adoptedFilter) {
61
62    UnicodeSet *legalPtr = &legal;
63    // Get the legal character set
64    USetAdder sa = {
65        (USet *)legalPtr, // USet* == UnicodeSet*
66        _set_add,
67        NULL, // Don't need _set_addRange
68        NULL, // Don't need _set_addString
69        NULL // Don't need remove()
70    };
71    uprv_getCharNameCharacters(&sa);
72}
73
/**
 * Destructor.  The body is empty: this class performs no explicit
 * cleanup of its own.
 */
NameUnicodeTransliterator::~NameUnicodeTransliterator() {}
78
79/**
80 * Copy constructor.
81 */
82NameUnicodeTransliterator::NameUnicodeTransliterator(const NameUnicodeTransliterator& o) :
83    Transliterator(o), legal(o.legal) {}
84
85/**
86 * Assignment operator.
87 */
88/*NameUnicodeTransliterator& NameUnicodeTransliterator::operator=(
89                             const NameUnicodeTransliterator& o) {
90    Transliterator::operator=(o);
91    // not necessary: the legal sets should all be the same -- legal=o.legal;
92    return *this;
93}*/
94
95/**
96 * Transliterator API.
97 */
98Transliterator* NameUnicodeTransliterator::clone(void) const {
99    return new NameUnicodeTransliterator(*this);
100}
101
102/**
103 * Implements {@link Transliterator#handleTransliterate}.
104 */
105void NameUnicodeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
106                                                    UBool isIncremental) const {
107    // The failure mode, here and below, is to behave like Any-Null,
108    // if either there is no name data (max len == 0) or there is no
109    // memory (malloc() => NULL).
110
111    int32_t maxLen = uprv_getMaxCharNameLength();
112    if (maxLen == 0) {
113        offsets.start = offsets.limit;
114        return;
115    }
116
117    // Accomodate the longest possible name
118    ++maxLen; // allow for temporary trailing space
119    char* cbuf = (char*) uprv_malloc(maxLen);
120    if (cbuf == NULL) {
121        offsets.start = offsets.limit;
122        return;
123    }
124
125    UnicodeString openPat(TRUE, OPEN, -1);
126    UnicodeString str, name;
127
128    int32_t cursor = offsets.start;
129    int32_t limit = offsets.limit;
130
131    // Modes:
132    // 0 - looking for open delimiter
133    // 1 - after open delimiter
134    int32_t mode = 0;
135    int32_t openPos = -1; // open delim candidate pos
136
137    UChar32 c;
138    while (cursor < limit) {
139        c = text.char32At(cursor);
140
141        switch (mode) {
142        case 0: // looking for open delimiter
143            if (c == OPEN_DELIM) { // quick check first
144                openPos = cursor;
145                int32_t i =
146                    ICU_Utility::parsePattern(openPat, text, cursor, limit);
147                if (i >= 0 && i < limit) {
148                    mode = 1;
149                    name.truncate(0);
150                    cursor = i;
151                    continue; // *** reprocess char32At(cursor)
152                }
153            }
154            break;
155
156        case 1: // after open delimiter
157            // Look for legal chars.  If \s+ is found, convert it
158            // to a single space.  If closeDelimiter is found, exit
159            // the loop.  If any other character is found, exit the
160            // loop.  If the limit is reached, exit the loop.
161
162            // Convert \s+ => SPACE.  This assumes there are no
163            // runs of >1 space characters in names.
164            if (uprv_isRuleWhiteSpace(c)) {
165                // Ignore leading whitespace
166                if (name.length() > 0 &&
167                    name.charAt(name.length()-1) != SPACE) {
168                    name.append(SPACE);
169                    // If we are too long then abort.  maxLen includes
170                    // temporary trailing space, so use '>'.
171                    if (name.length() > maxLen) {
172                        mode = 0;
173                    }
174                }
175                break;
176            }
177
178            if (c == CLOSE_DELIM) {
179                int32_t len = name.length();
180
181                // Delete trailing space, if any
182                if (len > 0 &&
183                    name.charAt(len-1) == SPACE) {
184                    --len;
185                }
186
187                if (uprv_isInvariantUString(name.getBuffer(), len)) {
188                    name.extract(0, len, cbuf, maxLen, US_INV);
189
190                    UErrorCode status = U_ZERO_ERROR;
191                    c = u_charFromName(U_EXTENDED_CHAR_NAME, cbuf, &status);
192                    if (U_SUCCESS(status)) {
193                        // Lookup succeeded
194
195                        // assert(UTF_CHAR_LENGTH(CLOSE_DELIM) == 1);
196                        cursor++; // advance over CLOSE_DELIM
197
198                        str.truncate(0);
199                        str.append(c);
200                        text.handleReplaceBetween(openPos, cursor, str);
201
202                        // Adjust indices for the change in the length of
203                        // the string.  Do not assume that str.length() ==
204                        // 1, in case of surrogates.
205                        int32_t delta = cursor - openPos - str.length();
206                        cursor -= delta;
207                        limit -= delta;
208                        // assert(cursor == openPos + str.length());
209                    }
210                }
211                // If the lookup failed, we leave things as-is and
212                // still switch to mode 0 and continue.
213                mode = 0;
214                openPos = -1; // close off candidate
215                continue; // *** reprocess char32At(cursor)
216            }
217
218            // Check if c is a legal char.  We assume here that
219            // legal.contains(OPEN_DELIM) is FALSE, so when we abort a
220            // name, we don't have to go back to openPos+1.
221            if (legal.contains(c)) {
222                name.append(c);
223                // If we go past the longest possible name then abort.
224                // maxLen includes temporary trailing space, so use '>='.
225                if (name.length() >= maxLen) {
226                    mode = 0;
227                }
228            }
229
230            // Invalid character
231            else {
232                --cursor; // Backup and reprocess this character
233                mode = 0;
234            }
235
236            break;
237        }
238
239        cursor += UTF_CHAR_LENGTH(c);
240    }
241
242    offsets.contextLimit += limit - offsets.limit;
243    offsets.limit = limit;
244    // In incremental mode, only advance the cursor up to the last
245    // open delimiter candidate.
246    offsets.start = (isIncremental && openPos >= 0) ? openPos : cursor;
247
248    uprv_free(cbuf);
249}
250
251U_NAMESPACE_END
252
253#endif /* #if !UCONFIG_NO_TRANSLITERATION */
254