name2uni.cpp revision ac04d0bbe12b3ef54518635711412f178cb4d16
1/* 2********************************************************************** 3* Copyright (C) 2001-2007, International Business Machines 4* Corporation and others. All Rights Reserved. 5********************************************************************** 6* Date Name Description 7* 06/07/01 aliu Creation. 8********************************************************************** 9*/ 10 11#include "unicode/utypes.h" 12 13#if !UCONFIG_NO_TRANSLITERATION 14 15#include "unicode/unifilt.h" 16#include "unicode/uchar.h" 17#include "unicode/uniset.h" 18#include "name2uni.h" 19#include "cmemory.h" 20#include "uprops.h" 21#include "uinvchar.h" 22#include "util.h" 23 24U_NAMESPACE_BEGIN 25 26UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NameUnicodeTransliterator) 27 28static const UChar OPEN[] = {92,78,126,123,126,0}; // "\N~{~" 29static const UChar OPEN_DELIM = 92; // '\\' first char of OPEN 30static const UChar CLOSE_DELIM = 125; // '}' 31static const UChar SPACE = 32; // ' ' 32 33U_CDECL_BEGIN 34 35// USetAdder implementation 36// Does not use uset.h to reduce code dependencies 37static void U_CALLCONV 38_set_add(USet *set, UChar32 c) { 39 uset_add(set, c); 40} 41 42// These functions aren't used. 43/*static void U_CALLCONV 44_set_addRange(USet *set, UChar32 start, UChar32 end) { 45 ((UnicodeSet *)set)->add(start, end); 46} 47 48static void U_CALLCONV 49_set_addString(USet *set, const UChar *str, int32_t length) { 50 ((UnicodeSet *)set)->add(UnicodeString((UBool)(length<0), str, length)); 51}*/ 52 53U_CDECL_END 54 55/** 56 * Constructs a transliterator with the default delimiters '{' and 57 * '}'. 58 */ 59NameUnicodeTransliterator::NameUnicodeTransliterator(UnicodeFilter* adoptedFilter) : 60 Transliterator(UNICODE_STRING("Name-Any", 8), adoptedFilter) { 61 62 UnicodeSet *legalPtr = &legal; 63 // Get the legal character set 64 USetAdder sa = { 65 (USet *)legalPtr, // USet* == UnicodeSet* 66 _set_add, 67 NULL, // Don't need _set_addRange 68 NULL, // Don't need _set_addString 69 NULL // Don't need remove() 70 }; 71 uprv_getCharNameCharacters(&sa); 72} 73 74/** 75 * Destructor. 76 */ 77NameUnicodeTransliterator::~NameUnicodeTransliterator() {} 78 79/** 80 * Copy constructor. 81 */ 82NameUnicodeTransliterator::NameUnicodeTransliterator(const NameUnicodeTransliterator& o) : 83 Transliterator(o), legal(o.legal) {} 84 85/** 86 * Assignment operator. 87 */ 88/*NameUnicodeTransliterator& NameUnicodeTransliterator::operator=( 89 const NameUnicodeTransliterator& o) { 90 Transliterator::operator=(o); 91 // not necessary: the legal sets should all be the same -- legal=o.legal; 92 return *this; 93}*/ 94 95/** 96 * Transliterator API. 97 */ 98Transliterator* NameUnicodeTransliterator::clone(void) const { 99 return new NameUnicodeTransliterator(*this); 100} 101 102/** 103 * Implements {@link Transliterator#handleTransliterate}. 104 */ 105void NameUnicodeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets, 106 UBool isIncremental) const { 107 // The failure mode, here and below, is to behave like Any-Null, 108 // if either there is no name data (max len == 0) or there is no 109 // memory (malloc() => NULL). 110 111 int32_t maxLen = uprv_getMaxCharNameLength(); 112 if (maxLen == 0) { 113 offsets.start = offsets.limit; 114 return; 115 } 116 117 // Accomodate the longest possible name 118 ++maxLen; // allow for temporary trailing space 119 char* cbuf = (char*) uprv_malloc(maxLen); 120 if (cbuf == NULL) { 121 offsets.start = offsets.limit; 122 return; 123 } 124 125 UnicodeString openPat(TRUE, OPEN, -1); 126 UnicodeString str, name; 127 128 int32_t cursor = offsets.start; 129 int32_t limit = offsets.limit; 130 131 // Modes: 132 // 0 - looking for open delimiter 133 // 1 - after open delimiter 134 int32_t mode = 0; 135 int32_t openPos = -1; // open delim candidate pos 136 137 UChar32 c; 138 while (cursor < limit) { 139 c = text.char32At(cursor); 140 141 switch (mode) { 142 case 0: // looking for open delimiter 143 if (c == OPEN_DELIM) { // quick check first 144 openPos = cursor; 145 int32_t i = 146 ICU_Utility::parsePattern(openPat, text, cursor, limit); 147 if (i >= 0 && i < limit) { 148 mode = 1; 149 name.truncate(0); 150 cursor = i; 151 continue; // *** reprocess char32At(cursor) 152 } 153 } 154 break; 155 156 case 1: // after open delimiter 157 // Look for legal chars. If \s+ is found, convert it 158 // to a single space. If closeDelimiter is found, exit 159 // the loop. If any other character is found, exit the 160 // loop. If the limit is reached, exit the loop. 161 162 // Convert \s+ => SPACE. This assumes there are no 163 // runs of >1 space characters in names. 164 if (uprv_isRuleWhiteSpace(c)) { 165 // Ignore leading whitespace 166 if (name.length() > 0 && 167 name.charAt(name.length()-1) != SPACE) { 168 name.append(SPACE); 169 // If we are too long then abort. maxLen includes 170 // temporary trailing space, so use '>'. 171 if (name.length() > maxLen) { 172 mode = 0; 173 } 174 } 175 break; 176 } 177 178 if (c == CLOSE_DELIM) { 179 int32_t len = name.length(); 180 181 // Delete trailing space, if any 182 if (len > 0 && 183 name.charAt(len-1) == SPACE) { 184 --len; 185 } 186 187 if (uprv_isInvariantUString(name.getBuffer(), len)) { 188 name.extract(0, len, cbuf, maxLen, US_INV); 189 190 UErrorCode status = U_ZERO_ERROR; 191 c = u_charFromName(U_EXTENDED_CHAR_NAME, cbuf, &status); 192 if (U_SUCCESS(status)) { 193 // Lookup succeeded 194 195 // assert(UTF_CHAR_LENGTH(CLOSE_DELIM) == 1); 196 cursor++; // advance over CLOSE_DELIM 197 198 str.truncate(0); 199 str.append(c); 200 text.handleReplaceBetween(openPos, cursor, str); 201 202 // Adjust indices for the change in the length of 203 // the string. Do not assume that str.length() == 204 // 1, in case of surrogates. 205 int32_t delta = cursor - openPos - str.length(); 206 cursor -= delta; 207 limit -= delta; 208 // assert(cursor == openPos + str.length()); 209 } 210 } 211 // If the lookup failed, we leave things as-is and 212 // still switch to mode 0 and continue. 213 mode = 0; 214 openPos = -1; // close off candidate 215 continue; // *** reprocess char32At(cursor) 216 } 217 218 // Check if c is a legal char. We assume here that 219 // legal.contains(OPEN_DELIM) is FALSE, so when we abort a 220 // name, we don't have to go back to openPos+1. 221 if (legal.contains(c)) { 222 name.append(c); 223 // If we go past the longest possible name then abort. 224 // maxLen includes temporary trailing space, so use '>='. 225 if (name.length() >= maxLen) { 226 mode = 0; 227 } 228 } 229 230 // Invalid character 231 else { 232 --cursor; // Backup and reprocess this character 233 mode = 0; 234 } 235 236 break; 237 } 238 239 cursor += UTF_CHAR_LENGTH(c); 240 } 241 242 offsets.contextLimit += limit - offsets.limit; 243 offsets.limit = limit; 244 // In incremental mode, only advance the cursor up to the last 245 // open delimiter candidate. 246 offsets.start = (isIncremental && openPos >= 0) ? openPos : cursor; 247 248 uprv_free(cbuf); 249} 250 251U_NAMESPACE_END 252 253#endif /* #if !UCONFIG_NO_TRANSLITERATION */ 254