1/* 2********************************************************************** 3* Copyright (C) 2012-2013, International Business Machines 4* Corporation and others. All Rights Reserved. 5********************************************************************** 6*/ 7 8#include "unicode/utypes.h" 9 10#include "unicode/uchar.h" 11#include "unicode/utf16.h" 12 13#include "identifier_info.h" 14#include "mutex.h" 15#include "scriptset.h" 16#include "ucln_in.h" 17#include "uvector.h" 18 19U_NAMESPACE_BEGIN 20 21#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 22 23static UMutex gInitMutex = U_MUTEX_INITIALIZER; 24static UBool gStaticsAreInitialized = FALSE; 25 26UnicodeSet *IdentifierInfo::ASCII; 27ScriptSet *IdentifierInfo::JAPANESE; 28ScriptSet *IdentifierInfo::CHINESE; 29ScriptSet *IdentifierInfo::KOREAN; 30ScriptSet *IdentifierInfo::CONFUSABLE_WITH_LATIN; 31 32UBool IdentifierInfo::cleanup() { 33 delete ASCII; 34 ASCII = NULL; 35 delete JAPANESE; 36 JAPANESE = NULL; 37 delete CHINESE; 38 CHINESE = NULL; 39 delete KOREAN; 40 KOREAN = NULL; 41 delete CONFUSABLE_WITH_LATIN; 42 CONFUSABLE_WITH_LATIN = NULL; 43 gStaticsAreInitialized = FALSE; 44 return TRUE; 45} 46 47U_CDECL_BEGIN 48static UBool U_CALLCONV 49IdentifierInfo_cleanup(void) { 50 return IdentifierInfo::cleanup(); 51} 52U_CDECL_END 53 54 55IdentifierInfo::IdentifierInfo(UErrorCode &status): 56 fIdentifier(NULL), fRequiredScripts(NULL), fScriptSetSet(NULL), 57 fCommonAmongAlternates(NULL), fNumerics(NULL), fIdentifierProfile(NULL) { 58 if (U_FAILURE(status)) { 59 return; 60 } 61 { 62 Mutex lock(&gInitMutex); 63 if (!gStaticsAreInitialized) { 64 ASCII = new UnicodeSet(0, 0x7f); 65 JAPANESE = new ScriptSet(); 66 CHINESE = new ScriptSet(); 67 KOREAN = new ScriptSet(); 68 CONFUSABLE_WITH_LATIN = new ScriptSet(); 69 if (ASCII == NULL || JAPANESE == NULL || CHINESE == NULL || KOREAN == NULL 70 || CONFUSABLE_WITH_LATIN == NULL) { 71 status = U_MEMORY_ALLOCATION_ERROR; 72 return; 73 } 74 ASCII->freeze(); 75 JAPANESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HIRAGANA, status) 76 .set(USCRIPT_KATAKANA, status); 77 CHINESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_BOPOMOFO, status); 78 KOREAN->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HANGUL, status); 79 CONFUSABLE_WITH_LATIN->set(USCRIPT_CYRILLIC, status).set(USCRIPT_GREEK, status) 80 .set(USCRIPT_CHEROKEE, status); 81 ucln_i18n_registerCleanup(UCLN_I18N_IDENTIFIER_INFO, IdentifierInfo_cleanup); 82 gStaticsAreInitialized = TRUE; 83 } 84 } 85 fIdentifier = new UnicodeString(); 86 fRequiredScripts = new ScriptSet(); 87 fScriptSetSet = uhash_open(uhash_hashScriptSet, uhash_compareScriptSet, NULL, &status); 88 uhash_setKeyDeleter(fScriptSetSet, uhash_deleteScriptSet); 89 fCommonAmongAlternates = new ScriptSet(); 90 fNumerics = new UnicodeSet(); 91 fIdentifierProfile = new UnicodeSet(0, 0x10FFFF); 92 93 if (U_SUCCESS(status) && (fIdentifier == NULL || fRequiredScripts == NULL || fScriptSetSet == NULL || 94 fCommonAmongAlternates == NULL || fNumerics == NULL || fIdentifierProfile == NULL)) { 95 status = U_MEMORY_ALLOCATION_ERROR; 96 } 97} 98 99IdentifierInfo::~IdentifierInfo() { 100 delete fIdentifier; 101 delete fRequiredScripts; 102 uhash_close(fScriptSetSet); 103 delete fCommonAmongAlternates; 104 delete fNumerics; 105 delete fIdentifierProfile; 106} 107 108 109IdentifierInfo &IdentifierInfo::clear() { 110 fRequiredScripts->resetAll(); 111 uhash_removeAll(fScriptSetSet); 112 fNumerics->clear(); 113 fCommonAmongAlternates->resetAll(); 114 return *this; 115} 116 117 118IdentifierInfo &IdentifierInfo::setIdentifierProfile(const UnicodeSet &identifierProfile) { 119 *fIdentifierProfile = identifierProfile; 120 return *this; 121} 122 123 124const UnicodeSet &IdentifierInfo::getIdentifierProfile() const { 125 return *fIdentifierProfile; 126} 127 128 129IdentifierInfo &IdentifierInfo::setIdentifier(const UnicodeString &identifier, UErrorCode &status) { 130 if (U_FAILURE(status)) { 131 return *this; 132 } 133 *fIdentifier = identifier; 134 clear(); 135 ScriptSet scriptsForCP; 136 UChar32 cp; 137 for (int32_t i = 0; i < identifier.length(); i += U16_LENGTH(cp)) { 138 cp = identifier.char32At(i); 139 // Store a representative character for each kind of decimal digit 140 if (u_charType(cp) == U_DECIMAL_DIGIT_NUMBER) { 141 // Just store the zero character as a representative for comparison. Unicode guarantees it is cp - value 142 fNumerics->add(cp - (UChar32)u_getNumericValue(cp)); 143 } 144 UScriptCode extensions[500]; 145 int32_t extensionsCount = uscript_getScriptExtensions(cp, extensions, LENGTHOF(extensions), &status); 146 if (U_FAILURE(status)) { 147 return *this; 148 } 149 scriptsForCP.resetAll(); 150 for (int32_t j=0; j<extensionsCount; j++) { 151 scriptsForCP.set(extensions[j], status); 152 } 153 scriptsForCP.reset(USCRIPT_COMMON, status); 154 scriptsForCP.reset(USCRIPT_INHERITED, status); 155 switch (scriptsForCP.countMembers()) { 156 case 0: break; 157 case 1: 158 // Single script, record it. 159 fRequiredScripts->Union(scriptsForCP); 160 break; 161 default: 162 if (!fRequiredScripts->intersects(scriptsForCP) 163 && !uhash_geti(fScriptSetSet, &scriptsForCP)) { 164 // If the set hasn't been added already, add it 165 // (Add a copy, fScriptSetSet takes ownership of the copy.) 166 uhash_puti(fScriptSetSet, new ScriptSet(scriptsForCP), 1, &status); 167 } 168 break; 169 } 170 } 171 // Now make a final pass through ScriptSetSet to remove alternates that came before singles. 172 // [Kana], [Kana Hira] => [Kana] 173 // This is relatively infrequent, so doesn't have to be optimized. 174 // We also compute any commonalities among the alternates. 175 if (uhash_count(fScriptSetSet) > 0) { 176 fCommonAmongAlternates->setAll(); 177 for (int32_t it = -1;;) { 178 const UHashElement *nextHashEl = uhash_nextElement(fScriptSetSet, &it); 179 if (nextHashEl == NULL) { 180 break; 181 } 182 ScriptSet *next = static_cast<ScriptSet *>(nextHashEl->key.pointer); 183 // [Kana], [Kana Hira] => [Kana] 184 if (fRequiredScripts->intersects(*next)) { 185 uhash_removeElement(fScriptSetSet, nextHashEl); 186 } else { 187 fCommonAmongAlternates->intersect(*next); 188 // [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]] 189 for (int32_t otherIt = -1;;) { 190 const UHashElement *otherHashEl = uhash_nextElement(fScriptSetSet, &otherIt); 191 if (otherHashEl == NULL) { 192 break; 193 } 194 ScriptSet *other = static_cast<ScriptSet *>(otherHashEl->key.pointer); 195 if (next != other && next->contains(*other)) { 196 uhash_removeElement(fScriptSetSet, nextHashEl); 197 break; 198 } 199 } 200 } 201 } 202 } 203 if (uhash_count(fScriptSetSet) == 0) { 204 fCommonAmongAlternates->resetAll(); 205 } 206 return *this; 207} 208 209 210const UnicodeString *IdentifierInfo::getIdentifier() const { 211 return fIdentifier; 212} 213 214const ScriptSet *IdentifierInfo::getScripts() const { 215 return fRequiredScripts; 216} 217 218const UHashtable *IdentifierInfo::getAlternates() const { 219 return fScriptSetSet; 220} 221 222 223const UnicodeSet *IdentifierInfo::getNumerics() const { 224 return fNumerics; 225} 226 227const ScriptSet *IdentifierInfo::getCommonAmongAlternates() const { 228 return fCommonAmongAlternates; 229} 230 231#if !UCONFIG_NO_NORMALIZATION 232 233URestrictionLevel IdentifierInfo::getRestrictionLevel(UErrorCode &status) const { 234 if (!fIdentifierProfile->containsAll(*fIdentifier) || getNumerics()->size() > 1) { 235 return USPOOF_UNRESTRICTIVE; 236 } 237 if (ASCII->containsAll(*fIdentifier)) { 238 return USPOOF_ASCII; 239 } 240 // This is a bit tricky. We look at a number of factors. 241 // The number of scripts in the text. 242 // Plus 1 if there is some commonality among the alternates (eg [Arab Thaa]; [Arab Syrc]) 243 // Plus number of alternates otherwise (this only works because we only test cardinality up to 2.) 244 245 // Note: the requiredScripts set omits COMMON and INHERITED; they are taken out at the 246 // time it is created, in setIdentifier(). 247 int32_t cardinalityPlus = fRequiredScripts->countMembers() + 248 (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1); 249 if (cardinalityPlus < 2) { 250 return USPOOF_HIGHLY_RESTRICTIVE; 251 } 252 if (containsWithAlternates(*JAPANESE, *fRequiredScripts) || containsWithAlternates(*CHINESE, *fRequiredScripts) 253 || containsWithAlternates(*KOREAN, *fRequiredScripts)) { 254 return USPOOF_HIGHLY_RESTRICTIVE; 255 } 256 if (cardinalityPlus == 2 && 257 fRequiredScripts->test(USCRIPT_LATIN, status) && 258 !fRequiredScripts->intersects(*CONFUSABLE_WITH_LATIN)) { 259 return USPOOF_MODERATELY_RESTRICTIVE; 260 } 261 return USPOOF_MINIMALLY_RESTRICTIVE; 262} 263 264#endif /* !UCONFIG_NO_NORMALIZATION */ 265 266int32_t IdentifierInfo::getScriptCount() const { 267 // Note: Common and Inherited scripts were removed by setIdentifier(), and do not appear in fRequiredScripts. 268 int32_t count = fRequiredScripts->countMembers() + 269 (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1); 270 return count; 271} 272 273 274 275UBool IdentifierInfo::containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const { 276 if (!container.contains(containee)) { 277 return FALSE; 278 } 279 for (int32_t iter = -1; ;) { 280 const UHashElement *hashEl = uhash_nextElement(fScriptSetSet, &iter); 281 if (hashEl == NULL) { 282 break; 283 } 284 ScriptSet *alternatives = static_cast<ScriptSet *>(hashEl->key.pointer); 285 if (!container.intersects(*alternatives)) { 286 return false; 287 } 288 } 289 return true; 290} 291 292UnicodeString &IdentifierInfo::displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status) { 293 UVector sorted(status); 294 if (U_FAILURE(status)) { 295 return dest; 296 } 297 for (int32_t pos = -1; ;) { 298 const UHashElement *el = uhash_nextElement(alternates, &pos); 299 if (el == NULL) { 300 break; 301 } 302 ScriptSet *ss = static_cast<ScriptSet *>(el->key.pointer); 303 sorted.addElement(ss, status); 304 } 305 sorted.sort(uhash_compareScriptSet, status); 306 UnicodeString separator = UNICODE_STRING_SIMPLE("; "); 307 for (int32_t i=0; i<sorted.size(); i++) { 308 if (i>0) { 309 dest.append(separator); 310 } 311 ScriptSet *ss = static_cast<ScriptSet *>(sorted.elementAt(i)); 312 ss->displayScripts(dest); 313 } 314 return dest; 315} 316 317U_NAMESPACE_END 318 319