1/*
2**********************************************************************
3*   Copyright (C) 2012-2014, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5**********************************************************************
6*/
7
8#include "unicode/utypes.h"
9
10#include "unicode/uchar.h"
11#include "unicode/utf16.h"
12
13#include "identifier_info.h"
14#include "mutex.h"
15#include "scriptset.h"
16#include "ucln_in.h"
17#include "uvector.h"
18
19U_NAMESPACE_BEGIN
20
// Lazily created constants shared by every IdentifierInfo instance.
// They are allocated in IdentifierInfo_init() and released in IdentifierInfo_cleanup().
static UnicodeSet *ASCII;                 // Code points 0..0x7f; frozen once created.
static ScriptSet *JAPANESE;               // Latin + Han + Hiragana + Katakana.
static ScriptSet *CHINESE;                // Latin + Han + Bopomofo.
static ScriptSet *KOREAN;                 // Latin + Han + Hangul.
static ScriptSet *CONFUSABLE_WITH_LATIN;  // Cyrillic + Greek + Cherokee.
// Passed to umtx_initOnce() by the IdentifierInfo constructor to run IdentifierInfo_init().
static UInitOnce gIdentifierInfoInitOnce = U_INITONCE_INITIALIZER;
27
28
29U_CDECL_BEGIN
30static UBool U_CALLCONV
31IdentifierInfo_cleanup(void) {
32    delete ASCII;
33    ASCII = NULL;
34    delete JAPANESE;
35    JAPANESE = NULL;
36    delete CHINESE;
37    CHINESE = NULL;
38    delete KOREAN;
39    KOREAN = NULL;
40    delete CONFUSABLE_WITH_LATIN;
41    CONFUSABLE_WITH_LATIN = NULL;
42    gIdentifierInfoInitOnce.reset();
43    return TRUE;
44}
45
46static void U_CALLCONV
47IdentifierInfo_init(UErrorCode &status) {
48    ASCII    = new UnicodeSet(0, 0x7f);
49    JAPANESE = new ScriptSet();
50    CHINESE  = new ScriptSet();
51    KOREAN   = new ScriptSet();
52    CONFUSABLE_WITH_LATIN = new ScriptSet();
53    if (ASCII == NULL || JAPANESE == NULL || CHINESE == NULL || KOREAN == NULL
54            || CONFUSABLE_WITH_LATIN == NULL) {
55        status = U_MEMORY_ALLOCATION_ERROR;
56        return;
57    }
58    ASCII->freeze();
59    JAPANESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HIRAGANA, status)
60             .set(USCRIPT_KATAKANA, status);
61    CHINESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_BOPOMOFO, status);
62    KOREAN->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HANGUL, status);
63    CONFUSABLE_WITH_LATIN->set(USCRIPT_CYRILLIC, status).set(USCRIPT_GREEK, status)
64              .set(USCRIPT_CHEROKEE, status);
65    ucln_i18n_registerCleanup(UCLN_I18N_IDENTIFIER_INFO, IdentifierInfo_cleanup);
66}
67U_CDECL_END
68
69
70IdentifierInfo::IdentifierInfo(UErrorCode &status):
71         fIdentifier(NULL), fRequiredScripts(NULL), fScriptSetSet(NULL),
72         fCommonAmongAlternates(NULL), fNumerics(NULL), fIdentifierProfile(NULL) {
73    umtx_initOnce(gIdentifierInfoInitOnce, &IdentifierInfo_init, status);
74    if (U_FAILURE(status)) {
75        return;
76    }
77
78    fIdentifier = new UnicodeString();
79    fRequiredScripts = new ScriptSet();
80    fScriptSetSet = uhash_open(uhash_hashScriptSet, uhash_compareScriptSet, NULL, &status);
81    uhash_setKeyDeleter(fScriptSetSet, uhash_deleteScriptSet);
82    fCommonAmongAlternates = new ScriptSet();
83    fNumerics = new UnicodeSet();
84    fIdentifierProfile = new UnicodeSet(0, 0x10FFFF);
85
86    if (U_SUCCESS(status) && (fIdentifier == NULL || fRequiredScripts == NULL || fScriptSetSet == NULL ||
87                              fCommonAmongAlternates == NULL || fNumerics == NULL || fIdentifierProfile == NULL)) {
88        status = U_MEMORY_ALLOCATION_ERROR;
89    }
90}
91
92IdentifierInfo::~IdentifierInfo() {
93    delete fIdentifier;
94    delete fRequiredScripts;
95    uhash_close(fScriptSetSet);
96    delete fCommonAmongAlternates;
97    delete fNumerics;
98    delete fIdentifierProfile;
99}
100
101
102IdentifierInfo &IdentifierInfo::clear() {
103    fRequiredScripts->resetAll();
104    uhash_removeAll(fScriptSetSet);
105    fNumerics->clear();
106    fCommonAmongAlternates->resetAll();
107    return *this;
108}
109
110
111IdentifierInfo &IdentifierInfo::setIdentifierProfile(const UnicodeSet &identifierProfile) {
112    *fIdentifierProfile = identifierProfile;
113    return *this;
114}
115
116
117const UnicodeSet &IdentifierInfo::getIdentifierProfile() const {
118    return *fIdentifierProfile;
119}
120
121
122IdentifierInfo &IdentifierInfo::setIdentifier(const UnicodeString &identifier, UErrorCode &status) {
123    if (U_FAILURE(status)) {
124        return *this;
125    }
126    *fIdentifier = identifier;
127    clear();
128    ScriptSet scriptsForCP;
129    UChar32 cp;
130    for (int32_t i = 0; i < identifier.length(); i += U16_LENGTH(cp)) {
131        cp = identifier.char32At(i);
132        // Store a representative character for each kind of decimal digit
133        if (u_charType(cp) == U_DECIMAL_DIGIT_NUMBER) {
134            // Just store the zero character as a representative for comparison. Unicode guarantees it is cp - value
135            fNumerics->add(cp - (UChar32)u_getNumericValue(cp));
136        }
137        UScriptCode extensions[500];
138        int32_t extensionsCount = uscript_getScriptExtensions(cp, extensions, UPRV_LENGTHOF(extensions), &status);
139        if (U_FAILURE(status)) {
140            return *this;
141        }
142        scriptsForCP.resetAll();
143        for (int32_t j=0; j<extensionsCount; j++) {
144            scriptsForCP.set(extensions[j], status);
145        }
146        scriptsForCP.reset(USCRIPT_COMMON, status);
147        scriptsForCP.reset(USCRIPT_INHERITED, status);
148        switch (scriptsForCP.countMembers()) {
149          case 0: break;
150          case 1:
151            // Single script, record it.
152            fRequiredScripts->Union(scriptsForCP);
153            break;
154          default:
155            if (!fRequiredScripts->intersects(scriptsForCP)
156                    && !uhash_geti(fScriptSetSet, &scriptsForCP)) {
157                // If the set hasn't been added already, add it
158                //    (Add a copy, fScriptSetSet takes ownership of the copy.)
159                uhash_puti(fScriptSetSet, new ScriptSet(scriptsForCP), 1, &status);
160            }
161            break;
162        }
163    }
164    // Now make a final pass through ScriptSetSet to remove alternates that came before singles.
165    // [Kana], [Kana Hira] => [Kana]
166    // This is relatively infrequent, so doesn't have to be optimized.
167    // We also compute any commonalities among the alternates.
168    if (uhash_count(fScriptSetSet) > 0) {
169        fCommonAmongAlternates->setAll();
170        for (int32_t it = UHASH_FIRST;;) {
171            const UHashElement *nextHashEl = uhash_nextElement(fScriptSetSet, &it);
172            if (nextHashEl == NULL) {
173                break;
174            }
175            ScriptSet *next = static_cast<ScriptSet *>(nextHashEl->key.pointer);
176            // [Kana], [Kana Hira] => [Kana]
177            if (fRequiredScripts->intersects(*next)) {
178                uhash_removeElement(fScriptSetSet, nextHashEl);
179            } else {
180                fCommonAmongAlternates->intersect(*next);
181                // [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]]
182                for (int32_t otherIt = UHASH_FIRST;;) {
183                    const UHashElement *otherHashEl = uhash_nextElement(fScriptSetSet, &otherIt);
184                    if (otherHashEl == NULL) {
185                        break;
186                    }
187                    ScriptSet *other = static_cast<ScriptSet *>(otherHashEl->key.pointer);
188                    if (next != other && next->contains(*other)) {
189                        uhash_removeElement(fScriptSetSet, nextHashEl);
190                        break;
191                    }
192                }
193            }
194        }
195    }
196    if (uhash_count(fScriptSetSet) == 0) {
197        fCommonAmongAlternates->resetAll();
198    }
199    return *this;
200}
201
202
203const UnicodeString *IdentifierInfo::getIdentifier() const {
204    return fIdentifier;
205}
206
207const ScriptSet *IdentifierInfo::getScripts() const {
208    return fRequiredScripts;
209}
210
211const UHashtable *IdentifierInfo::getAlternates() const {
212    return fScriptSetSet;
213}
214
215
216const UnicodeSet *IdentifierInfo::getNumerics() const {
217    return fNumerics;
218}
219
220const ScriptSet *IdentifierInfo::getCommonAmongAlternates() const {
221    return fCommonAmongAlternates;
222}
223
224#if !UCONFIG_NO_NORMALIZATION
225
226URestrictionLevel IdentifierInfo::getRestrictionLevel(UErrorCode &status) const {
227    if (!fIdentifierProfile->containsAll(*fIdentifier) || getNumerics()->size() > 1) {
228        return USPOOF_UNRESTRICTIVE;
229    }
230    if (ASCII->containsAll(*fIdentifier)) {
231        return USPOOF_ASCII;
232    }
233    // This is a bit tricky. We look at a number of factors.
234    // The number of scripts in the text.
235    // Plus 1 if there is some commonality among the alternates (eg [Arab Thaa]; [Arab Syrc])
236    // Plus number of alternates otherwise (this only works because we only test cardinality up to 2.)
237
238    // Note: the requiredScripts set omits COMMON and INHERITED; they are taken out at the
239    //       time it is created, in setIdentifier().
240    int32_t cardinalityPlus = fRequiredScripts->countMembers() +
241            (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1);
242    if (cardinalityPlus < 2) {
243        return USPOOF_SINGLE_SCRIPT_RESTRICTIVE;
244    }
245    if (containsWithAlternates(*JAPANESE, *fRequiredScripts) || containsWithAlternates(*CHINESE, *fRequiredScripts)
246            || containsWithAlternates(*KOREAN, *fRequiredScripts)) {
247        return USPOOF_HIGHLY_RESTRICTIVE;
248    }
249    if (cardinalityPlus == 2 &&
250            fRequiredScripts->test(USCRIPT_LATIN, status) &&
251            !fRequiredScripts->intersects(*CONFUSABLE_WITH_LATIN)) {
252        return USPOOF_MODERATELY_RESTRICTIVE;
253    }
254    return USPOOF_MINIMALLY_RESTRICTIVE;
255}
256
257#endif /* !UCONFIG_NO_NORMALIZATION */
258
259int32_t IdentifierInfo::getScriptCount() const {
260    // Note: Common and Inherited scripts were removed by setIdentifier(), and do not appear in fRequiredScripts.
261    int32_t count = fRequiredScripts->countMembers() +
262            (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1);
263    return count;
264}
265
266
267
268UBool IdentifierInfo::containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const {
269    if (!container.contains(containee)) {
270        return FALSE;
271    }
272    for (int32_t iter = UHASH_FIRST; ;) {
273        const UHashElement *hashEl = uhash_nextElement(fScriptSetSet, &iter);
274        if (hashEl == NULL) {
275            break;
276        }
277        ScriptSet *alternatives = static_cast<ScriptSet *>(hashEl->key.pointer);
278        if (!container.intersects(*alternatives)) {
279            return false;
280        }
281    }
282    return true;
283}
284
285UnicodeString &IdentifierInfo::displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status) {
286    UVector sorted(status);
287    if (U_FAILURE(status)) {
288        return dest;
289    }
290    for (int32_t pos = UHASH_FIRST; ;) {
291        const UHashElement *el = uhash_nextElement(alternates, &pos);
292        if (el == NULL) {
293            break;
294        }
295        ScriptSet *ss = static_cast<ScriptSet *>(el->key.pointer);
296        sorted.addElement(ss, status);
297    }
298    sorted.sort(uhash_compareScriptSet, status);
299    UnicodeString separator = UNICODE_STRING_SIMPLE("; ");
300    for (int32_t i=0; i<sorted.size(); i++) {
301        if (i>0) {
302            dest.append(separator);
303        }
304        ScriptSet *ss = static_cast<ScriptSet *>(sorted.elementAt(i));
305        ss->displayScripts(dest);
306    }
307    return dest;
308}
309
310U_NAMESPACE_END
311
312