1/*
2**********************************************************************
3*   Copyright (C) 2012-2013, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5**********************************************************************
6*/
7
8#include "unicode/utypes.h"
9
10#include "unicode/uchar.h"
11#include "unicode/utf16.h"
12
13#include "identifier_info.h"
14#include "mutex.h"
15#include "scriptset.h"
16#include "ucln_in.h"
17#include "uvector.h"
18
19U_NAMESPACE_BEGIN
20
21#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
22
23static UMutex gInitMutex = U_MUTEX_INITIALIZER;
24static UBool gStaticsAreInitialized = FALSE;
25
26UnicodeSet *IdentifierInfo::ASCII;
27ScriptSet *IdentifierInfo::JAPANESE;
28ScriptSet *IdentifierInfo::CHINESE;
29ScriptSet *IdentifierInfo::KOREAN;
30ScriptSet *IdentifierInfo::CONFUSABLE_WITH_LATIN;
31
32UBool IdentifierInfo::cleanup() {
33    delete ASCII;
34    ASCII = NULL;
35    delete JAPANESE;
36    JAPANESE = NULL;
37    delete CHINESE;
38    CHINESE = NULL;
39    delete KOREAN;
40    KOREAN = NULL;
41    delete CONFUSABLE_WITH_LATIN;
42    CONFUSABLE_WITH_LATIN = NULL;
43    gStaticsAreInitialized = FALSE;
44    return TRUE;
45}
46
47U_CDECL_BEGIN
48static UBool U_CALLCONV
49IdentifierInfo_cleanup(void) {
50    return IdentifierInfo::cleanup();
51}
52U_CDECL_END
53
54
55IdentifierInfo::IdentifierInfo(UErrorCode &status):
56         fIdentifier(NULL), fRequiredScripts(NULL), fScriptSetSet(NULL),
57         fCommonAmongAlternates(NULL), fNumerics(NULL), fIdentifierProfile(NULL) {
58    if (U_FAILURE(status)) {
59        return;
60    }
61    {
62        Mutex lock(&gInitMutex);
63        if (!gStaticsAreInitialized) {
64            ASCII    = new UnicodeSet(0, 0x7f);
65            JAPANESE = new ScriptSet();
66            CHINESE  = new ScriptSet();
67            KOREAN   = new ScriptSet();
68            CONFUSABLE_WITH_LATIN = new ScriptSet();
69            if (ASCII == NULL || JAPANESE == NULL || CHINESE == NULL || KOREAN == NULL
70                    || CONFUSABLE_WITH_LATIN == NULL) {
71                status = U_MEMORY_ALLOCATION_ERROR;
72                return;
73            }
74            ASCII->freeze();
75            JAPANESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HIRAGANA, status)
76                     .set(USCRIPT_KATAKANA, status);
77            CHINESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_BOPOMOFO, status);
78            KOREAN->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HANGUL, status);
79            CONFUSABLE_WITH_LATIN->set(USCRIPT_CYRILLIC, status).set(USCRIPT_GREEK, status)
80                      .set(USCRIPT_CHEROKEE, status);
81            ucln_i18n_registerCleanup(UCLN_I18N_IDENTIFIER_INFO, IdentifierInfo_cleanup);
82            gStaticsAreInitialized = TRUE;
83        }
84    }
85    fIdentifier = new UnicodeString();
86    fRequiredScripts = new ScriptSet();
87    fScriptSetSet = uhash_open(uhash_hashScriptSet, uhash_compareScriptSet, NULL, &status);
88    uhash_setKeyDeleter(fScriptSetSet, uhash_deleteScriptSet);
89    fCommonAmongAlternates = new ScriptSet();
90    fNumerics = new UnicodeSet();
91    fIdentifierProfile = new UnicodeSet(0, 0x10FFFF);
92
93    if (U_SUCCESS(status) && (fIdentifier == NULL || fRequiredScripts == NULL || fScriptSetSet == NULL ||
94                              fCommonAmongAlternates == NULL || fNumerics == NULL || fIdentifierProfile == NULL)) {
95        status = U_MEMORY_ALLOCATION_ERROR;
96    }
97}
98
99IdentifierInfo::~IdentifierInfo() {
100    delete fIdentifier;
101    delete fRequiredScripts;
102    uhash_close(fScriptSetSet);
103    delete fCommonAmongAlternates;
104    delete fNumerics;
105    delete fIdentifierProfile;
106}
107
108
109IdentifierInfo &IdentifierInfo::clear() {
110    fRequiredScripts->resetAll();
111    uhash_removeAll(fScriptSetSet);
112    fNumerics->clear();
113    fCommonAmongAlternates->resetAll();
114    return *this;
115}
116
117
118IdentifierInfo &IdentifierInfo::setIdentifierProfile(const UnicodeSet &identifierProfile) {
119    *fIdentifierProfile = identifierProfile;
120    return *this;
121}
122
123
124const UnicodeSet &IdentifierInfo::getIdentifierProfile() const {
125    return *fIdentifierProfile;
126}
127
128
129IdentifierInfo &IdentifierInfo::setIdentifier(const UnicodeString &identifier, UErrorCode &status) {
130    if (U_FAILURE(status)) {
131        return *this;
132    }
133    *fIdentifier = identifier;
134    clear();
135    ScriptSet scriptsForCP;
136    UChar32 cp;
137    for (int32_t i = 0; i < identifier.length(); i += U16_LENGTH(cp)) {
138        cp = identifier.char32At(i);
139        // Store a representative character for each kind of decimal digit
140        if (u_charType(cp) == U_DECIMAL_DIGIT_NUMBER) {
141            // Just store the zero character as a representative for comparison. Unicode guarantees it is cp - value
142            fNumerics->add(cp - (UChar32)u_getNumericValue(cp));
143        }
144        UScriptCode extensions[500];
145        int32_t extensionsCount = uscript_getScriptExtensions(cp, extensions, LENGTHOF(extensions), &status);
146        if (U_FAILURE(status)) {
147            return *this;
148        }
149        scriptsForCP.resetAll();
150        for (int32_t j=0; j<extensionsCount; j++) {
151            scriptsForCP.set(extensions[j], status);
152        }
153        scriptsForCP.reset(USCRIPT_COMMON, status);
154        scriptsForCP.reset(USCRIPT_INHERITED, status);
155        switch (scriptsForCP.countMembers()) {
156          case 0: break;
157          case 1:
158            // Single script, record it.
159            fRequiredScripts->Union(scriptsForCP);
160            break;
161          default:
162            if (!fRequiredScripts->intersects(scriptsForCP)
163                    && !uhash_geti(fScriptSetSet, &scriptsForCP)) {
164                // If the set hasn't been added already, add it
165                //    (Add a copy, fScriptSetSet takes ownership of the copy.)
166                uhash_puti(fScriptSetSet, new ScriptSet(scriptsForCP), 1, &status);
167            }
168            break;
169        }
170    }
171    // Now make a final pass through ScriptSetSet to remove alternates that came before singles.
172    // [Kana], [Kana Hira] => [Kana]
173    // This is relatively infrequent, so doesn't have to be optimized.
174    // We also compute any commonalities among the alternates.
175    if (uhash_count(fScriptSetSet) > 0) {
176        fCommonAmongAlternates->setAll();
177        for (int32_t it = -1;;) {
178            const UHashElement *nextHashEl = uhash_nextElement(fScriptSetSet, &it);
179            if (nextHashEl == NULL) {
180                break;
181            }
182            ScriptSet *next = static_cast<ScriptSet *>(nextHashEl->key.pointer);
183            // [Kana], [Kana Hira] => [Kana]
184            if (fRequiredScripts->intersects(*next)) {
185                uhash_removeElement(fScriptSetSet, nextHashEl);
186            } else {
187                fCommonAmongAlternates->intersect(*next);
188                // [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]]
189                for (int32_t otherIt = -1;;) {
190                    const UHashElement *otherHashEl = uhash_nextElement(fScriptSetSet, &otherIt);
191                    if (otherHashEl == NULL) {
192                        break;
193                    }
194                    ScriptSet *other = static_cast<ScriptSet *>(otherHashEl->key.pointer);
195                    if (next != other && next->contains(*other)) {
196                        uhash_removeElement(fScriptSetSet, nextHashEl);
197                        break;
198                    }
199                }
200            }
201        }
202    }
203    if (uhash_count(fScriptSetSet) == 0) {
204        fCommonAmongAlternates->resetAll();
205    }
206    return *this;
207}
208
209
210const UnicodeString *IdentifierInfo::getIdentifier() const {
211    return fIdentifier;
212}
213
214const ScriptSet *IdentifierInfo::getScripts() const {
215    return fRequiredScripts;
216}
217
218const UHashtable *IdentifierInfo::getAlternates() const {
219    return fScriptSetSet;
220}
221
222
223const UnicodeSet *IdentifierInfo::getNumerics() const {
224    return fNumerics;
225}
226
227const ScriptSet *IdentifierInfo::getCommonAmongAlternates() const {
228    return fCommonAmongAlternates;
229}
230
231#if !UCONFIG_NO_NORMALIZATION
232
233URestrictionLevel IdentifierInfo::getRestrictionLevel(UErrorCode &status) const {
234    if (!fIdentifierProfile->containsAll(*fIdentifier) || getNumerics()->size() > 1) {
235        return USPOOF_UNRESTRICTIVE;
236    }
237    if (ASCII->containsAll(*fIdentifier)) {
238        return USPOOF_ASCII;
239    }
240    // This is a bit tricky. We look at a number of factors.
241    // The number of scripts in the text.
242    // Plus 1 if there is some commonality among the alternates (eg [Arab Thaa]; [Arab Syrc])
243    // Plus number of alternates otherwise (this only works because we only test cardinality up to 2.)
244
245    // Note: the requiredScripts set omits COMMON and INHERITED; they are taken out at the
246    //       time it is created, in setIdentifier().
247    int32_t cardinalityPlus = fRequiredScripts->countMembers() +
248            (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1);
249    if (cardinalityPlus < 2) {
250        return USPOOF_HIGHLY_RESTRICTIVE;
251    }
252    if (containsWithAlternates(*JAPANESE, *fRequiredScripts) || containsWithAlternates(*CHINESE, *fRequiredScripts)
253            || containsWithAlternates(*KOREAN, *fRequiredScripts)) {
254        return USPOOF_HIGHLY_RESTRICTIVE;
255    }
256    if (cardinalityPlus == 2 &&
257            fRequiredScripts->test(USCRIPT_LATIN, status) &&
258            !fRequiredScripts->intersects(*CONFUSABLE_WITH_LATIN)) {
259        return USPOOF_MODERATELY_RESTRICTIVE;
260    }
261    return USPOOF_MINIMALLY_RESTRICTIVE;
262}
263
264#endif /* !UCONFIG_NO_NORMALIZATION */
265
266int32_t IdentifierInfo::getScriptCount() const {
267    // Note: Common and Inherited scripts were removed by setIdentifier(), and do not appear in fRequiredScripts.
268    int32_t count = fRequiredScripts->countMembers() +
269            (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1);
270    return count;
271}
272
273
274
275UBool IdentifierInfo::containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const {
276    if (!container.contains(containee)) {
277        return FALSE;
278    }
279    for (int32_t iter = -1; ;) {
280        const UHashElement *hashEl = uhash_nextElement(fScriptSetSet, &iter);
281        if (hashEl == NULL) {
282            break;
283        }
284        ScriptSet *alternatives = static_cast<ScriptSet *>(hashEl->key.pointer);
285        if (!container.intersects(*alternatives)) {
286            return false;
287        }
288    }
289    return true;
290}
291
292UnicodeString &IdentifierInfo::displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status) {
293    UVector sorted(status);
294    if (U_FAILURE(status)) {
295        return dest;
296    }
297    for (int32_t pos = -1; ;) {
298        const UHashElement *el = uhash_nextElement(alternates, &pos);
299        if (el == NULL) {
300            break;
301        }
302        ScriptSet *ss = static_cast<ScriptSet *>(el->key.pointer);
303        sorted.addElement(ss, status);
304    }
305    sorted.sort(uhash_compareScriptSet, status);
306    UnicodeString separator = UNICODE_STRING_SIMPLE("; ");
307    for (int32_t i=0; i<sorted.size(); i++) {
308        if (i>0) {
309            dest.append(separator);
310        }
311        ScriptSet *ss = static_cast<ScriptSet *>(sorted.elementAt(i));
312        ss->displayScripts(dest);
313    }
314    return dest;
315}
316
317U_NAMESPACE_END
318
319