identifier_info.cpp revision c73f511526464f8e56c242df80552e9b0d94ae3d
1/*
2**********************************************************************
3*   Copyright (C) 2012-2014, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5**********************************************************************
6*/
7
8#include "unicode/utypes.h"
9
10#include "unicode/uchar.h"
11#include "unicode/utf16.h"
12
13#include "identifier_info.h"
14#include "mutex.h"
15#include "scriptset.h"
16#include "ucln_in.h"
17#include "uvector.h"
18
19U_NAMESPACE_BEGIN
20
21#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
22
23static UnicodeSet *ASCII;
24static ScriptSet *JAPANESE;
25static ScriptSet *CHINESE;
26static ScriptSet *KOREAN;
27static ScriptSet *CONFUSABLE_WITH_LATIN;
28static UInitOnce gIdentifierInfoInitOnce = U_INITONCE_INITIALIZER;
29
30
31U_CDECL_BEGIN
32static UBool U_CALLCONV
33IdentifierInfo_cleanup(void) {
34    delete ASCII;
35    ASCII = NULL;
36    delete JAPANESE;
37    JAPANESE = NULL;
38    delete CHINESE;
39    CHINESE = NULL;
40    delete KOREAN;
41    KOREAN = NULL;
42    delete CONFUSABLE_WITH_LATIN;
43    CONFUSABLE_WITH_LATIN = NULL;
44    gIdentifierInfoInitOnce.reset();
45    return TRUE;
46}
47
48static void U_CALLCONV
49IdentifierInfo_init(UErrorCode &status) {
50    ASCII    = new UnicodeSet(0, 0x7f);
51    JAPANESE = new ScriptSet();
52    CHINESE  = new ScriptSet();
53    KOREAN   = new ScriptSet();
54    CONFUSABLE_WITH_LATIN = new ScriptSet();
55    if (ASCII == NULL || JAPANESE == NULL || CHINESE == NULL || KOREAN == NULL
56            || CONFUSABLE_WITH_LATIN == NULL) {
57        status = U_MEMORY_ALLOCATION_ERROR;
58        return;
59    }
60    ASCII->freeze();
61    JAPANESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HIRAGANA, status)
62             .set(USCRIPT_KATAKANA, status);
63    CHINESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_BOPOMOFO, status);
64    KOREAN->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HANGUL, status);
65    CONFUSABLE_WITH_LATIN->set(USCRIPT_CYRILLIC, status).set(USCRIPT_GREEK, status)
66              .set(USCRIPT_CHEROKEE, status);
67    ucln_i18n_registerCleanup(UCLN_I18N_IDENTIFIER_INFO, IdentifierInfo_cleanup);
68}
69U_CDECL_END
70
71
72IdentifierInfo::IdentifierInfo(UErrorCode &status):
73         fIdentifier(NULL), fRequiredScripts(NULL), fScriptSetSet(NULL),
74         fCommonAmongAlternates(NULL), fNumerics(NULL), fIdentifierProfile(NULL) {
75    umtx_initOnce(gIdentifierInfoInitOnce, &IdentifierInfo_init, status);
76    if (U_FAILURE(status)) {
77        return;
78    }
79
80    fIdentifier = new UnicodeString();
81    fRequiredScripts = new ScriptSet();
82    fScriptSetSet = uhash_open(uhash_hashScriptSet, uhash_compareScriptSet, NULL, &status);
83    uhash_setKeyDeleter(fScriptSetSet, uhash_deleteScriptSet);
84    fCommonAmongAlternates = new ScriptSet();
85    fNumerics = new UnicodeSet();
86    fIdentifierProfile = new UnicodeSet(0, 0x10FFFF);
87
88    if (U_SUCCESS(status) && (fIdentifier == NULL || fRequiredScripts == NULL || fScriptSetSet == NULL ||
89                              fCommonAmongAlternates == NULL || fNumerics == NULL || fIdentifierProfile == NULL)) {
90        status = U_MEMORY_ALLOCATION_ERROR;
91    }
92}
93
94IdentifierInfo::~IdentifierInfo() {
95    delete fIdentifier;
96    delete fRequiredScripts;
97    uhash_close(fScriptSetSet);
98    delete fCommonAmongAlternates;
99    delete fNumerics;
100    delete fIdentifierProfile;
101}
102
103
104IdentifierInfo &IdentifierInfo::clear() {
105    fRequiredScripts->resetAll();
106    uhash_removeAll(fScriptSetSet);
107    fNumerics->clear();
108    fCommonAmongAlternates->resetAll();
109    return *this;
110}
111
112
113IdentifierInfo &IdentifierInfo::setIdentifierProfile(const UnicodeSet &identifierProfile) {
114    *fIdentifierProfile = identifierProfile;
115    return *this;
116}
117
118
119const UnicodeSet &IdentifierInfo::getIdentifierProfile() const {
120    return *fIdentifierProfile;
121}
122
123
124IdentifierInfo &IdentifierInfo::setIdentifier(const UnicodeString &identifier, UErrorCode &status) {
125    if (U_FAILURE(status)) {
126        return *this;
127    }
128    *fIdentifier = identifier;
129    clear();
130    ScriptSet scriptsForCP;
131    UChar32 cp;
132    for (int32_t i = 0; i < identifier.length(); i += U16_LENGTH(cp)) {
133        cp = identifier.char32At(i);
134        // Store a representative character for each kind of decimal digit
135        if (u_charType(cp) == U_DECIMAL_DIGIT_NUMBER) {
136            // Just store the zero character as a representative for comparison. Unicode guarantees it is cp - value
137            fNumerics->add(cp - (UChar32)u_getNumericValue(cp));
138        }
139        UScriptCode extensions[500];
140        int32_t extensionsCount = uscript_getScriptExtensions(cp, extensions, LENGTHOF(extensions), &status);
141        if (U_FAILURE(status)) {
142            return *this;
143        }
144        scriptsForCP.resetAll();
145        for (int32_t j=0; j<extensionsCount; j++) {
146            scriptsForCP.set(extensions[j], status);
147        }
148        scriptsForCP.reset(USCRIPT_COMMON, status);
149        scriptsForCP.reset(USCRIPT_INHERITED, status);
150        switch (scriptsForCP.countMembers()) {
151          case 0: break;
152          case 1:
153            // Single script, record it.
154            fRequiredScripts->Union(scriptsForCP);
155            break;
156          default:
157            if (!fRequiredScripts->intersects(scriptsForCP)
158                    && !uhash_geti(fScriptSetSet, &scriptsForCP)) {
159                // If the set hasn't been added already, add it
160                //    (Add a copy, fScriptSetSet takes ownership of the copy.)
161                uhash_puti(fScriptSetSet, new ScriptSet(scriptsForCP), 1, &status);
162            }
163            break;
164        }
165    }
166    // Now make a final pass through ScriptSetSet to remove alternates that came before singles.
167    // [Kana], [Kana Hira] => [Kana]
168    // This is relatively infrequent, so doesn't have to be optimized.
169    // We also compute any commonalities among the alternates.
170    if (uhash_count(fScriptSetSet) > 0) {
171        fCommonAmongAlternates->setAll();
172        for (int32_t it = -1;;) {
173            const UHashElement *nextHashEl = uhash_nextElement(fScriptSetSet, &it);
174            if (nextHashEl == NULL) {
175                break;
176            }
177            ScriptSet *next = static_cast<ScriptSet *>(nextHashEl->key.pointer);
178            // [Kana], [Kana Hira] => [Kana]
179            if (fRequiredScripts->intersects(*next)) {
180                uhash_removeElement(fScriptSetSet, nextHashEl);
181            } else {
182                fCommonAmongAlternates->intersect(*next);
183                // [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]]
184                for (int32_t otherIt = -1;;) {
185                    const UHashElement *otherHashEl = uhash_nextElement(fScriptSetSet, &otherIt);
186                    if (otherHashEl == NULL) {
187                        break;
188                    }
189                    ScriptSet *other = static_cast<ScriptSet *>(otherHashEl->key.pointer);
190                    if (next != other && next->contains(*other)) {
191                        uhash_removeElement(fScriptSetSet, nextHashEl);
192                        break;
193                    }
194                }
195            }
196        }
197    }
198    if (uhash_count(fScriptSetSet) == 0) {
199        fCommonAmongAlternates->resetAll();
200    }
201    return *this;
202}
203
204
205const UnicodeString *IdentifierInfo::getIdentifier() const {
206    return fIdentifier;
207}
208
209const ScriptSet *IdentifierInfo::getScripts() const {
210    return fRequiredScripts;
211}
212
213const UHashtable *IdentifierInfo::getAlternates() const {
214    return fScriptSetSet;
215}
216
217
218const UnicodeSet *IdentifierInfo::getNumerics() const {
219    return fNumerics;
220}
221
222const ScriptSet *IdentifierInfo::getCommonAmongAlternates() const {
223    return fCommonAmongAlternates;
224}
225
226#if !UCONFIG_NO_NORMALIZATION
227
228URestrictionLevel IdentifierInfo::getRestrictionLevel(UErrorCode &status) const {
229    if (!fIdentifierProfile->containsAll(*fIdentifier) || getNumerics()->size() > 1) {
230        return USPOOF_UNRESTRICTIVE;
231    }
232    if (ASCII->containsAll(*fIdentifier)) {
233        return USPOOF_ASCII;
234    }
235    // This is a bit tricky. We look at a number of factors.
236    // The number of scripts in the text.
237    // Plus 1 if there is some commonality among the alternates (eg [Arab Thaa]; [Arab Syrc])
238    // Plus number of alternates otherwise (this only works because we only test cardinality up to 2.)
239
240    // Note: the requiredScripts set omits COMMON and INHERITED; they are taken out at the
241    //       time it is created, in setIdentifier().
242    int32_t cardinalityPlus = fRequiredScripts->countMembers() +
243            (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1);
244    if (cardinalityPlus < 2) {
245        return USPOOF_SINGLE_SCRIPT_RESTRICTIVE;
246    }
247    if (containsWithAlternates(*JAPANESE, *fRequiredScripts) || containsWithAlternates(*CHINESE, *fRequiredScripts)
248            || containsWithAlternates(*KOREAN, *fRequiredScripts)) {
249        return USPOOF_HIGHLY_RESTRICTIVE;
250    }
251    if (cardinalityPlus == 2 &&
252            fRequiredScripts->test(USCRIPT_LATIN, status) &&
253            !fRequiredScripts->intersects(*CONFUSABLE_WITH_LATIN)) {
254        return USPOOF_MODERATELY_RESTRICTIVE;
255    }
256    return USPOOF_MINIMALLY_RESTRICTIVE;
257}
258
259#endif /* !UCONFIG_NO_NORMALIZATION */
260
261int32_t IdentifierInfo::getScriptCount() const {
262    // Note: Common and Inherited scripts were removed by setIdentifier(), and do not appear in fRequiredScripts.
263    int32_t count = fRequiredScripts->countMembers() +
264            (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1);
265    return count;
266}
267
268
269
270UBool IdentifierInfo::containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const {
271    if (!container.contains(containee)) {
272        return FALSE;
273    }
274    for (int32_t iter = -1; ;) {
275        const UHashElement *hashEl = uhash_nextElement(fScriptSetSet, &iter);
276        if (hashEl == NULL) {
277            break;
278        }
279        ScriptSet *alternatives = static_cast<ScriptSet *>(hashEl->key.pointer);
280        if (!container.intersects(*alternatives)) {
281            return false;
282        }
283    }
284    return true;
285}
286
287UnicodeString &IdentifierInfo::displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status) {
288    UVector sorted(status);
289    if (U_FAILURE(status)) {
290        return dest;
291    }
292    for (int32_t pos = -1; ;) {
293        const UHashElement *el = uhash_nextElement(alternates, &pos);
294        if (el == NULL) {
295            break;
296        }
297        ScriptSet *ss = static_cast<ScriptSet *>(el->key.pointer);
298        sorted.addElement(ss, status);
299    }
300    sorted.sort(uhash_compareScriptSet, status);
301    UnicodeString separator = UNICODE_STRING_SIMPLE("; ");
302    for (int32_t i=0; i<sorted.size(); i++) {
303        if (i>0) {
304            dest.append(separator);
305        }
306        ScriptSet *ss = static_cast<ScriptSet *>(sorted.elementAt(i));
307        ss->displayScripts(dest);
308    }
309    return dest;
310}
311
312U_NAMESPACE_END
313
314