1/*
2*****************************************************************
3* Copyright (c) 2002-2014, International Business Machines Corporation
4* and others.  All Rights Reserved.
5*****************************************************************
6* Date        Name        Description
7* 06/06/2002  aliu        Creation.
8*****************************************************************
9*/
10
11#include "unicode/utypes.h"
12
13#if !UCONFIG_NO_TRANSLITERATION
14
15#include "unicode/uobject.h"
16#include "unicode/uscript.h"
17
18#include "anytrans.h"
19#include "hash.h"
20#include "mutex.h"
21#include "nultrans.h"
22#include "putilimp.h"
23#include "tridpars.h"
24#include "uinvchar.h"
25#include "uvector.h"
26
27//------------------------------------------------------------
28// Constants
29
// The strings below are spelled out as numeric UTF-16 code units; the
// trailing comment on each line shows the text that the numbers encode.
static const UChar TARGET_SEP = 45; // '-'   placed between a source name and the target when building an ID
static const UChar VARIANT_SEP = 47; // '/'  placed between the target and a non-empty variant
static const UChar ANY[] = {65,110,121,0}; // "Any"  (3 code units plus terminator)
static const UChar NULL_ID[] = {78,117,108,108,0}; // "Null"  (4 code units plus terminator)
static const UChar LATIN_PIVOT[] = {45,76,97,116,105,110,59,76,97,116,105,110,45,0}; // "-Latin;Latin-"  inserted between source and target for the Latin pivot ID
35
36//------------------------------------------------------------
37
38U_CDECL_BEGIN
39/**
40 * Deleter function for Transliterator*.
41 */
42static void U_CALLCONV
43_deleteTransliterator(void *obj) {
44    delete (icu::Transliterator*) obj;
45}
46U_CDECL_END
47
48//------------------------------------------------------------
49
50U_NAMESPACE_BEGIN
51
52//------------------------------------------------------------
53// ScriptRunIterator
54
/**
 * Returns a series of ranges corresponding to scripts. They will be
 * of the form:
 *
 * ccccSScSSccccTTcTcccc   - c = common, S = first script, T = second
 * |            |          - first run (start, limit)
 *          |           |  - second run (start, limit)
 *
 * That is, the runs will overlap. The reason for this is so that a
 * transliterator can consider common characters both before and after
 * the scripts.
 *
 * Usage: construct over a range, then call next() repeatedly; after
 * each call that returns TRUE, read scriptCode, start and limit.  If
 * the caller changes the length of the text inside the current run,
 * it reports the change through adjustLimit().
 */
class ScriptRunIterator : public UMemory {
private:
    // The text being scanned; held by reference, not copied.
    const Replaceable& text;
    // Lower bound (inclusive) of the range being scanned.
    int32_t textStart;
    // Upper bound (exclusive) of the range being scanned; moved by
    // adjustLimit().
    int32_t textLimit;

public:
    /**
     * The code of the current run, valid after next() returns.  It is
     * USCRIPT_INVALID_CODE when no character with a specific script
     * (i.e. other than COMMON/INHERITED) was found while scanning
     * forward for the run.
     */
    UScriptCode scriptCode;

    /**
     * The start of the run, inclusive, valid after next() returns.
     */
    int32_t start;

    /**
     * The end of the run, exclusive, valid after next() returns.
     * The following call to next() resumes scanning from this index.
     */
    int32_t limit;

    /**
     * Constructs a run iterator over the given text from start
     * (inclusive) to limit (exclusive).
     */
    ScriptRunIterator(const Replaceable& text, int32_t start, int32_t limit);

    /**
     * Advances to the next run.  Returns TRUE if a run was produced,
     * in which case the caller should examine scriptCode, start, and
     * limit.  Returns FALSE once the previous run ended at the text
     * limit; for an empty range this happens on the very first call.
     */
    UBool next();

    /**
     * Adjusts internal indices for a change in the limit index of the
     * given delta.  A positive delta means the limit has increased.
     * Both the current run limit and the overall text limit are
     * shifted by delta.
     */
    void adjustLimit(int32_t delta);

private:
    // Declared but never defined: copying an iterator is not supported.
    ScriptRunIterator(const ScriptRunIterator &other); // forbid copying of this class
    ScriptRunIterator &operator=(const ScriptRunIterator &other); // forbid copying of this class
};
114
115ScriptRunIterator::ScriptRunIterator(const Replaceable& theText,
116                                     int32_t myStart, int32_t myLimit) :
117    text(theText)
118{
119    textStart = myStart;
120    textLimit = myLimit;
121    limit = myStart;
122}
123
124UBool ScriptRunIterator::next() {
125    UChar32 ch;
126    UScriptCode s;
127    UErrorCode ec = U_ZERO_ERROR;
128
129    scriptCode = USCRIPT_INVALID_CODE; // don't know script yet
130    start = limit;
131
132    // Are we done?
133    if (start == textLimit) {
134        return FALSE;
135    }
136
137    // Move start back to include adjacent COMMON or INHERITED
138    // characters
139    while (start > textStart) {
140        ch = text.char32At(start - 1); // look back
141        s = uscript_getScript(ch, &ec);
142        if (s == USCRIPT_COMMON || s == USCRIPT_INHERITED) {
143            --start;
144        } else {
145            break;
146        }
147    }
148
149    // Move limit ahead to include COMMON, INHERITED, and characters
150    // of the current script.
151    while (limit < textLimit) {
152        ch = text.char32At(limit); // look ahead
153        s = uscript_getScript(ch, &ec);
154        if (s != USCRIPT_COMMON && s != USCRIPT_INHERITED) {
155            if (scriptCode == USCRIPT_INVALID_CODE) {
156                scriptCode = s;
157            } else if (s != scriptCode) {
158                break;
159            }
160        }
161        ++limit;
162    }
163
164    // Return TRUE even if the entire text is COMMON / INHERITED, in
165    // which case scriptCode will be USCRIPT_INVALID_CODE.
166    return TRUE;
167}
168
169void ScriptRunIterator::adjustLimit(int32_t delta) {
170    limit += delta;
171    textLimit += delta;
172}
173
174//------------------------------------------------------------
175// AnyTransliterator
176
177UOBJECT_DEFINE_RTTI_IMPLEMENTATION(AnyTransliterator)
178
179AnyTransliterator::AnyTransliterator(const UnicodeString& id,
180                                     const UnicodeString& theTarget,
181                                     const UnicodeString& theVariant,
182                                     UScriptCode theTargetScript,
183                                     UErrorCode& ec) :
184    Transliterator(id, NULL),
185    targetScript(theTargetScript)
186{
187    cache = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &ec);
188    if (U_FAILURE(ec)) {
189        return;
190    }
191    uhash_setValueDeleter(cache, _deleteTransliterator);
192
193    target = theTarget;
194    if (theVariant.length() > 0) {
195        target.append(VARIANT_SEP).append(theVariant);
196    }
197}
198
/**
 * Destructor.  Closes the per-script transliterator cache.
 * NOTE(review): the constructors install _deleteTransliterator as the
 * cache's value deleter, so closing presumably also destroys every
 * cached transliterator -- confirm against uhash_close.
 */
AnyTransliterator::~AnyTransliterator() {
    uhash_close(cache);
}
202
203/**
204 * Copy constructor.
205 */
206AnyTransliterator::AnyTransliterator(const AnyTransliterator& o) :
207    Transliterator(o),
208    target(o.target),
209    targetScript(o.targetScript)
210{
211    // Don't copy the cache contents
212    UErrorCode ec = U_ZERO_ERROR;
213    cache = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &ec);
214    if (U_FAILURE(ec)) {
215        return;
216    }
217    uhash_setValueDeleter(cache, _deleteTransliterator);
218}
219
/**
 * Transliterator API.  Returns a newly allocated copy of this object
 * made with the copy constructor, so the copy does not share (or
 * inherit the contents of) this object's transliterator cache.
 */
Transliterator* AnyTransliterator::clone() const {
    return new AnyTransliterator(*this);
}
226
227/**
228 * Implements {@link Transliterator#handleTransliterate}.
229 */
230void AnyTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
231                                            UBool isIncremental) const {
232    int32_t allStart = pos.start;
233    int32_t allLimit = pos.limit;
234
235    ScriptRunIterator it(text, pos.contextStart, pos.contextLimit);
236
237    while (it.next()) {
238        // Ignore runs in the ante context
239        if (it.limit <= allStart) continue;
240
241        // Try to instantiate transliterator from it.scriptCode to
242        // our target or target/variant
243        Transliterator* t = getTransliterator(it.scriptCode);
244
245        if (t == NULL) {
246            // We have no transliterator.  Do nothing, but keep
247            // pos.start up to date.
248            pos.start = it.limit;
249            continue;
250        }
251
252        // If the run end is before the transliteration limit, do
253        // a non-incremental transliteration.  Otherwise do an
254        // incremental one.
255        UBool incremental = isIncremental && (it.limit >= allLimit);
256
257        pos.start = uprv_max(allStart, it.start);
258        pos.limit = uprv_min(allLimit, it.limit);
259        int32_t limit = pos.limit;
260        t->filteredTransliterate(text, pos, incremental);
261        int32_t delta = pos.limit - limit;
262        allLimit += delta;
263        it.adjustLimit(delta);
264
265        // We're done if we enter the post context
266        if (it.limit >= allLimit) break;
267    }
268
269    // Restore limit.  pos.start is fine where the last transliterator
270    // left it, or at the end of the last run.
271    pos.limit = allLimit;
272}
273
274Transliterator* AnyTransliterator::getTransliterator(UScriptCode source) const {
275
276    if (source == targetScript || source == USCRIPT_INVALID_CODE) {
277        return NULL;
278    }
279
280    Transliterator* t = NULL;
281    {
282        Mutex m(NULL);
283        t = (Transliterator*) uhash_iget(cache, (int32_t) source);
284    }
285    if (t == NULL) {
286        UErrorCode ec = U_ZERO_ERROR;
287        UnicodeString sourceName(uscript_getName(source), -1, US_INV);
288        UnicodeString id(sourceName);
289        id.append(TARGET_SEP).append(target);
290
291        t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
292        if (U_FAILURE(ec) || t == NULL) {
293            delete t;
294
295            // Try to pivot around Latin, our most common script
296            id = sourceName;
297            id.append(LATIN_PIVOT, -1).append(target);
298            t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
299            if (U_FAILURE(ec) || t == NULL) {
300                delete t;
301                t = NULL;
302            }
303        }
304
305        if (t != NULL) {
306            Transliterator *rt = NULL;
307            {
308                Mutex m(NULL);
309                rt = static_cast<Transliterator *> (uhash_iget(cache, (int32_t) source));
310                if (rt == NULL) {
311                    // Common case, no race to cache this new transliterator.
312                    uhash_iput(cache, (int32_t) source, t, &ec);
313                } else {
314                    // Race case, some other thread beat us to caching this transliterator.
315                    Transliterator *temp = rt;
316                    rt = t;    // Our newly created transliterator that lost the race & now needs deleting.
317                    t  = temp; // The transliterator from the cache that we will return.
318                }
319            }
320            delete rt;    // will be non-null only in case of races.
321        }
322    }
323    return t;
324}
325
326/**
327 * Return the script code for a given name, or -1 if not found.
328 */
329static UScriptCode scriptNameToCode(const UnicodeString& name) {
330    char buf[128];
331    UScriptCode code;
332    UErrorCode ec = U_ZERO_ERROR;
333    int32_t nameLen = name.length();
334    UBool isInvariant = uprv_isInvariantUString(name.getBuffer(), nameLen);
335
336    if (isInvariant) {
337        name.extract(0, nameLen, buf, (int32_t)sizeof(buf), US_INV);
338        buf[127] = 0;   // Make sure that we NULL terminate the string.
339    }
340    if (!isInvariant || uscript_getCode(buf, &code, 1, &ec) != 1 || U_FAILURE(ec))
341    {
342        code = USCRIPT_INVALID_CODE;
343    }
344    return code;
345}
346
347/**
348 * Registers standard transliterators with the system.  Called by
349 * Transliterator during initialization.  Scan all current targets and
350 * register those that are scripts T as Any-T/V.
351 */
352void AnyTransliterator::registerIDs() {
353
354    UErrorCode ec = U_ZERO_ERROR;
355    Hashtable seen(TRUE, ec);
356
357    int32_t sourceCount = Transliterator::_countAvailableSources();
358    for (int32_t s=0; s<sourceCount; ++s) {
359        UnicodeString source;
360        Transliterator::_getAvailableSource(s, source);
361
362        // Ignore the "Any" source
363        if (source.caseCompare(ANY, 3, 0 /*U_FOLD_CASE_DEFAULT*/) == 0) continue;
364
365        int32_t targetCount = Transliterator::_countAvailableTargets(source);
366        for (int32_t t=0; t<targetCount; ++t) {
367            UnicodeString target;
368            Transliterator::_getAvailableTarget(t, source, target);
369
370            // Only process each target once
371            if (seen.geti(target) != 0) continue;
372            ec = U_ZERO_ERROR;
373            seen.puti(target, 1, ec);
374
375            // Get the script code for the target.  If not a script, ignore.
376            UScriptCode targetScript = scriptNameToCode(target);
377            if (targetScript == USCRIPT_INVALID_CODE) continue;
378
379            int32_t variantCount = Transliterator::_countAvailableVariants(source, target);
380            // assert(variantCount >= 1);
381            for (int32_t v=0; v<variantCount; ++v) {
382                UnicodeString variant;
383                Transliterator::_getAvailableVariant(v, source, target, variant);
384
385                UnicodeString id;
386                TransliteratorIDParser::STVtoID(UnicodeString(TRUE, ANY, 3), target, variant, id);
387                ec = U_ZERO_ERROR;
388                AnyTransliterator* t = new AnyTransliterator(id, target, variant,
389                                                             targetScript, ec);
390                if (U_FAILURE(ec)) {
391                    delete t;
392                } else {
393                    Transliterator::_registerInstance(t);
394                    Transliterator::_registerSpecialInverse(target, UnicodeString(TRUE, NULL_ID, 4), FALSE);
395                }
396            }
397        }
398    }
399}
400
401U_NAMESPACE_END
402
403#endif /* #if !UCONFIG_NO_TRANSLITERATION */
404
405//eof
406