1/*
2*****************************************************************
3* Copyright (c) 2002-2011, International Business Machines Corporation
4* and others.  All Rights Reserved.
5*****************************************************************
6* Date        Name        Description
7* 06/06/2002  aliu        Creation.
8*****************************************************************
9*/
10
11#include "unicode/utypes.h"
12
13#if !UCONFIG_NO_TRANSLITERATION
14
15#include "unicode/uobject.h"
16#include "unicode/uscript.h"
17#include "nultrans.h"
18#include "anytrans.h"
19#include "uvector.h"
20#include "tridpars.h"
21#include "hash.h"
22#include "putilimp.h"
23#include "uinvchar.h"
24
25//------------------------------------------------------------
26// Constants
27
28static const UChar TARGET_SEP = 45; // '-'
29static const UChar VARIANT_SEP = 47; // '/'
30static const UChar ANY[] = {65,110,121,0}; // "Any"
31static const UChar NULL_ID[] = {78,117,108,108,0}; // "Null"
32static const UChar LATIN_PIVOT[] = {45,76,97,116,105,110,59,76,97,116,105,110,45,0}; // "-Latin;Latin-"
33
34//------------------------------------------------------------
35
36U_CDECL_BEGIN
37/**
38 * Deleter function for Transliterator*.
39 */
40static void U_CALLCONV
41_deleteTransliterator(void *obj) {
42    delete (icu::Transliterator*) obj;
43}
44U_CDECL_END
45
46//------------------------------------------------------------
47
48U_NAMESPACE_BEGIN
49
50//------------------------------------------------------------
51// ScriptRunIterator
52
53/**
54 * Returns a series of ranges corresponding to scripts. They will be
55 * of the form:
56 *
57 * ccccSScSSccccTTcTcccc   - c = common, S = first script, T = second
58 * |            |          - first run (start, limit)
59 *          |           |  - second run (start, limit)
60 *
61 * That is, the runs will overlap. The reason for this is so that a
62 * transliterator can consider common characters both before and after
63 * the scripts.
64 */
65class ScriptRunIterator : public UMemory {
66private:
67    const Replaceable& text;
68    int32_t textStart;
69    int32_t textLimit;
70
71public:
72    /**
73     * The code of the current run, valid after next() returns.  May
74     * be USCRIPT_INVALID_CODE if and only if the entire text is
75     * COMMON/INHERITED.
76     */
77    UScriptCode scriptCode;
78
79    /**
80     * The start of the run, inclusive, valid after next() returns.
81     */
82    int32_t start;
83
84    /**
85     * The end of the run, exclusive, valid after next() returns.
86     */
87    int32_t limit;
88
89    /**
90     * Constructs a run iterator over the given text from start
91     * (inclusive) to limit (exclusive).
92     */
93    ScriptRunIterator(const Replaceable& text, int32_t start, int32_t limit);
94
95    /**
96     * Returns TRUE if there are any more runs.  TRUE is always
97     * returned at least once.  Upon return, the caller should
98     * examine scriptCode, start, and limit.
99     */
100    UBool next();
101
102    /**
103     * Adjusts internal indices for a change in the limit index of the
104     * given delta.  A positive delta means the limit has increased.
105     */
106    void adjustLimit(int32_t delta);
107
108private:
109    ScriptRunIterator(const ScriptRunIterator &other); // forbid copying of this class
110    ScriptRunIterator &operator=(const ScriptRunIterator &other); // forbid copying of this class
111};
112
113ScriptRunIterator::ScriptRunIterator(const Replaceable& theText,
114                                     int32_t myStart, int32_t myLimit) :
115    text(theText)
116{
117    textStart = myStart;
118    textLimit = myLimit;
119    limit = myStart;
120}
121
122UBool ScriptRunIterator::next() {
123    UChar32 ch;
124    UScriptCode s;
125    UErrorCode ec = U_ZERO_ERROR;
126
127    scriptCode = USCRIPT_INVALID_CODE; // don't know script yet
128    start = limit;
129
130    // Are we done?
131    if (start == textLimit) {
132        return FALSE;
133    }
134
135    // Move start back to include adjacent COMMON or INHERITED
136    // characters
137    while (start > textStart) {
138        ch = text.char32At(start - 1); // look back
139        s = uscript_getScript(ch, &ec);
140        if (s == USCRIPT_COMMON || s == USCRIPT_INHERITED) {
141            --start;
142        } else {
143            break;
144        }
145    }
146
147    // Move limit ahead to include COMMON, INHERITED, and characters
148    // of the current script.
149    while (limit < textLimit) {
150        ch = text.char32At(limit); // look ahead
151        s = uscript_getScript(ch, &ec);
152        if (s != USCRIPT_COMMON && s != USCRIPT_INHERITED) {
153            if (scriptCode == USCRIPT_INVALID_CODE) {
154                scriptCode = s;
155            } else if (s != scriptCode) {
156                break;
157            }
158        }
159        ++limit;
160    }
161
162    // Return TRUE even if the entire text is COMMON / INHERITED, in
163    // which case scriptCode will be USCRIPT_INVALID_CODE.
164    return TRUE;
165}
166
167void ScriptRunIterator::adjustLimit(int32_t delta) {
168    limit += delta;
169    textLimit += delta;
170}
171
172//------------------------------------------------------------
173// AnyTransliterator
174
175UOBJECT_DEFINE_RTTI_IMPLEMENTATION(AnyTransliterator)
176
177AnyTransliterator::AnyTransliterator(const UnicodeString& id,
178                                     const UnicodeString& theTarget,
179                                     const UnicodeString& theVariant,
180                                     UScriptCode theTargetScript,
181                                     UErrorCode& ec) :
182    Transliterator(id, NULL),
183    targetScript(theTargetScript)
184{
185    cache = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &ec);
186    if (U_FAILURE(ec)) {
187        return;
188    }
189    uhash_setValueDeleter(cache, _deleteTransliterator);
190
191    target = theTarget;
192    if (theVariant.length() > 0) {
193        target.append(VARIANT_SEP).append(theVariant);
194    }
195}
196
197AnyTransliterator::~AnyTransliterator() {
198    uhash_close(cache);
199}
200
201/**
202 * Copy constructor.
203 */
204AnyTransliterator::AnyTransliterator(const AnyTransliterator& o) :
205    Transliterator(o),
206    target(o.target),
207    targetScript(o.targetScript)
208{
209    // Don't copy the cache contents
210    UErrorCode ec = U_ZERO_ERROR;
211    cache = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &ec);
212    if (U_FAILURE(ec)) {
213        return;
214    }
215    uhash_setValueDeleter(cache, _deleteTransliterator);
216}
217
218/**
219 * Transliterator API.
220 */
221Transliterator* AnyTransliterator::clone() const {
222    return new AnyTransliterator(*this);
223}
224
225/**
226 * Implements {@link Transliterator#handleTransliterate}.
227 */
228void AnyTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
229                                            UBool isIncremental) const {
230    int32_t allStart = pos.start;
231    int32_t allLimit = pos.limit;
232
233    ScriptRunIterator it(text, pos.contextStart, pos.contextLimit);
234
235    while (it.next()) {
236        // Ignore runs in the ante context
237        if (it.limit <= allStart) continue;
238
239        // Try to instantiate transliterator from it.scriptCode to
240        // our target or target/variant
241        Transliterator* t = getTransliterator(it.scriptCode);
242
243        if (t == NULL) {
244            // We have no transliterator.  Do nothing, but keep
245            // pos.start up to date.
246            pos.start = it.limit;
247            continue;
248        }
249
250        // If the run end is before the transliteration limit, do
251        // a non-incremental transliteration.  Otherwise do an
252        // incremental one.
253        UBool incremental = isIncremental && (it.limit >= allLimit);
254
255        pos.start = uprv_max(allStart, it.start);
256        pos.limit = uprv_min(allLimit, it.limit);
257        int32_t limit = pos.limit;
258        t->filteredTransliterate(text, pos, incremental);
259        int32_t delta = pos.limit - limit;
260        allLimit += delta;
261        it.adjustLimit(delta);
262
263        // We're done if we enter the post context
264        if (it.limit >= allLimit) break;
265    }
266
267    // Restore limit.  pos.start is fine where the last transliterator
268    // left it, or at the end of the last run.
269    pos.limit = allLimit;
270}
271
272Transliterator* AnyTransliterator::getTransliterator(UScriptCode source) const {
273
274    if (source == targetScript || source == USCRIPT_INVALID_CODE) {
275        return NULL;
276    }
277
278    Transliterator* t = (Transliterator*) uhash_iget(cache, (int32_t) source);
279    if (t == NULL) {
280        UErrorCode ec = U_ZERO_ERROR;
281        UnicodeString sourceName(uscript_getName(source), -1, US_INV);
282        UnicodeString id(sourceName);
283        id.append(TARGET_SEP).append(target);
284
285        t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
286        if (U_FAILURE(ec) || t == NULL) {
287            delete t;
288
289            // Try to pivot around Latin, our most common script
290            id = sourceName;
291            id.append(LATIN_PIVOT, -1).append(target);
292            t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
293            if (U_FAILURE(ec) || t == NULL) {
294                delete t;
295                t = NULL;
296            }
297        }
298
299        if (t != NULL) {
300            uhash_iput(cache, (int32_t) source, t, &ec);
301        }
302    }
303
304    return t;
305}
306
307/**
308 * Return the script code for a given name, or -1 if not found.
309 */
310static UScriptCode scriptNameToCode(const UnicodeString& name) {
311    char buf[128];
312    UScriptCode code;
313    UErrorCode ec = U_ZERO_ERROR;
314    int32_t nameLen = name.length();
315    UBool isInvariant = uprv_isInvariantUString(name.getBuffer(), nameLen);
316
317    if (isInvariant) {
318        name.extract(0, nameLen, buf, (int32_t)sizeof(buf), US_INV);
319        buf[127] = 0;   // Make sure that we NULL terminate the string.
320    }
321    if (!isInvariant || uscript_getCode(buf, &code, 1, &ec) != 1 || U_FAILURE(ec))
322    {
323        code = USCRIPT_INVALID_CODE;
324    }
325    return code;
326}
327
328/**
329 * Registers standard transliterators with the system.  Called by
330 * Transliterator during initialization.  Scan all current targets and
331 * register those that are scripts T as Any-T/V.
332 */
333void AnyTransliterator::registerIDs() {
334
335    UErrorCode ec = U_ZERO_ERROR;
336    Hashtable seen(TRUE, ec);
337
338    int32_t sourceCount = Transliterator::_countAvailableSources();
339    for (int32_t s=0; s<sourceCount; ++s) {
340        UnicodeString source;
341        Transliterator::_getAvailableSource(s, source);
342
343        // Ignore the "Any" source
344        if (source.caseCompare(ANY, 3, 0 /*U_FOLD_CASE_DEFAULT*/) == 0) continue;
345
346        int32_t targetCount = Transliterator::_countAvailableTargets(source);
347        for (int32_t t=0; t<targetCount; ++t) {
348            UnicodeString target;
349            Transliterator::_getAvailableTarget(t, source, target);
350
351            // Only process each target once
352            if (seen.geti(target) != 0) continue;
353            ec = U_ZERO_ERROR;
354            seen.puti(target, 1, ec);
355
356            // Get the script code for the target.  If not a script, ignore.
357            UScriptCode targetScript = scriptNameToCode(target);
358            if (targetScript == USCRIPT_INVALID_CODE) continue;
359
360            int32_t variantCount = Transliterator::_countAvailableVariants(source, target);
361            // assert(variantCount >= 1);
362            for (int32_t v=0; v<variantCount; ++v) {
363                UnicodeString variant;
364                Transliterator::_getAvailableVariant(v, source, target, variant);
365
366                UnicodeString id;
367                TransliteratorIDParser::STVtoID(UnicodeString(TRUE, ANY, 3), target, variant, id);
368                ec = U_ZERO_ERROR;
369                AnyTransliterator* t = new AnyTransliterator(id, target, variant,
370                                                             targetScript, ec);
371                if (U_FAILURE(ec)) {
372                    delete t;
373                } else {
374                    Transliterator::_registerInstance(t);
375                    Transliterator::_registerSpecialInverse(target, UnicodeString(TRUE, NULL_ID, 4), FALSE);
376                }
377            }
378        }
379    }
380}
381
382U_NAMESPACE_END
383
384#endif /* #if !UCONFIG_NO_TRANSLITERATION */
385
386//eof
387