1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html#License
3/*
4 *******************************************************************************
5 * Copyright (C) 1996-2016, International Business Machines Corporation and    *
6 * others. All Rights Reserved.                                                *
7 *******************************************************************************
8 */
9package com.ibm.icu.text;
10
11import java.util.HashMap;
12import java.util.Map;
13
14/**
15 * <code>RuleBasedTransliterator</code> is a transliterator
16 * that reads a set of rules in order to determine how to perform
17 * translations. Rule sets are stored in resource bundles indexed by
18 * name. Rules within a rule set are separated by semicolons (';').
19 * To include a literal semicolon, prefix it with a backslash ('\').
20 * Unicode Pattern_White_Space is ignored.
21 * If the first non-blank character on a line is '#',
22 * the entire line is ignored as a comment.
23 *
24 * <p>Each set of rules consists of two groups, one forward, and one
25 * reverse. This is a convention that is not enforced; rules for one
26 * direction may be omitted, with the result that translations in
27 * that direction will not modify the source text. In addition,
28 * bidirectional forward-reverse rules may be specified for
29 * symmetrical transformations.
30 *
31 * <p><b>Rule syntax</b>
32 *
33 * <p>Rule statements take one of the following forms:
34 *
35 * <dl>
36 *     <dt><code>$alefmadda=\u0622;</code></dt>
37 *     <dd><strong>Variable definition.</strong> The name on the
38 *         left is assigned the text on the right. In this example,
39 *         after this statement, instances of the left hand name,
40 *         &quot;<code>$alefmadda</code>&quot;, will be replaced by
41 *         the Unicode character U+0622. Variable names must begin
42 *         with a letter and consist only of letters, digits, and
43 *         underscores. Case is significant. Duplicate names cause
44 *         an exception to be thrown, that is, variables cannot be
45 *         redefined. The right hand side may contain well-formed
46 *         text of any length, including no text at all (&quot;<code>$empty=;</code>&quot;).
47 *         The right hand side may contain embedded <code>UnicodeSet</code>
48 *         patterns, for example, &quot;<code>$softvowel=[eiyEIY]</code>&quot;.</dd>
49 *     <dd>&nbsp;</dd>
50 *     <dt><code>ai&gt;$alefmadda;</code></dt>
51 *     <dd><strong>Forward translation rule.</strong> This rule
52 *         states that the string on the left will be changed to the
53 *         string on the right when performing forward
54 *         transliteration.</dd>
55 *     <dt>&nbsp;</dt>
56 *     <dt><code>ai&lt;$alefmadda;</code></dt>
57 *     <dd><strong>Reverse translation rule.</strong> This rule
58 *         states that the string on the right will be changed to
59 *         the string on the left when performing reverse
60 *         transliteration.</dd>
61 * </dl>
62 *
63 * <dl>
64 *     <dt><code>ai&lt;&gt;$alefmadda;</code></dt>
 *     <dd><strong>Bidirectional translation rule.</strong> This
 *         rule states that the string on the left will be changed
 *         to the string on the right when performing forward
 *         transliteration, and vice versa when performing reverse
 *         transliteration.</dd>
70 * </dl>
71 *
72 * <p>Translation rules consist of a <em>match pattern</em> and an <em>output
73 * string</em>. The match pattern consists of literal characters,
74 * optionally preceded by context, and optionally followed by
75 * context. Context characters, like literal pattern characters,
76 * must be matched in the text being transliterated. However, unlike
77 * literal pattern characters, they are not replaced by the output
78 * text. For example, the pattern &quot;<code>abc{def}</code>&quot;
79 * indicates the characters &quot;<code>def</code>&quot; must be
80 * preceded by &quot;<code>abc</code>&quot; for a successful match.
81 * If there is a successful match, &quot;<code>def</code>&quot; will
82 * be replaced, but not &quot;<code>abc</code>&quot;. The final '<code>}</code>'
83 * is optional, so &quot;<code>abc{def</code>&quot; is equivalent to
84 * &quot;<code>abc{def}</code>&quot;. Another example is &quot;<code>{123}456</code>&quot;
85 * (or &quot;<code>123}456</code>&quot;) in which the literal
86 * pattern &quot;<code>123</code>&quot; must be followed by &quot;<code>456</code>&quot;.
87 *
88 * <p>The output string of a forward or reverse rule consists of
89 * characters to replace the literal pattern characters. If the
90 * output string contains the character '<code>|</code>', this is
91 * taken to indicate the location of the <em>cursor</em> after
92 * replacement. The cursor is the point in the text at which the
93 * next replacement, if any, will be applied. The cursor is usually
94 * placed within the replacement text; however, it can actually be
 * placed into the preceding or following context by using the
96 * special character '<code>@</code>'. Examples:
97 *
98 * <blockquote>
99 *     <p><code>a {foo} z &gt; | @ bar; # foo -&gt; bar, move cursor
100 *     before a<br>
101 *     {foo} xyz &gt; bar @@|; #&nbsp;foo -&gt; bar, cursor between
102 *     y and z</code>
103 * </blockquote>
104 *
105 * <p><b>UnicodeSet</b>
106 *
107 * <p><code>UnicodeSet</code> patterns may appear anywhere that
108 * makes sense. They may appear in variable definitions.
109 * Contrariwise, <code>UnicodeSet</code> patterns may themselves
110 * contain variable references, such as &quot;<code>$a=[a-z];$not_a=[^$a]</code>&quot;,
111 * or &quot;<code>$range=a-z;$ll=[$range]</code>&quot;.
112 *
113 * <p><code>UnicodeSet</code> patterns may also be embedded directly
114 * into rule strings. Thus, the following two rules are equivalent:
115 *
116 * <blockquote>
117 *     <p><code>$vowel=[aeiou]; $vowel&gt;'*'; # One way to do this<br>
118 *     [aeiou]&gt;'*';
119 *     &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;#
120 *     Another way</code>
121 * </blockquote>
122 *
123 * <p>See {@link UnicodeSet} for more documentation and examples.
124 *
125 * <p><b>Segments</b>
126 *
127 * <p>Segments of the input string can be matched and copied to the
128 * output string. This makes certain sets of rules simpler and more
129 * general, and makes reordering possible. For example:
130 *
131 * <blockquote>
132 *     <p><code>([a-z]) &gt; $1 $1;
133 *     &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;#
134 *     double lowercase letters<br>
135 *     ([:Lu:]) ([:Ll:]) &gt; $2 $1; # reverse order of Lu-Ll pairs</code>
136 * </blockquote>
137 *
138 * <p>The segment of the input string to be copied is delimited by
139 * &quot;<code>(</code>&quot; and &quot;<code>)</code>&quot;. Up to
140 * nine segments may be defined. Segments may not overlap. In the
141 * output string, &quot;<code>$1</code>&quot; through &quot;<code>$9</code>&quot;
142 * represent the input string segments, in left-to-right order of
143 * definition.
144 *
145 * <p><b>Anchors</b>
146 *
147 * <p>Patterns can be anchored to the beginning or the end of the text. This is done with the
148 * special characters '<code>^</code>' and '<code>$</code>'. For example:
149 *
150 * <blockquote>
151 *   <p><code>^ a&nbsp;&nbsp; &gt; 'BEG_A'; &nbsp;&nbsp;# match 'a' at start of text<br>
152 *   &nbsp; a&nbsp;&nbsp; &gt; 'A';&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; # match other instances
153 *   of 'a'<br>
154 *   &nbsp; z $ &gt; 'END_Z'; &nbsp;&nbsp;# match 'z' at end of text<br>
155 *   &nbsp; z&nbsp;&nbsp; &gt; 'Z';&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; # match other instances
156 *   of 'z'</code>
157 * </blockquote>
158 *
159 * <p>It is also possible to match the beginning or the end of the text using a <code>UnicodeSet</code>.
160 * This is done by including a virtual anchor character '<code>$</code>' at the end of the
 * set pattern. Although this is usually the match character for the end anchor, the set will
162 * match either the beginning or the end of the text, depending on its placement. For
163 * example:
164 *
165 * <blockquote>
166 *   <p><code>$x = [a-z$]; &nbsp;&nbsp;# match 'a' through 'z' OR anchor<br>
167 *   $x 1&nbsp;&nbsp;&nbsp; &gt; 2;&nbsp;&nbsp; # match '1' after a-z or at the start<br>
168 *   &nbsp;&nbsp; 3 $x &gt; 4; &nbsp;&nbsp;# match '3' before a-z or at the end</code>
169 * </blockquote>
170 *
171 * <p><b>Example</b>
172 *
173 * <p>The following example rules illustrate many of the features of
174 * the rule language.
175 *
176 * <table border="0" cellpadding="4">
177 *     <tr>
178 *         <td style="vertical-align: top;">Rule 1.</td>
 *         <td style="vertical-align: top; white-space: nowrap;"><code>abc{def}&gt;x|y</code></td>
180 *     </tr>
181 *     <tr>
182 *         <td style="vertical-align: top;">Rule 2.</td>
 *         <td style="vertical-align: top; white-space: nowrap;"><code>xyz&gt;r</code></td>
184 *     </tr>
185 *     <tr>
186 *         <td style="vertical-align: top;">Rule 3.</td>
 *         <td style="vertical-align: top; white-space: nowrap;"><code>yz&gt;q</code></td>
188 *     </tr>
189 * </table>
190 *
191 * <p>Applying these rules to the string &quot;<code>adefabcdefz</code>&quot;
192 * yields the following results:
193 *
194 * <table border="0" cellpadding="4">
195 *     <tr>
 *         <td style="vertical-align: top; white-space: nowrap;"><code>|adefabcdefz</code></td>
197 *         <td style="vertical-align: top;">Initial state, no rules match. Advance
198 *         cursor.</td>
199 *     </tr>
200 *     <tr>
 *         <td style="vertical-align: top; white-space: nowrap;"><code>a|defabcdefz</code></td>
202 *         <td style="vertical-align: top;">Still no match. Rule 1 does not match
203 *         because the preceding context is not present.</td>
204 *     </tr>
205 *     <tr>
 *         <td style="vertical-align: top; white-space: nowrap;"><code>ad|efabcdefz</code></td>
207 *         <td style="vertical-align: top;">Still no match. Keep advancing until
208 *         there is a match...</td>
209 *     </tr>
210 *     <tr>
 *         <td style="vertical-align: top; white-space: nowrap;"><code>ade|fabcdefz</code></td>
212 *         <td style="vertical-align: top;">...</td>
213 *     </tr>
214 *     <tr>
 *         <td style="vertical-align: top; white-space: nowrap;"><code>adef|abcdefz</code></td>
216 *         <td style="vertical-align: top;">...</td>
217 *     </tr>
218 *     <tr>
 *         <td style="vertical-align: top; white-space: nowrap;"><code>adefa|bcdefz</code></td>
220 *         <td style="vertical-align: top;">...</td>
221 *     </tr>
222 *     <tr>
 *         <td style="vertical-align: top; white-space: nowrap;"><code>adefab|cdefz</code></td>
224 *         <td style="vertical-align: top;">...</td>
225 *     </tr>
226 *     <tr>
 *         <td style="vertical-align: top; white-space: nowrap;"><code>adefabc|defz</code></td>
228 *         <td style="vertical-align: top;">Rule 1 matches; replace &quot;<code>def</code>&quot;
229 *         with &quot;<code>xy</code>&quot; and back up the cursor
230 *         to before the '<code>y</code>'.</td>
231 *     </tr>
232 *     <tr>
 *         <td style="vertical-align: top; white-space: nowrap;"><code>adefabcx|yz</code></td>
234 *         <td style="vertical-align: top;">Although &quot;<code>xyz</code>&quot; is
235 *         present, rule 2 does not match because the cursor is
236 *         before the '<code>y</code>', not before the '<code>x</code>'.
237 *         Rule 3 does match. Replace &quot;<code>yz</code>&quot;
238 *         with &quot;<code>q</code>&quot;.</td>
239 *     </tr>
240 *     <tr>
 *         <td style="vertical-align: top; white-space: nowrap;"><code>adefabcxq|</code></td>
242 *         <td style="vertical-align: top;">The cursor is at the end;
243 *         transliteration is complete.</td>
244 *     </tr>
245 * </table>
246 *
247 * <p>The order of rules is significant. If multiple rules may match
248 * at some point, the first matching rule is applied.
249 *
250 * <p>Forward and reverse rules may have an empty output string.
251 * Otherwise, an empty left or right hand side of any statement is a
252 * syntax error.
253 *
254 * <p>Single quotes are used to quote any character other than a
255 * digit or letter. To specify a single quote itself, inside or
256 * outside of quotes, use two single quotes in a row. For example,
257 * the rule &quot;<code>'&gt;'&gt;o''clock</code>&quot; changes the
258 * string &quot;<code>&gt;</code>&quot; to the string &quot;<code>o'clock</code>&quot;.
259 *
260 * <p><b>Notes</b>
261 *
262 * <p>While a RuleBasedTransliterator is being built, it checks that
263 * the rules are added in proper order. For example, if the rule
264 * &quot;a&gt;x&quot; is followed by the rule &quot;ab&gt;y&quot;,
265 * then the second rule will throw an exception. The reason is that
266 * the second rule can never be triggered, since the first rule
267 * always matches anything it matches. In other words, the first
268 * rule <em>masks</em> the second rule.
269 *
270 * <p>Copyright (c) IBM Corporation 1999-2000. All rights reserved.
271 *
272 * @author Alan Liu
273 * @internal
274 * @deprecated This API is ICU internal only.
275 */
276@Deprecated
277public class RuleBasedTransliterator extends Transliterator {
278
279    private final Data data;
280
281//    /**
282//     * Constructs a new transliterator from the given rules.
283//     * @param rules rules, separated by ';'
284//     * @param direction either FORWARD or REVERSE.
285//     * @exception IllegalArgumentException if rules are malformed
286//     * or direction is invalid.
287//     */
288//     public RuleBasedTransliterator(String ID, String rules, int direction,
289//                                   UnicodeFilter filter) {
290//        super(ID, filter);
291//        if (direction != FORWARD && direction != REVERSE) {
292//            throw new IllegalArgumentException("Invalid direction");
293//        }
294//
295//        TransliteratorParser parser = new TransliteratorParser();
296//        parser.parse(rules, direction);
297//        if (parser.idBlockVector.size() != 0 ||
298//            parser.compoundFilter != null) {
299//            throw new IllegalArgumentException("::ID blocks illegal in RuleBasedTransliterator constructor");
300//        }
301//
302//        data = (Data)parser.dataVector.get(0);
303//        setMaximumContextLength(data.ruleSet.getMaximumContextLength());
304//     }
305
306//    /**
307//     * Constructs a new transliterator from the given rules in the
308//     * <code>FORWARD</code> direction.
309//     * @param rules rules, separated by ';'
310//     * @exception IllegalArgumentException if rules are malformed
311//     * or direction is invalid.
312//     */
313//    public RuleBasedTransliterator(String ID, String rules) {
314//        this(ID, rules, FORWARD, null);
315//    }
316
317    RuleBasedTransliterator(String ID, Data data, UnicodeFilter filter) {
318        super(ID, filter);
319        this.data = data;
320        setMaximumContextLength(data.ruleSet.getMaximumContextLength());
321    }
322
323    /**
324     * Implements {@link Transliterator#handleTransliterate}.
325     * @internal
326     * @deprecated This API is ICU internal only.
327     */
328    @Override
329    @Deprecated
330    protected void handleTransliterate(Replaceable text,
331                                       Position index, boolean incremental) {
332        /* We keep start and limit fixed the entire time,
333         * relative to the text -- limit may move numerically if text is
334         * inserted or removed.  The cursor moves from start to limit, with
335         * replacements happening under it.
336         *
337         * Example: rules 1. ab>x|y
338         *                2. yc>z
339         *
340         * |eabcd   start - no match, advance cursor
341         * e|abcd   match rule 1 - change text & adjust cursor
342         * ex|ycd   match rule 2 - change text & adjust cursor
343         * exz|d    no match, advance cursor
344         * exzd|    done
345         */
346
347        /* A rule like
348         *   a>b|a
349         * creates an infinite loop. To prevent that, we put an arbitrary
350         * limit on the number of iterations that we take, one that is
351         * high enough that any reasonable rules are ok, but low enough to
352         * prevent a server from hanging.  The limit is 16 times the
353         * number of characters n, unless n is so large that 16n exceeds a
354         * uint32_t.
355         */
356        synchronized(data)  {
357            int loopCount = 0;
358            int loopLimit = (index.limit - index.start) << 4;
359            if (loopLimit < 0) {
360                loopLimit = 0x7FFFFFFF;
361            }
362
363            while (index.start < index.limit &&
364                    loopCount <= loopLimit &&
365                    data.ruleSet.transliterate(text, index, incremental)) {
366                ++loopCount;
367            }
368        }
369    }
370
371
372    static class Data {
373        public Data() {
374            variableNames = new HashMap<String, char[]>();
375            ruleSet = new TransliterationRuleSet();
376        }
377
378        /**
379         * Rule table.  May be empty.
380         */
381        public TransliterationRuleSet ruleSet;
382
383        /**
384         * Map variable name (String) to variable (char[]).  A variable name
385         * corresponds to zero or more characters, stored in a char[] array in
386         * this hash.  One or more of these chars may also correspond to a
387         * UnicodeSet, in which case the character in the char[] in this hash is
388         * a stand-in: it is an index for a secondary lookup in
389         * data.variables.  The stand-in also represents the UnicodeSet in
390         * the stored rules.
391         */
392        Map<String, char[]> variableNames;
393
394        /**
395         * Map category variable (Character) to UnicodeMatcher or UnicodeReplacer.
396         * Variables that correspond to a set of characters are mapped
397         * from variable name to a stand-in character in data.variableNames.
398         * The stand-in then serves as a key in this hash to lookup the
399         * actual UnicodeSet object.  In addition, the stand-in is
400         * stored in the rule text to represent the set of characters.
401         * variables[i] represents character (variablesBase + i).
402         */
403        Object[] variables;
404
405        /**
406         * The character that represents variables[0].  Characters
407         * variablesBase through variablesBase +
408         * variables.length - 1 represent UnicodeSet objects.
409         */
410        char variablesBase;
411
412        /**
413         * Return the UnicodeMatcher represented by the given character, or
414         * null if none.
415         */
416        public UnicodeMatcher lookupMatcher(int standIn) {
417            int i = standIn - variablesBase;
418            return (i >= 0 && i < variables.length)
419                ? (UnicodeMatcher) variables[i] : null;
420        }
421
422        /**
423         * Return the UnicodeReplacer represented by the given character, or
424         * null if none.
425         */
426        public UnicodeReplacer lookupReplacer(int standIn) {
427            int i = standIn - variablesBase;
428            return (i >= 0 && i < variables.length)
429                ? (UnicodeReplacer) variables[i] : null;
430        }
431    }
432
433
434    /**
435     * Return a representation of this transliterator as source rules.
436     * These rules will produce an equivalent transliterator if used
437     * to construct a new transliterator.
438     * @param escapeUnprintable if TRUE then convert unprintable
439     * character to their hex escape representations, \\uxxxx or
440     * \\Uxxxxxxxx.  Unprintable characters are those other than
441     * U+000A, U+0020..U+007E.
442     * @return rules string
443     * @internal
444     * @deprecated This API is ICU internal only.
445     */
446    @Override
447    @Deprecated
448    public String toRules(boolean escapeUnprintable) {
449        return data.ruleSet.toRules(escapeUnprintable);
450    }
451
452//    /**
453//     * Return the set of all characters that may be modified by this
454//     * Transliterator, ignoring the effect of our filter.
455//     */
456//    protected UnicodeSet handleGetSourceSet() {
457//        return data.ruleSet.getSourceTargetSet(false, unicodeFilter);
458//    }
459//
460//    /**
461//     * Returns the set of all characters that may be generated as
462//     * replacement text by this transliterator.
463//     */
464//    public UnicodeSet getTargetSet() {
465//        return data.ruleSet.getSourceTargetSet(true, unicodeFilter);
466//    }
467
468    /**
469     * @internal
470     * @deprecated This API is ICU internal only.
471     */
472    @Deprecated
473    @Override
474    public void addSourceTargetSet(UnicodeSet filter, UnicodeSet sourceSet, UnicodeSet targetSet) {
475        data.ruleSet.addSourceTargetSet(filter, sourceSet, targetSet);
476    }
477
478    /**
479     * Temporary hack for registry problem. Needs to be replaced by better architecture.
480     * @internal
481     * @deprecated This API is ICU internal only.
482     */
483    @Deprecated
484    public Transliterator safeClone() {
485        UnicodeFilter filter = getFilter();
486        if (filter != null && filter instanceof UnicodeSet) {
487            filter = new UnicodeSet((UnicodeSet)filter);
488        }
489        return new RuleBasedTransliterator(getID(), data, filter);
490    }
491}
492
493
494