1/*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements.  See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License.  You may obtain a copy of the License at
8 *
9 *     http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18// BEGIN android-note
19// The icu implementation used was changed from icu4j to icu4jni.
20// END android-note
21
22package java.text;
23
24import org.apache.harmony.text.internal.nls.Messages;
25
26/**
 * A concrete implementation class for {@code Collator}.
28 * <p>
29 * {@code RuleBasedCollator} has the following restrictions for efficiency
30 * (other subclasses may be used for more complex languages):
31 * <ol>
32 * <li> If a French secondary ordering is specified it applies to the whole
33 * collator object.</li>
34 * <li> All non-mentioned Unicode characters are at the end of the collation
35 * order.</li>
36 * <li> If a character is not located in the {@code RuleBasedCollator}, the
37 * default Unicode Collation Algorithm (UCA) rulebased table is automatically
38 * searched as a backup.</li>
39 * </ol>
40 * <p>
 * The collation table is composed of a list of collation rules, where each rule
 * takes one of three forms:
43 * <blockquote>
44 *
45 * <pre>
 * &lt;modifier&gt;
 * &lt;relation&gt; &lt;text-argument&gt;
 * &lt;reset&gt; &lt;text-argument&gt;
49 * </pre>
50 *
51 * </blockquote>
52 * <p>
53 * The rule elements are defined as follows:
54 * <ul type="disc">
55 * <li><strong>Text-Argument</strong>: A text-argument is any sequence of
56 * characters, excluding special characters (that is, common whitespace
57 * characters [0009-000D, 0020] and rule syntax characters [0021-002F,
58 * 003A-0040, 005B-0060, 007B-007E]). If those characters are desired, you can
59 * put them in single quotes (for example, use '&' for ampersand). Note that
60 * unquoted white space characters are ignored; for example, {@code b c} is
61 * treated as {@code bc}.</li>
62 * <li><strong>Modifier</strong>: There is a single modifier which is used to
63 * specify that all accents (secondary differences) are backwards.
64 * <p>
65 * '@' : Indicates that accents are sorted backwards, as in French.
66 * </li>
67 * <li><strong>Relation</strong>: The relations are the following:
 * <ul type="square">
 * <li>'&lt;' : Greater, as a letter difference (primary)</li>
 * <li>';' : Greater, as an accent difference (secondary)</li>
 * <li>',' : Greater, as a case difference (tertiary)</li>
 * <li>'=' : Equal</li>
 * </ul>
74 * </li>
75 * <li><strong>Reset</strong>: There is a single reset which is used primarily
76 * for contractions and expansions, but which can also be used to add a
77 * modification at the end of a set of rules.
78 * <p>
79 * '&' : Indicates that the next rule follows the position to where the reset
80 * text-argument would be sorted.
81 * </li>
82 * </ul>
83 * <p>
84 * This sounds more complicated than it is in practice. For example, the
85 * following are equivalent ways of expressing the same thing:
86 * <blockquote>
87 *
88 * <pre>
89 * a < b < c
90 * a < b & b < c
91 * a < c & a < b
92 * </pre>
93 *
94 * </blockquote>
95 * <p>
96 * Notice that the order is important, as the subsequent item goes immediately
97 * after the text-argument. The following are not equivalent:
98 * <blockquote>
99 *
100 * <pre>
101 * a < b & a < c
102 * a < c & a < b
103 * </pre>
104 *
105 * </blockquote>
106 * <p>
107 * Either the text-argument must already be present in the sequence, or some
108 * initial substring of the text-argument must be present. For example
109 * {@code "a < b & ae < e"} is valid since "a" is present in the sequence before
110 * "ae" is reset. In this latter case, "ae" is not entered and treated as a
111 * single character; instead, "e" is sorted as if it were expanded to two
112 * characters: "a" followed by an "e". This difference appears in natural
113 * languages: in traditional Spanish "ch" is treated as if it contracts to a
114 * single character (expressed as {@code "c < ch < d"}), while in traditional
115 * German a-umlaut is treated as if it expands to two characters (expressed as
 * {@code "a,A < b,B  ... & ae;\u00e4 & AE;\u00c4"}, where \u00e4 and \u00c4
117 * are the escape sequences for a-umlaut).
118 * <h4>Ignorable Characters</h4>
119 * <p>
120 * For ignorable characters, the first rule must start with a relation (the
121 * examples we have used above are really fragments; {@code "a < b"} really
122 * should be {@code "< a < b"}). If, however, the first relation is not
123 * {@code "<"}, then all text-arguments up to the first {@code "<"} are
124 * ignorable. For example, {@code ", - < a < b"} makes {@code "-"} an ignorable
125 * character.
126 * <h4>Normalization and Accents</h4>
127 * <p>
128 * {@code RuleBasedCollator} automatically processes its rule table to include
129 * both pre-composed and combining-character versions of accented characters.
130 * Even if the provided rule string contains only base characters and separate
131 * combining accent characters, the pre-composed accented characters matching
132 * all canonical combinations of characters from the rule string will be entered
133 * in the table.
134 * <p>
135 * This allows you to use a RuleBasedCollator to compare accented strings even
136 * when the collator is set to NO_DECOMPOSITION. However, if the strings to be
137 * collated contain combining sequences that may not be in canonical order, you
138 * should set the collator to CANONICAL_DECOMPOSITION to enable sorting of
139 * combining sequences. For more information, see <a
140 * href="http://www.aw.com/devpress">The Unicode Standard, Version 3.0</a>.
141 * <h4>Errors</h4>
142 * <p>
143 * The following rules are not valid:
144 * <ul type="disc">
145 * <li>A text-argument contains unquoted punctuation symbols, for example
146 * {@code "a < b-c < d"}.</li>
147 * <li>A relation or reset character is not followed by a text-argument, for
148 * example {@code "a < , b"}.</li>
149 * <li>A reset where the text-argument (or an initial substring of the
150 * text-argument) is not already in the sequence or allocated in the default UCA
151 * table, for example {@code "a < b & e < f"}.</li>
152 * </ul>
153 * <p>
154 * If you produce one of these errors, {@code RuleBasedCollator} throws a
155 * {@code ParseException}.
156 * <h4>Examples</h4>
157 * <p>
158 * Normally, to create a rule-based collator object, you will use
159 * {@code Collator}'s factory method {@code getInstance}. However, to create a
160 * rule-based collator object with specialized rules tailored to your needs, you
161 * construct the {@code RuleBasedCollator} with the rules contained in a
162 * {@code String} object. For example:
163 * <blockquote>
164 *
165 * <pre>
166 * String Simple = "< a < b < c < d";
167 *
168 * RuleBasedCollator mySimple = new RuleBasedCollator(Simple);
169 * </pre>
170 *
171 * </blockquote>
172 * <p>
173 * Or:
174 * <blockquote>
175 *
176 * <pre>
177 * String Norwegian = "< a,A< b,B< c,C< d,D< e,E< f,F< g,G< h,H< i,I"
178 *         + "< j,J< k,K< l,L< m,M< n,N< o,O< p,P< q,Q< r,R"
179 *         + "< s,S< t,T< u,U< v,V< w,W< x,X< y,Y< z,Z"
180 *         + "< \u00E5=a\u030A,\u00C5=A\u030A"
181 *         + ";aa,AA< \u00E6,\u00C6< \u00F8,\u00D8";
182 *
183 * RuleBasedCollator myNorwegian = new RuleBasedCollator(Norwegian);
184 * </pre>
185 *
186 * </blockquote>
187 * <p>
188 * Combining {@code Collator}s is as simple as concatenating strings. Here is
189 * an example that combines two {@code Collator}s from two different locales:
190 * <blockquote>
191 *
192 * <pre>
193 * // Create an en_US Collator object
194 * RuleBasedCollator en_USCollator = (RuleBasedCollator)Collator
195 *         .getInstance(new Locale("en", "US", ""));
196 *
197 * // Create a da_DK Collator object
198 * RuleBasedCollator da_DKCollator = (RuleBasedCollator)Collator
199 *         .getInstance(new Locale("da", "DK", ""));
200 *
201 * // Combine the two collators
202 * // First, get the collation rules from en_USCollator
203 * String en_USRules = en_USCollator.getRules();
204 *
205 * // Second, get the collation rules from da_DKCollator
206 * String da_DKRules = da_DKCollator.getRules();
207 *
208 * RuleBasedCollator newCollator = new RuleBasedCollator(en_USRules + da_DKRules);
209 * // newCollator has the combined rules
210 * </pre>
211 *
212 * </blockquote>
213 * <p>
 * The next example shows how to make changes to an existing table to create a
 * new {@code Collator} object. For example, add {@code "& C < ch, cH, Ch, CH"}
 * to the {@code en_USCollator} object to create your own:
217 * <blockquote>
218 *
219 * <pre>
220 * // Create a new Collator object with additional rules
221 * String addRules = "& C < ch, cH, Ch, CH";
222 *
 * RuleBasedCollator myCollator =
 *     new RuleBasedCollator(en_USCollator.getRules() + addRules);
224 * // myCollator contains the new rules
225 * </pre>
226 *
227 * </blockquote>
228 * <p>
229 * The following example demonstrates how to change the order of non-spacing
230 * accents:
231 * <blockquote>
232 *
233 * <pre>
234 * // old rule
235 * String oldRules = "= \u00a8 ; \u00af ; \u00bf" + "< a , A ; ae, AE ; \u00e6 , \u00c6"
236 *         + "< b , B < c, C < e, E & C < d, D";
237 *
238 * // change the order of accent characters
239 * String addOn = "& \u00bf ; \u00af ; \u00a8;";
240 *
241 * RuleBasedCollator myCollator = new RuleBasedCollator(oldRules + addOn);
242 * </pre>
243 *
244 * </blockquote>
245 * <p>
246 * The last example shows how to put new primary ordering in before the default
247 * setting. For example, in the Japanese {@code Collator}, you can either sort
248 * English characters before or after Japanese characters:
249 * <blockquote>
250 *
251 * <pre>
252 * // get en_US Collator rules
253 * RuleBasedCollator en_USCollator = (RuleBasedCollator)
254 *     Collator.getInstance(Locale.US);
255 *
 * // add a few Japanese characters to sort before English characters
257 * // suppose the last character before the first base letter 'a' in
258 * // the English collation rule is \u30A2
259 * String jaString = "& \u30A2 , \u30FC < \u30C8";
260 *
261 * RuleBasedCollator myJapaneseCollator =
262 *     new RuleBasedCollator(en_USCollator.getRules() + jaString);
263 * </pre>
264 *
265 * </blockquote>
266 */
267public class RuleBasedCollator extends Collator {
268
269    RuleBasedCollator(com.ibm.icu4jni.text.Collator wrapper) {
270        super(wrapper);
271    }
272
273    /**
274     * Constructs a new instance of {@code RuleBasedCollator} using the
275     * specified {@code rules}. The {@code rules} are usually either
276     * hand-written based on the {@link RuleBasedCollator class description} or
277     * the result of a former {@link #getRules()} call.
278     * <p>
279     * Note that the {@code rules} are actually interpreted as a delta to the
280     * standard Unicode Collation Algorithm (UCA). Hence, an empty {@code rules}
281     * string results in the default UCA rules being applied. This differs
282     * slightly from other implementations which work with full {@code rules}
283     * specifications and may result in different behavior.
284     *
285     * @param rules
286     *            the collation rules.
287     * @throws NullPointerException
288     *             if {@code rules} is {@code null}.
289     * @throws ParseException
290     *             if {@code rules} contains rules with invalid collation rule
291     *             syntax.
292     */
293    public RuleBasedCollator(String rules) throws ParseException {
294        if (rules == null) {
295            throw new NullPointerException();
296        }
297        // BEGIN android-removed
298        // if (rules.length() == 0) {
299        //     // text.06=Build rules empty
300        //     throw new ParseException(Messages.getString("text.06"), 0); //$NON-NLS-1$
301        // }
302        // END andriod-removed
303
304        try {
305            this.icuColl = new com.ibm.icu4jni.text.RuleBasedCollator(rules);
306            // BEGIN android-added
307            this.icuColl.setDecomposition(
308                    com.ibm.icu4jni.text.Collator.CANONICAL_DECOMPOSITION);
309            // END android-added
310        } catch (Exception e) {
311            if (e instanceof ParseException) {
312                throw (ParseException) e;
313            }
314            /*
315             * -1 means it's not a ParseException. Maybe IOException thrown when
316             * an error occured while reading internal data.
317             */
318            throw new ParseException(e.getMessage(), -1);
319        }
320    }
321
322    /**
323     * Obtains a {@code CollationElementIterator} for the given
324     * {@code CharacterIterator}. The source iterator's integrity will be
325     * preserved since a new copy will be created for use.
326     *
327     * @param source
328     *            the source character iterator.
329     * @return a {@code CollationElementIterator} for {@code source}.
330     */
331    public CollationElementIterator getCollationElementIterator(
332            CharacterIterator source) {
333        if (source == null) {
334            throw new NullPointerException();
335        }
336        return new CollationElementIterator(
337                ((com.ibm.icu4jni.text.RuleBasedCollator) this.icuColl)
338                        .getCollationElementIterator(source));
339    }
340
341    /**
342     * Obtains a {@code CollationElementIterator} for the given string.
343     *
344     * @param source
345     *            the source string.
346     * @return the {@code CollationElementIterator} for {@code source}.
347     */
348    public CollationElementIterator getCollationElementIterator(String source) {
349        if (source == null) {
350            throw new NullPointerException();
351        }
352        return new CollationElementIterator(
353                ((com.ibm.icu4jni.text.RuleBasedCollator) this.icuColl)
354                        .getCollationElementIterator(source));
355    }
356
357    /**
358     * Returns the collation rules of this collator. These {@code rules} can be
359     * fed into the {@code RuleBasedCollator(String)} constructor.
360     * <p>
361     * Note that the {@code rules} are actually interpreted as a delta to the
362     * standard Unicode Collation Algorithm (UCA). Hence, an empty {@code rules}
363     * string results in the default UCA rules being applied. This differs
364     * slightly from other implementations which work with full {@code rules}
365     * specifications and may result in different behavior.
366     *
367     * @return the collation rules.
368     */
369    public String getRules() {
370        return ((com.ibm.icu4jni.text.RuleBasedCollator) this.icuColl).getRules();
371    }
372
373    /**
374     * Returns a new collator with the same collation rules, decomposition mode and
375     * strength value as this collator.
376     *
377     * @return a shallow copy of this collator.
378     * @see java.lang.Cloneable
379     */
380    @Override
381    public Object clone() {
382        RuleBasedCollator clone = (RuleBasedCollator) super.clone();
383        return clone;
384    }
385
386    /**
387     * Compares the {@code source} text to the {@code target} text according to
388     * the collation rules, strength and decomposition mode for this
389     * {@code RuleBasedCollator}. See the {@code Collator} class description
390     * for an example of use.
391     * <p>
392     * General recommendation: If comparisons are to be done with the same strings
393     * multiple times, it is more efficient to generate {@code CollationKey}
394     * objects for the strings and use
395     * {@code CollationKey.compareTo(CollationKey)} for the comparisons. If each
396     * string is compared to only once, using
397     * {@code RuleBasedCollator.compare(String, String)} has better performance.
398     *
399     * @param source
400     *            the source text.
401     * @param target
402     *            the target text.
403     * @return an integer which may be a negative value, zero, or else a
404     *         positive value depending on whether {@code source} is less than,
405     *         equivalent to, or greater than {@code target}.
406     */
407    @Override
408    public int compare(String source, String target) {
409        if (source == null || target == null) {
410            // text.08=one of arguments is null
411            throw new NullPointerException(Messages.getString("text.08")); //$NON-NLS-1$
412        }
413        return this.icuColl.compare(source, target);
414    }
415
416    /**
417     * Returns the {@code CollationKey} for the given source text.
418     *
419     * @param source
420     *            the specified source text.
421     * @return the {@code CollationKey} for the given source text.
422     */
423    @Override
424    public CollationKey getCollationKey(String source) {
425        com.ibm.icu4jni.text.CollationKey icuKey = this.icuColl
426                .getCollationKey(source);
427        if (icuKey == null) {
428            return null;
429        }
430        return new CollationKey(source, icuKey);
431    }
432
433    @Override
434    public int hashCode() {
435        return ((com.ibm.icu4jni.text.RuleBasedCollator) this.icuColl).getRules()
436                .hashCode();
437    }
438
439    /**
440     * Compares the specified object with this {@code RuleBasedCollator} and
441     * indicates if they are equal. In order to be equal, {@code object} must be
442     * an instance of {@code Collator} with the same collation rules and the
443     * same attributes.
444     *
445     * @param obj
446     *            the object to compare with this object.
447     * @return {@code true} if the specified object is equal to this
448     *         {@code RuleBasedCollator}; {@code false} otherwise.
449     * @see #hashCode
450     */
451    @Override
452    public boolean equals(Object obj) {
453        if (!(obj instanceof Collator)) {
454            return false;
455        }
456        return super.equals(obj);
457    }
458}
459