// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
 *******************************************************************************
 * Copyright (C) 2001-2016, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */
package com.ibm.icu.text;

117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/**
127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <code>UnicodeMatcher</code> defines a protocol for objects that can
137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * match a range of characters in a Replaceable string.
147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.0
157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */
167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertpublic interface UnicodeMatcher {
177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * Constant returned by <code>matches()</code> indicating a
207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * mismatch between the text and this matcher.  The text contains
217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * a character which does not match, or the text does not contain
227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * all desired characters for a non-incremental match.
237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * @stable ICU 2.0
247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     */
257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public static final int U_MISMATCH = 0;
267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * Constant returned by <code>matches()</code> indicating a
297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * partial match between the text and this matcher.  This value is
307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * only returned for incremental match operations.  All characters
317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * of the text match, but more characters are required for a
327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * complete match.  Alternatively, for variable-length matchers,
337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * all characters of the text match, and if more characters were
347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * supplied at limit, they might also match.
357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * @stable ICU 2.0
367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     */
377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public static final int U_PARTIAL_MATCH = 1;
387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * Constant returned by <code>matches()</code> indicating a
417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * complete match between the text and this matcher.  For an
427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * incremental variable-length match, this value is returned if
437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * the given text matches, and it is known that additional
447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * characters would not alter the extent of the match.
457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * @stable ICU 2.0
467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     */
477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public static final int U_MATCH = 2;
487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
50bee65486a185907111f3be60992433e133ec0e32Scott Russell     * The character at index i, where i &lt; contextStart || i &gt;= contextLimit,
517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * is ETHER.  This allows explicit matching by rules and UnicodeSets
527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * of text outside the context.  In traditional terms, this allows anchoring
537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * at the start and/or end.
547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * @stable ICU 2.0
557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     */
567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    static final char ETHER = '\uFFFF';
577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * Return a UMatchDegree value indicating the degree of match for
607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * the given text at the given offset.  Zero, one, or more
617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * characters may be matched.
627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     *
63bee65486a185907111f3be60992433e133ec0e32Scott Russell     * Matching in the forward direction is indicated by limit &gt;
647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * offset.  Characters from offset forwards to limit-1 will be
657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * considered for matching.
667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     *
67bee65486a185907111f3be60992433e133ec0e32Scott Russell     * Matching in the reverse direction is indicated by limit &lt;
687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * offset.  Characters from offset backwards to limit+1 will be
697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * considered for matching.
707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     *
717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * If limit == offset then the only match possible is a zero
727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * character match (which subclasses may implement if desired).
737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     *
747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * If U_MATCH is returned, then as a side effect, advance the
757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * offset parameter to the limit of the matched substring.  In the
767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * forward direction, this will be the index of the last matched
777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * character plus one.  In the reverse direction, this will be the
787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * index of the last matched character minus one.
797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     *
807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * @param text the text to be matched
817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * @param offset on input, the index into text at which to begin
827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * matching.  On output, the limit of the matched text.  The
837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * number of matched characters is the output value of offset
847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * minus the input value.  Offset should always point to the
857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * HIGH SURROGATE (leading code unit) of a pair of surrogates,
867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * both on entry and upon return.
877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * @param limit the limit index of text to be matched.  Greater
887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * than offset for a forward direction match, less than offset for
897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * a backward direction match.  The last character to be
907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * considered for matching will be text.charAt(limit-1) in the
917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * forward direction or text.charAt(limit+1) in the backward
927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * direction.
937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * @param incremental if TRUE, then assume further characters may
947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * be inserted at limit and check for partial matching.  Otherwise
957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * assume the text as given is complete.
967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * @return a match degree value indicating a full match, a partial
977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * match, or a mismatch.  If incremental is FALSE then
987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * U_PARTIAL_MATCH should never be returned.
997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * @stable ICU 2.0
1007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     */
1017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public abstract int matches(Replaceable text,
1027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                                int[] offset,
1037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                                int limit,
1047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                                boolean incremental);
1057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
1077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * Returns a string representation of this matcher.  If the result of
1087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * calling this function is passed to the appropriate parser, it
1097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * will produce another matcher that is equal to this one.
1107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * @param escapeUnprintable if TRUE then convert unprintable
1117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * character to their hex escape representations, \\uxxxx or
1127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * \\Uxxxxxxxx.  Unprintable characters are those other than
1137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * U+000A, U+0020..U+007E.
1147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * @stable ICU 2.0
1157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     */
1167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public abstract String toPattern(boolean escapeUnprintable);
1177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
1197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * Returns TRUE if this matcher will match a character c, where c
120bee65486a185907111f3be60992433e133ec0e32Scott Russell     * &amp; 0xFF == v, at offset, in the forward direction (with limit &gt;
1217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * offset).  This is used by <tt>RuleBasedTransliterator</tt> for
1227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * indexing.
1237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     *
1247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * <p>Note:  This API uses an int even though the value will be
1257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * restricted to 8 bits in order to avoid complications with
1267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * signedness (bytes convert to ints in the range -128..127).
1277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * @stable ICU 2.0
1287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     */
1297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public abstract boolean matchesIndexValue(int v);
1307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
1327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * Union the set of all characters that may be matched by this object
1337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * into the given set.
1347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * @param toUnionTo the set into which to union the source characters
1357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * @stable ICU 2.2
1367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     */
1377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public abstract void addMatchSetTo(UnicodeSet toUnionTo);
1387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert}

//eof