12d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert// © 2016 and later: Unicode, Inc. and others. 22d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert// License & terms of use: http://www.unicode.org/copyright.html#License 37935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/* 47935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ******************************************************************************* 5bee65486a185907111f3be60992433e133ec0e32Scott Russell * Copyright (C) 1996-2016, International Business Machines Corporation and 67935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * others. All Rights Reserved. 77935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ******************************************************************************* 87935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 97935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertpackage com.ibm.icu.text; 107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport java.text.CharacterIterator; 127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport java.text.StringCharacterIterator; 137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport java.util.Locale; 147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.util.ICUException; 167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.util.ULocale; 177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert// Java porting note: 197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert// 207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert// The ICU4C implementation contains dead code in many places. 
217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert// While porting the ICU4C linear search implementation, this dead code 227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert// was not fully ported. The code blocks tagged by "// *** Boyer-Moore ***" 237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert// are those dead code blocks, still available in ICU4C. 247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert// The ICU4C implementation does not seem to handle UCharacterIterator pointing 267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert// to a fragment of text properly. ICU4J uses CharacterIterator to navigate through 277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert// the input text. We need to carefully review the code ported from ICU4C 287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert// assuming the start index is 0. 297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert// ICU4C implementation initializes pattern.CE and pattern.PCE. It looks like 317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert// CE is no longer used, except in a few places checking CELength. It looks like this 327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert// is a leftover from already-disabled Boyer-Moore search code. This Java implementation 337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert// preserves the code, but we should clean this up later. 
347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/** 367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <tt>StringSearch</tt> is a {@link SearchIterator} that provides 387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * language-sensitive text searching based on the comparison rules defined 397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * in a {@link RuleBasedCollator} object. 407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * StringSearch ensures that language eccentricity can be 417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * handled, e.g. for the German collator, characters ß and SS will be matched 427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * if case is chosen to be ignored. 437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * See the <a href="http://source.icu-project.org/repos/icu/icuhtml/trunk/design/collation/ICU_collation_design.htm"> 447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * "ICU Collation Design Document"</a> for more information. 457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <p> 467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * There are 2 match options for selection:<br> 477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Let S' be the sub-string of a text string S between the offsets start and 487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * end [start, end]. 497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <br> 507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * A pattern string P matches a text string S at the offsets [start, end] 517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * if 527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <pre> 537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * option 1. 
Some canonical equivalent of P matches some canonical equivalent
 *              of S'
 *    option 2. P matches S' and if P starts or ends with a combining mark,
 *              there exists no non-ignorable combining mark before or after S'
 *              in S respectively.
 * </pre>
 * Option 2. is the default.
 * <p>
 * This search has APIs similar to that of other text iteration mechanisms
 * such as the break iterators in {@link BreakIterator}. Using these
 * APIs, it is easy to scan through text looking for all occurrences of
 * a given pattern. This search iterator allows changing of direction by
 * calling a {@link #reset} followed by a {@link #next} or {@link #previous}.
 * Though a direction change can occur without calling {@link #reset} first,
 * this operation comes with some speed penalty.
687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Match results in the forward direction will match the result matches in 697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * the backwards direction in the reverse order 707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <p> 717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * {@link SearchIterator} provides APIs to specify the starting position 727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * within the text string to be searched, e.g. {@link SearchIterator#setIndex setIndex}, 737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * {@link SearchIterator#preceding preceding} and {@link SearchIterator#following following}. 747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Since the starting position will be set as it is specified, please take note that 757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * there are some danger points at which the search may render incorrect 767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * results: 777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <ul> 787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <li> In the midst of a substring that requires normalization. 797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <li> If the following match is to be found, the position should not be the 807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * second character which requires swapping with the preceding 817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * character. Vice versa, if the preceding match is to be found, the 827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * position to search from should not be the first character which 837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * requires swapping with the next character. E.g certain Thai and 847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Lao characters require swapping. 
* <li> If a following pattern match is to be found, any position within a
 *      contracting sequence except the first will fail. Vice versa if a
 *      preceding pattern match is to be found, an invalid starting point
 *      would be any character within a contracting sequence except the last.
 * </ul>
 * <p>
 * A {@link BreakIterator} can be used if only matches at logical breaks are desired.
 * Using a {@link BreakIterator} will only give you results that exactly match the
 * boundaries given by the {@link BreakIterator}. For instance the pattern "e" will
 * not be found in the string "\u00e9" if a character break iterator is used.
 * <p>
 * Options are provided to handle overlapping matches.
 * E.g. In English, overlapping matches produce the results 0 and 2
 * for the pattern "abab" in the text "ababab", where mutually
 * exclusive matches only produce the result 0.
1007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <p> 1017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Options are also provided to implement "asymmetric search" as described in 1027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <a href="http://www.unicode.org/reports/tr10/#Asymmetric_Search"> 1037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * UTS #10 Unicode Collation Algorithm</a>, specifically the ElementComparisonType 1047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * values. 1057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <p> 1067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Though collator attributes will be taken into consideration while 1077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * performing matches, there are no APIs here for setting and getting the 1087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * attributes. These attributes can be set by getting the collator 1097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * from {@link #getCollator} and using the APIs in {@link RuleBasedCollator}. 1107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Lastly to update <tt>StringSearch</tt> to the new collator attributes, 1117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * {@link #reset} has to be called. 1127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <p> 1137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Restriction: <br> 1147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Currently there are no composite characters that consists of a 115bee65486a185907111f3be60992433e133ec0e32Scott Russell * character with combining class > 0 before a character with combining 1167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * class == 0. However, if such a character exists in the future, 1177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <tt>StringSearch</tt> does not guarantee the results for option 1. 
1187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <p> 1197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Consult the {@link SearchIterator} documentation for information on 1207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * and examples of how to use instances of this class to implement text 1217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * searching. 1227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <p> 1237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Note, <tt>StringSearch</tt> is not to be subclassed. 1247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * </p> 1257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see SearchIterator 1267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see RuleBasedCollator 1277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @author Laura Werner, synwee 1287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.0 1297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 1307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert// internal notes: all methods do not guarantee the correct status of the 1317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert// characteriterator. the caller has to maintain the original index position 1327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert// if necessary. 
methods could change the index position as it deems fit 1337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertpublic final class StringSearch extends SearchIterator { 1347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private Pattern pattern_; 1367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private RuleBasedCollator collator_; 1377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // positions within the collation element iterator is used to determine 1397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // if we are at the start of the text. 1407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private CollationElementIterator textIter_; 1417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private CollationPCE textProcessedIter_; 1427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // utility collation element, used throughout program for temporary 1447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // iteration. 
1457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private CollationElementIterator utilIter_; 1467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 147bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert private Normalizer2 nfd_; 148bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert 1497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int strength_; 1507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int ceMask_; 1517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int variableTop_; 1527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private boolean toShift_; 1547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // *** Boyer-Moore *** 1567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // private char[] canonicalPrefixAccents_; 1577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // private char[] canonicalSuffixAccents_; 1587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 1607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Initializes the iterator to use the language-specific rules defined in 1617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * the argument collator to search for argument pattern in the argument 1627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * target text. The argument <code>breakiter</code> is used to define logical matches. 1637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * See super class documentation for more details on the use of the target 1647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * text and {@link BreakIterator}. 1657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param pattern text to look for. 1667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param target target text to search for pattern. 
1677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param collator {@link RuleBasedCollator} that defines the language rules 1687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param breakiter A {@link BreakIterator} that is used to determine the 1697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * boundaries of a logical match. This argument can be null. 1707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @throws IllegalArgumentException thrown when argument target is null, 1717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * or of length 0 1727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see BreakIterator 1737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see RuleBasedCollator 1747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.0 1757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 1767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public StringSearch(String pattern, CharacterIterator target, RuleBasedCollator collator, 1777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert BreakIterator breakiter) { 1787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // This implementation is ported from ICU4C usearch_open() 1807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert super(target, breakiter); 1827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // string search does not really work when numeric collation is turned on 1847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (collator.getNumericCollation()) { 1857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new UnsupportedOperationException("Numeric collation is not supported by StringSearch"); 1867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 
1877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert collator_ = collator; 1897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert strength_ = collator.getStrength(); 1907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ceMask_ = getMask(strength_); 1917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert toShift_ = collator.isAlternateHandlingShifted(); 1927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert variableTop_ = collator.getVariableTop(); 1937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 194bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert nfd_ = Normalizer2.getNFDInstance(); 195bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert 1967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert pattern_ = new Pattern(pattern); 1977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert search_.setMatchedLength(0); 1997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert search_.matchedIndex_ = DONE; 2007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert utilIter_ = null; 2027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert textIter_ = new CollationElementIterator(target, collator); 2037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert textProcessedIter_ = null; 2057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // This is done by super class constructor 2077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* 2087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert search_.isOverlap_ = false; 2097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert search_.isCanonicalMatch_ = false; 2107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert search_.elementComparisonType_ = 
ElementComparisonType.STANDARD_ELEMENT_COMPARISON; 2117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert search_.isForwardSearching_ = true; 2127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert search_.reset_ = true; 2137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 2147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ULocale collLocale = collator.getLocale(ULocale.VALID_LOCALE); 2157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert search_.internalBreakIter_ = BreakIterator.getCharacterInstance(collLocale == null ? ULocale.ROOT : collLocale); 2167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert search_.internalBreakIter_.setText((CharacterIterator)target.clone()); // We need to create a clone 2177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert initialize(); 2197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 2227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Initializes the iterator to use the language-specific rules defined in 2237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * the argument collator to search for argument pattern in the argument 2247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * target text. No {@link BreakIterator}s are set to test for logical matches. 2257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param pattern text to look for. 2267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param target target text to search for pattern. 
2277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param collator {@link RuleBasedCollator} that defines the language rules 2287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @throws IllegalArgumentException thrown when argument target is null, 2297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * or of length 0 2307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see RuleBasedCollator 2317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.0 2327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 2337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public StringSearch(String pattern, CharacterIterator target, RuleBasedCollator collator) { 2347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert this(pattern, target, collator, null); 2357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 2387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Initializes the iterator to use the language-specific rules and 2397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * break iterator rules defined in the argument locale to search for 2407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * argument pattern in the argument target text. 2417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param pattern text to look for. 2427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param target target text to search for pattern. 2437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param locale locale to use for language and break iterator rules 2447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @throws IllegalArgumentException thrown when argument target is null, 2457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * or of length 0. 
ClassCastException thrown if the collator for 2467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * the specified locale is not a RuleBasedCollator. 2477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.0 2487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 2497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public StringSearch(String pattern, CharacterIterator target, Locale locale) { 2507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert this(pattern, target, ULocale.forLocale(locale)); 2517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 2547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Initializes the iterator to use the language-specific rules and 2557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * break iterator rules defined in the argument locale to search for 2567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * argument pattern in the argument target text. 2577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * See super class documentation for more details on the use of the target 2587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * text and {@link BreakIterator}. 2597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param pattern text to look for. 2607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param target target text to search for pattern. 2617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param locale locale to use for language and break iterator rules 2627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @throws IllegalArgumentException thrown when argument target is null, 2637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * or of length 0. 
ClassCastException thrown if the collator for 2647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * the specified locale is not a RuleBasedCollator. 2657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see BreakIterator 2667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see RuleBasedCollator 2677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see SearchIterator 2687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 3.2 2697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 2707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public StringSearch(String pattern, CharacterIterator target, ULocale locale) { 2717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert this(pattern, target, (RuleBasedCollator) Collator.getInstance(locale), null); 2727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 2757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Initializes the iterator to use the language-specific rules and 2767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * break iterator rules defined in the default locale to search for 2777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * argument pattern in the argument target text. 2787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param pattern text to look for. 2797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param target target text to search for pattern. 2807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @throws IllegalArgumentException thrown when argument target is null, 2817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * or of length 0. ClassCastException thrown if the collator for 2827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * the default locale is not a RuleBasedCollator. 
2837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.0 2847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 2857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public StringSearch(String pattern, String target) { 2867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert this(pattern, new StringCharacterIterator(target), 2877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert (RuleBasedCollator) Collator.getInstance(), null); 2887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 2917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Gets the {@link RuleBasedCollator} used for the language rules. 2927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <p> 2937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Since <tt>StringSearch</tt> depends on the returned {@link RuleBasedCollator}, any 2947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * changes to the {@link RuleBasedCollator} result should follow with a call to 2957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * either {@link #reset()} or {@link #setCollator(RuleBasedCollator)} to ensure the correct 2967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * search behavior. 
2977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * </p> 2987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return {@link RuleBasedCollator} used by this <tt>StringSearch</tt> 2997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see RuleBasedCollator 3007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see #setCollator 3017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.0 3027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 3037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public RuleBasedCollator getCollator() { 3047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return collator_; 3057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 3087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Sets the {@link RuleBasedCollator} to be used for language-specific searching. 3097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <p> 3107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The iterator's position will not be changed by this method. 
3117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param collator to use for this <tt>StringSearch</tt> 3127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @throws IllegalArgumentException thrown when collator is null 3137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see #getCollator 3147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.0 3157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 3167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public void setCollator(RuleBasedCollator collator) { 3177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (collator == null) { 3187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new IllegalArgumentException("Collator can not be null"); 3197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert collator_ = collator; 3217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ceMask_ = getMask(collator_.getStrength()); 3227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ULocale collLocale = collator.getLocale(ULocale.VALID_LOCALE); 3247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert search_.internalBreakIter_ = BreakIterator.getCharacterInstance(collLocale == null ? 
ULocale.ROOT : collLocale); 3257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert search_.internalBreakIter_.setText((CharacterIterator)search_.text().clone()); // We need to create a clone 3267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert toShift_ = collator.isAlternateHandlingShifted(); 3287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert variableTop_ = collator.getVariableTop(); 3297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert textIter_ = new CollationElementIterator(pattern_.text_, collator); 3307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert utilIter_ = new CollationElementIterator(pattern_.text_, collator); 3317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // initialize() _after_ setting the iterators for the new collator. 3337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert initialize(); 3347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 3377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Returns the pattern for which <tt>StringSearch</tt> is searching for. 3387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return the pattern searched for 3397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.0 3407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 3417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public String getPattern() { 3427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return pattern_.text_; 3437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 3467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Set the pattern to search for. 
3477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The iterator's position will not be changed by this method. 3487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param pattern for searching 3497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see #getPattern 3507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @exception IllegalArgumentException thrown if pattern is null or of 3517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * length 0 3527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.0 3537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 3547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public void setPattern(String pattern) { 3557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (pattern == null || pattern.length() <= 0) { 3567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new IllegalArgumentException( 3577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert "Pattern to search for can not be null or of length 0"); 3587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert pattern_.text_ = pattern; 3607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert initialize(); 3617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 3647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Determines whether canonical matches (option 1, as described in the 3657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * class documentation) is set. 3667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * See setCanonical(boolean) for more information. 
3677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see #setCanonical 3687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return true if canonical matches is set, false otherwise 3697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 3707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 3717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert //TODO: hoist this to SearchIterator 3727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public boolean isCanonical() { 3737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return search_.isCanonicalMatch_; 3747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 3777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Set the canonical match mode. See class documentation for details. 3787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The default setting for this property is false. 
3797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param allowCanonical flag indicator if canonical matches are allowed 3807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see #isCanonical 3817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 3827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 3837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert //TODO: hoist this to SearchIterator 3847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public void setCanonical(boolean allowCanonical) { 3857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert search_.isCanonicalMatch_ = allowCanonical; 3867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 3897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * {@inheritDoc} 3907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 3917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 3927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert @Override 3937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public void setTarget(CharacterIterator text) { 3947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert super.setTarget(text); 3957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert textIter_.setText(text); 3967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 3997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * {@inheritDoc} 4007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 4017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 4027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert @Override 4037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int getIndex() { 4047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik 
Roubert int result = textIter_.getOffset(); 4057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (isOutOfBounds(search_.beginIndex(), search_.endIndex(), result)) { 4067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return DONE; 4077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return result; 4097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 4127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * {@inheritDoc} 4137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 4147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 4157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert @Override 4167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public void setIndex(int position) { 4177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Java porting note: This method is equivalent to setOffset() in ICU4C. 4187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // ICU4C SearchIterator::setOffset() is a pure virtual method, while 4197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // ICU4J SearchIterator.setIndex() is not abstract method. 
4207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert super.setIndex(position); 4227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert textIter_.setOffset(position); 4237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 4267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * {@inheritDoc} 4277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 4287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 4297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert @Override 4307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public void reset() { 4317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // reset is setting the attributes that are already in 4327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // string search, hence all attributes in the collator should 4337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // be retrieved without any problems 4347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert boolean sameCollAttribute = true; 4367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int ceMask; 4377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert boolean shift; 4387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int varTop; 4397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // **** hack to deal w/ how processed CEs encode quaternary **** 4417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int newStrength = collator_.getStrength(); 4427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if ((strength_ < Collator.QUATERNARY && newStrength >= Collator.QUATERNARY) 4437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert || (strength_ >= Collator.QUATERNARY && 
newStrength < Collator.QUATERNARY)) { 4447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert sameCollAttribute = false; 4457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert strength_ = collator_.getStrength(); 4487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ceMask = getMask(strength_); 4497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (ceMask_ != ceMask) { 4507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ceMask_ = ceMask; 4517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert sameCollAttribute = false; 4527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert shift = collator_.isAlternateHandlingShifted(); 4557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (toShift_ != shift) { 4567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert toShift_ = shift; 4577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert sameCollAttribute = false; 4587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert varTop = collator_.getVariableTop(); 4617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (variableTop_ != varTop) { 4627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert variableTop_ = varTop; 4637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert sameCollAttribute = false; 4647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (!sameCollAttribute) { 4677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert initialize(); 4687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 
4697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert textIter_.setText(search_.text()); 4717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert search_.setMatchedLength(0); 4737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert search_.matchedIndex_ = DONE; 4747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert search_.isOverlap_ = false; 4757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert search_.isCanonicalMatch_ = false; 4767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert search_.elementComparisonType_ = ElementComparisonType.STANDARD_ELEMENT_COMPARISON; 4777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert search_.isForwardSearching_ = true; 4787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert search_.reset_ = true; 4797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 4827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * {@inheritDoc} 4837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 4847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 4857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert @Override 4867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert protected int handleNext(int position) { 4877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (pattern_.CELength_ == 0) { 4887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert search_.matchedIndex_ = search_.matchedIndex_ == DONE ? 
4897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert getIndex() : search_.matchedIndex_ + 1; 4907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert search_.setMatchedLength(0); 4917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert textIter_.setOffset(search_.matchedIndex_); 4927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (search_.matchedIndex_ == search_.endIndex()) { 4937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert search_.matchedIndex_ = DONE; 4947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 4967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (search_.matchedLength() <= 0) { 4977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // the flipping direction issue has already been handled 4987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // in next() 4997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // for boundary check purposes. this will ensure that the 5007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // next match will not preceed the current offset 5017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // note search_.matchedIndex_ will always be set to something 5027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // in the code 5037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert search_.matchedIndex_ = position - 1; 5047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 5067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert textIter_.setOffset(position); 5077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 5087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // ICU4C comment: 5097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // if strsrch_->breakIter is always the same as m_breakiterator_ 5107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // then we don't need to check the match boundaries 
here because 5117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // usearch_handleNextXXX will already have done it. 5127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (search_.isCanonicalMatch_) { 5137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // *could* actually use exact here 'cause no extra accents allowed... 5147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert handleNextCanonical(); 5157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 5167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert handleNextExact(); 5177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 5197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (search_.matchedIndex_ == DONE) { 5207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert textIter_.setOffset(search_.endIndex()); 5217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 5227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert textIter_.setOffset(search_.matchedIndex_); 5237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 5257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return search_.matchedIndex_; 5267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 5287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return DONE; 5297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 5317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 5327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * {@inheritDoc} 5337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 5347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 5357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert @Override 5367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert protected int 
handlePrevious(int position) { 5377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (pattern_.CELength_ == 0) { 5387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert search_.matchedIndex_ = 5397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert search_.matchedIndex_ == DONE ? getIndex() : search_.matchedIndex_; 5407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (search_.matchedIndex_ == search_.beginIndex()) { 5417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert setMatchNotFound(); 5427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 5437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert search_.matchedIndex_--; 5447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert textIter_.setOffset(search_.matchedIndex_); 5457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert search_.setMatchedLength(0); 5467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 5487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert textIter_.setOffset(position); 5497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 5507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (search_.isCanonicalMatch_) { 5517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // *could* use exact match here since extra accents *not* allowed! 
5527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert handlePreviousCanonical(); 5537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 5547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert handlePreviousExact(); 5557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 5587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return search_.matchedIndex_; 5597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 5617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // ------------------ Internal implementation code --------------------------- 5627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 5637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final int INITIAL_ARRAY_SIZE_ = 256; 5647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 5657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // *** Boyer-Moore *** 5667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // private static final Normalizer2Impl nfcImpl_ = Norm2AllModes.getNFCInstance().impl; 5677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // private static final int LAST_BYTE_MASK_ = 0xff; 5687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // private static final int SECOND_LAST_BYTE_SHIFT_ = 8; 5697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 5707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final int PRIMARYORDERMASK = 0xffff0000; 5717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final int SECONDARYORDERMASK = 0x0000ff00; 5727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final int TERTIARYORDERMASK = 0x000000ff; 5737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 5747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 
5757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Getting the mask for collation strength 5767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param strength collation strength 5777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return collation element mask 5787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 5797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static int getMask(int strength) { 5807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert switch (strength) { 5817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case Collator.PRIMARY: 5827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return PRIMARYORDERMASK; 5837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case Collator.SECONDARY: 5847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return SECONDARYORDERMASK | PRIMARYORDERMASK; 5857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert default: 5867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return TERTIARYORDERMASK | SECONDARYORDERMASK | PRIMARYORDERMASK; 5877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 5907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 5917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // *** Boyer-Moore *** 5927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* 5937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private final char getFCD(String str, int offset) { 5947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert char ch = str.charAt(offset); 5957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (ch < 0x180) { 5967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return (char) nfcImpl_.getFCD16FromBelow180(ch); 5977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if (nfcImpl_.singleLeadMightHaveNonZeroFCD16(ch)) { 
5987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (!Character.isHighSurrogate(ch)) { 5997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return (char) nfcImpl_.getFCD16FromNormData(ch); 6007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 6017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert char c2; 6027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (++offset < str.length() && Character.isLowSurrogate(c2 = str.charAt(offset))) { 6037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return (char) nfcImpl_.getFCD16FromNormData(Character.toCodePoint(ch, c2)); 6047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return 0; 6087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 6107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private final char getFCD(int c) { 6117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return (char)nfcImpl_.getFCD16(c); 6127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 6147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 6157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 6167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Getting the modified collation elements taking into account the collation 6177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * attributes. 
6187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 6197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param sourcece 6207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return the modified collation element 6217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 6227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int getCE(int sourcece) { 6237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // note for tertiary we can't use the collator->tertiaryMask, that 6247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // is a preprocessed mask that takes into account case options. since 6257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // we are only concerned with exact matches, we don't need that. 6267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert sourcece &= ceMask_; 6277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 6287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (toShift_) { 6297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // alternate handling here, since only the 16 most significant digits 6307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // is only used, we can safely do a compare without masking 6317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // if the ce is a variable, we mask and get only the primary values 6327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // no shifting to quartenary is required since all primary values 6337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // less than variabletop will need to be masked off anyway. 
6347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (variableTop_ > sourcece) { 6357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (strength_ >= Collator.QUATERNARY) { 6367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert sourcece &= PRIMARYORDERMASK; 6377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 6387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert sourcece = CollationElementIterator.IGNORABLE; 6397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if (strength_ >= Collator.QUATERNARY && sourcece == CollationElementIterator.IGNORABLE) { 6427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert sourcece = 0xFFFF; 6437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 6457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return sourcece; 6467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 6487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 6497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Direct port of ICU4C static int32_t * addTouint32_tArray(...) in usearch.cpp. 6507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * This is used for appending a PCE to Pattern.PCE_ buffer. We probably should 6517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * implement this in Pattern class. 
6527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 6537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param destination target array 6547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param offset destination offset to add value 6557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param destinationlength target array size 6567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param value to be added 6577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param increments incremental size expected 6587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return new destination array, destination if there was no new allocation 6597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 6607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static int[] addToIntArray(int[] destination, int offset, int destinationlength, 6617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int value, int increments) { 6627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int newlength = destinationlength; 6637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (offset + 1 == newlength) { 6647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert newlength += increments; 6657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int temp[] = new int[newlength]; 6667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert System.arraycopy(destination, 0, temp, 0, offset); 6677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert destination = temp; 6687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert destination[offset] = value; 6707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return destination; 6717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 6737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 6747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik 
Roubert * Direct port of ICU4C static int64_t * addTouint64_tArray(...) in usearch.cpp. 6757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * This is used for appending a PCE to Pattern.PCE_ buffer. We probably should 6767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * implement this in Pattern class. 6777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 6787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param destination target array 6797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param offset destination offset to add value 6807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param destinationlength target array size 6817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param value to be added 6827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param increments incremental size expected 6837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return new destination array, destination if there was no new allocation 6847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 6857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static long[] addToLongArray(long[] destination, int offset, int destinationlength, 6867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long value, int increments) { 6877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int newlength = destinationlength; 6887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (offset + 1 == newlength) { 6897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert newlength += increments; 6907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long temp[] = new long[newlength]; 6917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert System.arraycopy(destination, 0, temp, 0, offset); 6927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert destination = temp; 6937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 
destination[offset] = value; 6957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return destination; 6967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 6987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 6997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Initializing the ce table for a pattern. 7007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Stores non-ignorable collation keys. 7017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Table size will be estimated by the size of the pattern text. Table 7027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * expansion will be perform as we go along. Adding 1 to ensure that the table 7037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * size definitely increases. 7047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return total number of expansions 7057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 7067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // TODO: We probably do not need Pattern CE table. 
7077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int initializePatternCETable() { 7087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int[] cetable = new int[INITIAL_ARRAY_SIZE_]; 7097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int cetablesize = cetable.length; 7107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int patternlength = pattern_.text_.length(); 7117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CollationElementIterator coleiter = utilIter_; 7127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 7137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (coleiter == null) { 7147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert coleiter = new CollationElementIterator(pattern_.text_, collator_); 7157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert utilIter_ = coleiter; 7167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 7177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert coleiter.setText(pattern_.text_); 7187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 7207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int offset = 0; 7217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int result = 0; 7227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int ce; 7237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 7247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while ((ce = coleiter.next()) != CollationElementIterator.NULLORDER) { 7257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int newce = getCE(ce); 7267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (newce != CollationElementIterator.IGNORABLE /* 0 */) { 7277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int[] temp = addToIntArray(cetable, offset, cetablesize, newce, 7287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert patternlength - coleiter.getOffset() + 1); 
7297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert offset++; 7307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert cetable = temp; 7317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result += (coleiter.getMaxExpansion(ce) - 1); 7337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 7357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert cetable[offset] = 0; 7367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert pattern_.CE_ = cetable; 7377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert pattern_.CELength_ = offset; 7387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 7397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return result; 7407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 7427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 7437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Initializing the pce table for a pattern. 7447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Stores non-ignorable collation keys. 7457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Table size will be estimated by the size of the pattern text. Table 7467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * expansion will be perform as we go along. Adding 1 to ensure that the table 7477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * size definitely increases. 
7487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return total number of expansions 7497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 7507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int initializePatternPCETable() { 7517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long[] pcetable = new long[INITIAL_ARRAY_SIZE_]; 7527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int pcetablesize = pcetable.length; 7537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int patternlength = pattern_.text_.length(); 7547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CollationElementIterator coleiter = utilIter_; 7557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 7567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (coleiter == null) { 7577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert coleiter = new CollationElementIterator(pattern_.text_, collator_); 7587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert utilIter_ = coleiter; 7597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 7607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert coleiter.setText(pattern_.text_); 7617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 7637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int offset = 0; 7647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int result = 0; 7657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long pce; 7667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 7677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CollationPCE iter = new CollationPCE(coleiter); 7687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 7697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // ** Should processed CEs be signed or unsigned? 
7707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // ** (the rest of the code in this file seems to play fast-and-loose with 7717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // ** whether a CE is signed or unsigned. For example, look at routine above this one.) 7727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while ((pce = iter.nextProcessed(null)) != CollationPCE.PROCESSED_NULLORDER) { 7737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long[] temp = addToLongArray(pcetable, offset, pcetablesize, pce, patternlength - coleiter.getOffset() + 1); 7747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert offset++; 7757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert pcetable = temp; 7767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 7787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert pcetable[offset] = 0; 7797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert pattern_.PCE_ = pcetable; 7807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert pattern_.PCELength_ = offset; 7817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 7827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return result; 7837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 7857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // TODO: This method only triggers initializePatternCETable(), which is probably no 7867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // longer needed. 7877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int initializePattern() { 7887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Since the strength is primary, accents are ignored in the pattern. 
7897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 7907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // *** Boyer-Moore *** 7917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* 7927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (strength_ == Collator.PRIMARY) { 7937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert pattern_.hasPrefixAccents_ = false; 7947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert pattern_.hasSuffixAccents_ = false; 7957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 7967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert pattern_.hasPrefixAccents_ = (getFCD(pattern_.text_, 0) >>> SECOND_LAST_BYTE_SHIFT_) != 0; 7977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert pattern_.hasSuffixAccents_ = (getFCD(pattern_.text_.codePointBefore(pattern_.text_.length())) & LAST_BYTE_MASK_) != 0; 7987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 8007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 8017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert pattern_.PCE_ = null; 8027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 8037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // since intializePattern is an internal method status is a success. 
8047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return initializePatternCETable(); 8057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 8077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // *** Boyer-Moore *** 8087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* 8097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private final void setShiftTable(char shift[], 8107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert char backshift[], 8117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int cetable[], int cesize, 8127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int expansionsize, 8137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int defaultforward, 8147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int defaultbackward) { 8157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // No implementation 8167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 8187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 8197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // TODO: This method only triggers initializePattern(), which is probably no 8207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // longer needed. 
8217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void initialize() { 8227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* int expandlength = */ initializePattern(); 8237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 8247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // *** Boyer-Moore *** 8257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* 8267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (pattern_.CELength_ > 0) { 8277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int cesize = pattern_.CELength_; 8287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int minlength = cesize > expandlength ? cesize - expandlength : 1; 8297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert pattern_.defaultShiftSize_ = minlength; 8307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert setShiftTable(pattern_.shift_, pattern_.backShift_, pattern_.CE_, cesize, 8317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert expandlength, minlength, minlength); 8327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return; 8337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return pattern_.defaultShiftSize_; 8357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 8367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 8387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 8397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @internal 8407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @deprecated This API is ICU internal only. 
8417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 8427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert @Deprecated 8437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert protected void setMatchNotFound() { 8447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert super.setMatchNotFound(); 8457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // SearchIterator#setMatchNotFound() does following: 8467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // search_.matchedIndex_ = DONE; 8477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // search_.setMatchedLength(0); 8487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (search_.isForwardSearching_) { 8497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert textIter_.setOffset(search_.text().getEndIndex()); 8507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 8517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert textIter_.setOffset(0); 8527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 8557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 8567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Checks if the offset runs out of the text string range 8577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param textstart offset of the first character in the range 8587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param textlimit limit offset of the text string range 8597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param offset to test 8607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return true if offset is out of bounds, false otherwise 8617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 8627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final boolean isOutOfBounds(int textstart, int textlimit, int offset) { 
8637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return offset < textstart || offset > textlimit; 8647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 8667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 8677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Checks for identical match 8687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param start offset of possible match 8697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param end offset of possible match 8707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return TRUE if identical match is found 8717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 8727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private boolean checkIdentical(int start, int end) { 8737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (strength_ != Collator.IDENTICAL) { 8747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return true; 8757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Note: We could use Normalizer::compare() or similar, but for short strings 8777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // which may not be in FCD it might be faster to just NFD them. 
8787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String textstr = getString(targetText, start, end - start); 8797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (Normalizer.quickCheck(textstr, Normalizer.NFD, 0) == Normalizer.NO) { 8807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert textstr = Normalizer.decompose(textstr, false); 8817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String patternstr = pattern_.text_; 8837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (Normalizer.quickCheck(patternstr, Normalizer.NFD, 0) == Normalizer.NO) { 8847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert patternstr = Normalizer.decompose(patternstr, false); 8857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return textstr.equals(patternstr); 8877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 8897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private boolean initTextProcessedIter() { 8907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (textProcessedIter_ == null) { 8917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert textProcessedIter_ = new CollationPCE(textIter_); 8927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 8937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert textProcessedIter_.init(textIter_); 8947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return true; 8967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 8987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* 8997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Find the next break boundary after startIndex. 
If the UStringSearch object 9007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * has an external break iterator, use that. Otherwise use the internal character 9017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * break iterator. 9027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 9037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int nextBoundaryAfter(int startIndex) { 9047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert BreakIterator breakiterator = search_.breakIter(); 9057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 9067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (breakiterator == null) { 9077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert breakiterator = search_.internalBreakIter_; 9087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 9107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (breakiterator != null) { 9117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return breakiterator.following(startIndex); 9127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 9147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return startIndex; 9157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 9177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* 9187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Returns TRUE if index is on a break boundary. If the UStringSearch 9197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * has an external break iterator, test using that, otherwise test 9207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * using the internal character break iterator. 
9217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 9227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private boolean isBreakBoundary(int index) { 9237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert BreakIterator breakiterator = search_.breakIter(); 9247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 9257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (breakiterator == null) { 9267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert breakiterator = search_.internalBreakIter_; 9277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 9297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return (breakiterator != null && breakiterator.isBoundary(index)); 9307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 9327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 9337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Java porting note: Followings are corresponding to UCompareCEsResult enum 9347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final int CE_MATCH = -1; 9357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final int CE_NO_MATCH = 0; 9367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final int CE_SKIP_TARG = 1; 9377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final int CE_SKIP_PATN = 2; 9387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 9397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static int CE_LEVEL2_BASE = 0x00000005; 9407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static int CE_LEVEL3_BASE = 0x00050000; 9417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 9427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static int compareCE64s(long targCE, long patCE, ElementComparisonType compareType) { 
9437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (targCE == patCE) { 9447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return CE_MATCH; 9457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (compareType == ElementComparisonType.STANDARD_ELEMENT_COMPARISON) { 9477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return CE_NO_MATCH; 9487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 9507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long targCEshifted = targCE >>> 32; 9517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long patCEshifted = patCE >>> 32; 9527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long mask; 9537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 9547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert mask = 0xFFFF0000L; 9557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int targLev1 = (int)(targCEshifted & mask); 9567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int patLev1 = (int)(patCEshifted & mask); 9577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (targLev1 != patLev1) { 9587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (targLev1 == 0) { 9597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return CE_SKIP_TARG; 9607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (patLev1 == 0 9627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert && compareType == ElementComparisonType.ANY_BASE_WEIGHT_IS_WILDCARD) { 9637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return CE_SKIP_PATN; 9647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return CE_NO_MATCH; 9667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik 
Roubert 9687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert mask = 0x0000FFFFL; 9697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int targLev2 = (int)(targCEshifted & mask); 9707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int patLev2 = (int)(patCEshifted & mask); 9717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (targLev2 != patLev2) { 9727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (targLev2 == 0) { 9737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return CE_SKIP_TARG; 9747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (patLev2 == 0 9767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert && compareType == ElementComparisonType.ANY_BASE_WEIGHT_IS_WILDCARD) { 9777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return CE_SKIP_PATN; 9787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return (patLev2 == CE_LEVEL2_BASE || 9807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert (compareType == ElementComparisonType.ANY_BASE_WEIGHT_IS_WILDCARD && 9817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert targLev2 == CE_LEVEL2_BASE)) ? 
CE_MATCH : CE_NO_MATCH; 9827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 9847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert mask = 0xFFFF0000L; 9857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int targLev3 = (int)(targCE & mask); 9867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int patLev3 = (int)(patCE & mask); 9877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (targLev3 != patLev3) { 9887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return (patLev3 == CE_LEVEL3_BASE || 9897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert (compareType == ElementComparisonType.ANY_BASE_WEIGHT_IS_WILDCARD && 9907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert targLev3 == CE_LEVEL3_BASE) )? CE_MATCH: CE_NO_MATCH; 9917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 9937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return CE_MATCH; 9947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 9967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 9977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * An object used for receiving matched index in search() and 9987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * searchBackwards(). 
9997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 10007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static class Match { 10017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int start_ = -1; 10027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int limit_ = -1; 10037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 10057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private boolean search(int startIdx, Match m) { 10067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Input parameter sanity check. 10077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (pattern_.CELength_ == 0 10087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert || startIdx < search_.beginIndex() 10097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert || startIdx > search_.endIndex()) { 10107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new IllegalArgumentException("search(" + startIdx + ", m) - expected position to be between " + 10117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert search_.beginIndex() + " and " + search_.endIndex()); 10127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 10147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (pattern_.PCE_ == null) { 10157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert initializePatternPCETable(); 10167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 10187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert textIter_.setOffset(startIdx); 10197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CEBuffer ceb = new CEBuffer(this); 10207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 10217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int targetIx = 0; 10227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CEI 
targetCEI = null; 10237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int patIx; 10247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert boolean found; 10257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 10267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int mStart = -1; 10277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int mLimit = -1; 10287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int minLimit; 10297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int maxLimit; 10307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 10317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Outer loop moves over match starting positions in the 10327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // target CE space. 10337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Here we see the target as a sequence of collation elements, resulting from the following: 10347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied 10357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // (for example, digraphs such as IJ may be broken into two characters). 10367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next 10377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these 10387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // fields that are for strengths below that of the collator are set to 0. 
If this makes the int64_t 10397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // CE weight 0 (as for a combining diacritic with secondary weight when the collator strentgh is primary), 10407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // then the CE is deleted, so the following code sees only CEs that are relevant. 10417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text. 10427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text 10437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER). 10447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for (targetIx = 0; ; targetIx++) { 10457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert found = true; 10467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Inner loop checks for a match beginning at each 10477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // position from the outer loop. 10487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int targetIxOffset = 0; 10497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long patCE = 0; 10507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // For targetIx > 0, this ceb.get gets a CE that is as far back in the ring buffer 10517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // (compared to the last CE fetched for the previous targetIx value) as we need to go 10527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // for this targetIx value, so if it is non-NULL then other ceb.get calls should be OK. 
10537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CEI firstCEI = ceb.get(targetIx); 10547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (firstCEI == null) { 10557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new ICUException("CEBuffer.get(" + targetIx + ") returned null."); 10567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 10587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for (patIx = 0; patIx < pattern_.PCELength_; patIx++) { 10597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert patCE = pattern_.PCE_[patIx]; 10607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert targetCEI = ceb.get(targetIx + patIx + targetIxOffset); 10617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Compare CE from target string with CE from the pattern. 10627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input, 10637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // which will fail the compare, below. 
10647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int ceMatch = compareCE64s(targetCEI.ce_, patCE, search_.elementComparisonType_); 10657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (ceMatch == CE_NO_MATCH) { 10667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert found = false; 10677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 10687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if (ceMatch > CE_NO_MATCH) { 10697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (ceMatch == CE_SKIP_TARG) { 10707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // redo with same patCE, next targCE 10717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert patIx--; 10727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert targetIxOffset++; 10737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { // ceMatch == CE_SKIP_PATN 10747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // redo with same targCE, next patCE 10757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert targetIxOffset--; 10767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert targetIxOffset += pattern_.PCELength_; // this is now the offset in target CE space to end of the match so far 10807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 10817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (!found && ((targetCEI == null) || (targetCEI.ce_ != CollationPCE.PROCESSED_NULLORDER))) { 10827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // No match at this targetIx. Try again at the next. 
10837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue; 10847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 10867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (!found) { 10877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // No match at all, we have run off the end of the target text. 10887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 10897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 10917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // We have found a match in CE space. 10927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Now determine the bounds in string index space. 10937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // There still is a chance of match failure if the CE range not correspond to 10947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // an acceptable character range. 10957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 10967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CEI lastCEI = ceb.get(targetIx + targetIxOffset -1); 10977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 10987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert mStart = firstCEI.lowIndex_; 10997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert minLimit = lastCEI.lowIndex_; 11007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 11017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Look at the CE following the match. If it is UCOL_NULLORDER the match 11027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // extended to the end of input, and the match is good. 11037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 11047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Look at the high and low indices of the CE following the match. 
If 11057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // they are the same it means one of two things: 11067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 1. The match extended to the last CE from the target text, which is OK, or 11077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 2. The last CE that was part of the match is in an expansion that extends 11087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // to the first CE after the match. In this case, we reject the match. 11097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CEI nextCEI = null; 11107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (search_.elementComparisonType_ == ElementComparisonType.STANDARD_ELEMENT_COMPARISON) { 11117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert nextCEI = ceb.get(targetIx + targetIxOffset); 11127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert maxLimit = nextCEI.lowIndex_; 11137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (nextCEI.lowIndex_ == nextCEI.highIndex_ && nextCEI.ce_ != CollationPCE.PROCESSED_NULLORDER) { 11147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert found = false; 11157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 11167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 11177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for (;; ++targetIxOffset) { 11187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert nextCEI = ceb.get(targetIx + targetIxOffset); 11197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert maxLimit = nextCEI.lowIndex_; 11207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // If we are at the end of the target too, match succeeds 11217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (nextCEI.ce_ == CollationPCE.PROCESSED_NULLORDER) { 11227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 11237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 
11247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // As long as the next CE has primary weight of 0, 11257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // it is part of the last target element matched by the pattern; 11267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // make sure it can be part of a match with the last patCE 11277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if ((((nextCEI.ce_) >>> 32) & 0xFFFF0000L) == 0) { 11287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int ceMatch = compareCE64s(nextCEI.ce_, patCE, search_.elementComparisonType_); 11297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (ceMatch == CE_NO_MATCH || ceMatch == CE_SKIP_PATN ) { 11307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert found = false; 11317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 11327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 11337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // If lowIndex == highIndex, this target CE is part of an expansion of the last matched 11347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // target element, but it has non-zero primary weight => match fails 11357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if ( nextCEI.lowIndex_ == nextCEI.highIndex_ ) { 11367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert found = false; 11377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 11387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Else the target CE is not part of an expansion of the last matched element, match succeeds 11397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 11407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 11417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 11427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 11437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 11447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 
11457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Check for the start of the match being within a combining sequence. 11467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // This can happen if the pattern itself begins with a combining char, and 11477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // the match found combining marks in the target text that were attached 11487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // to something else. 11497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // This type of match should be rejected for not completely consuming a 11507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // combining sequence. 11517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (!isBreakBoundary(mStart)) { 11527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert found = false; 11537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 11547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 11557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Check for the start of the match being within an Collation Element Expansion, 11567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // meaning that the first char of the match is only partially matched. 11577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // With expansions, the first CE will report the index of the source 11587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // character, and all subsequent (expansions) CEs will report the source index of the 11597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // _following_ character. 
11607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int secondIx = firstCEI.highIndex_; 11617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (mStart == secondIx) { 11627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert found = false; 11637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 11647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1165bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // Allow matches to end in the middle of a grapheme cluster if the following 1166bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // conditions are met; this is needed to make prefix search work properly in 1167bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // Indic, see #11750 1168bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // * the default breakIter is being used 1169bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // * the next collation element after this combining sequence 1170bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // - has non-zero primary weight 1171bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // - corresponds to a separate character following the one at end of the current match 1172bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // (the second of these conditions, and perhaps both, may be redundant given the 1173bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // subsequent check for normalization boundary; however they are likely much faster 1174bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // tests in any case) 1175bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // * the match limit is a normalization boundary 1176bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert boolean allowMidclusterMatch = 1177bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert breakIterator == null && 11782d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert (((nextCEI.ce_) >>> 32) & 0xFFFF0000L) != 0 && 
1179bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert maxLimit >= lastCEI.highIndex_ && nextCEI.highIndex_ > maxLimit && 1180bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert (nfd_.hasBoundaryBefore(codePointAt(targetText, maxLimit)) || 1181bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert nfd_.hasBoundaryAfter(codePointBefore(targetText, maxLimit))); 1182bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert 1183bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // If those conditions are met, then: 1184bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // * do NOT advance the candidate match limit (mLimit) to a break boundary; however 1185bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // the match limit may be backed off to a previous break boundary. This handles 1186bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // cases in which mLimit includes target characters that are ignorable with current 1187bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // settings (such as space) and which extend beyond the pattern match. 1188bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // * do NOT require that end of the combining sequence not extend beyond the match in CE space 1189bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // * do NOT require that match limit be on a breakIter boundary 1190bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert 11917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Advance the match end position to the first acceptable match boundary. 11927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // This advances the index over any combining characters. 
11937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert mLimit = maxLimit; 11947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (minLimit < maxLimit) { 11957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // When the last CE's low index is same with its high index, the CE is likely 11967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // a part of expansion. In this case, the index is located just after the 11977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // character corresponding to the CEs compared above. If the index is right 11987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // at the break boundary, move the position to the next boundary will result 11997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // incorrect match length when there are ignorable characters exist between 12007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // the position and the next character produces CE(s). See ticket#8482. 12017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (minLimit == lastCEI.highIndex_ && isBreakBoundary(minLimit)) { 12027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert mLimit = minLimit; 12037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 12047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int nba = nextBoundaryAfter(minLimit); 1205bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // Note that we can have nba < maxLimit && nba >= minLImit, in which 1206bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // case we want to set mLimit to nba regardless of allowMidclusterMatch 1207bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // (i.e. we back off mLimit to the previous breakIterator boundary). 
1208bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert if (nba >= lastCEI.highIndex_ && (!allowMidclusterMatch || nba < maxLimit)) { 12097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert mLimit = nba; 12107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 12117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 12127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 12137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1214bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert if (!allowMidclusterMatch) { 1215bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // If advancing to the end of a combining sequence in character indexing space 1216bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // advanced us beyond the end of the match in CE space, reject this match. 1217bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert if (mLimit > maxLimit) { 1218bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert found = false; 1219bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 12207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1221bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert if (!isBreakBoundary(mLimit)) { 1222bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert found = false; 1223bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 12247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 12257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 12267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (!checkIdentical(mStart, mLimit)) { 12277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert found = false; 12287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 12297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 12307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (found) { 12317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 12327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 
12337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 12347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 12357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // All Done. Store back the match bounds to the caller. 12367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 12377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (found == false) { 12387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert mLimit = -1; 12397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert mStart = -1; 12407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 12417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 12427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (m != null) { 12437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert m.start_ = mStart; 12447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert m.limit_ = mLimit; 12457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 12467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 12477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return found; 12487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 12497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** Returns the code point that starts at {@code index} in {@code iter}: the code unit returned by {@code iter.setIndex(index)}, combined with the following unit via {@code Character.toCodePoint} when the two form a high/low surrogate pair; an unpaired unit is returned as-is. The iterator's original index is saved on entry and restored before returning. NOTE(review): per the java.text.CharacterIterator contract, setIndex()/next() presumably yield DONE at the end of the text, and that value would be returned unchanged - confirm against callers. */ 1250bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert private static int codePointAt(CharacterIterator iter, int index) { 1251bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert int currentIterIndex = iter.getIndex(); 1252bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert char codeUnit = iter.setIndex(index); 1253bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert int cp = codeUnit; 1254bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert if (Character.isHighSurrogate(codeUnit)) { 1255bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert char nextUnit = iter.next(); 1256bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert if (Character.isLowSurrogate(nextUnit)) { 1257bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert cp = Character.toCodePoint(codeUnit, 
nextUnit); 1258bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 1259bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 1260bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert iter.setIndex(currentIterIndex); // restore iter position 1261bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert return cp; 1262bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 1263bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert /** Returns the code point that ends just before {@code index} in {@code iter}: the code unit returned by {@code iter.previous()} after {@code iter.setIndex(index)}, combined with the unit preceding it via {@code Character.toCodePoint} when the two form a high/low surrogate pair; an unpaired unit is returned as-is. The iterator's original index is saved on entry and restored before returning. NOTE(review): previous() presumably yields CharacterIterator.DONE when {@code index} is the begin index, and that value would be returned unchanged - confirm against callers. */ 1264bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert private static int codePointBefore(CharacterIterator iter, int index) { 1265bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert int currentIterIndex = iter.getIndex(); 1266bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert iter.setIndex(index); 1267bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert char codeUnit = iter.previous(); 1268bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert int cp = codeUnit; 1269bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert if (Character.isLowSurrogate(codeUnit)) { 1270bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert char prevUnit = iter.previous(); 1271bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert if (Character.isHighSurrogate(prevUnit)) { 1272bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert cp = Character.toCodePoint(prevUnit, codeUnit); 1273bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 1274bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 1275bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert iter.setIndex(currentIterIndex); // restore iter position 1276bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert return cp; 1277bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 1278bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert 12797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private boolean searchBackwards(int startIdx, Match m) { 12807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert //ICU4C_TODO comment: reject search patterns 
beginning with a combining char. 12817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 12827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Input parameter sanity check. 12837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (pattern_.CELength_ == 0 12847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert || startIdx < search_.beginIndex() 12857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert || startIdx > search_.endIndex()) { 12867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new IllegalArgumentException("searchBackwards(" + startIdx + ", m) - expected position to be between " + 12877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert search_.beginIndex() + " and " + search_.endIndex()); 12887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 12897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 12907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (pattern_.PCE_ == null) { 12917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert initializePatternPCETable(); 12927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 12937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 12947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CEBuffer ceb = new CEBuffer(this); 12957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int targetIx = 0; 12967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 12977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* 12987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Pre-load the buffer with the CE's for the grapheme 12997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * after our starting position so that we're sure that 13007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * we can look at the CE following the match when we 13017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * check the match boundaries. 
13027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 13037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * This will also pre-fetch the first CE that we'll 13047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * consider for the match. 13057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 13067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (startIdx < search_.endIndex()) { 13077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert BreakIterator bi = search_.internalBreakIter_; 13087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int next = bi.following(startIdx); 13097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 13107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert textIter_.setOffset(next); 13117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 13127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for (targetIx = 0; ; targetIx++) { 13137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (ceb.getPrevious(targetIx).lowIndex_ < startIdx) { 13147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 13157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 13187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert textIter_.setOffset(startIdx); 13197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 13217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CEI targetCEI = null; 13227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int patIx; 13237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert boolean found; 13247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 13257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int limitIx = targetIx; 13267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int mStart = -1; 
13277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int mLimit = -1; 13287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int minLimit; 13297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int maxLimit; 13307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 13317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Outer loop moves over match starting positions in the 13327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // target CE space. 13337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order). 13347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // But patIx is 0 at the beginning of the pattern and increases toward the end. 13357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern 13367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // and the beginning of the base text. 13377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for (targetIx = limitIx; ; targetIx++) { 13387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert found = true; 13397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer 13407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // (compared to the last CE fetched for the previous targetIx value) as we need to go 13417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // for this targetIx value, so if it is non-NULL then other ceb.getPrevious calls should be OK. 
13427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CEI lastCEI = ceb.getPrevious(targetIx); 13437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (lastCEI == null) { 13447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new ICUException("CEBuffer.getPrevious(" + targetIx + ") returned null."); 13457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Inner loop checks for a match beginning at each 13477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // position from the outer loop. 13487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int targetIxOffset = 0; 13497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for (patIx = pattern_.PCELength_ - 1; patIx >= 0; patIx--) { 13507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long patCE = pattern_.PCE_[patIx]; 13517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 13527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert targetCEI = ceb.getPrevious(targetIx + pattern_.PCELength_ - 1 - patIx + targetIxOffset); 13537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Compare CE from target string with CE from the pattern. 13547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Note that the target CE will be UCOL_NULLORDER if we reach the end of input, 13557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // which will fail the compare, below. 
13567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int ceMatch = compareCE64s(targetCEI.ce_, patCE, search_.elementComparisonType_); 13577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (ceMatch == CE_NO_MATCH) { 13587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert found = false; 13597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 13607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if (ceMatch > CE_NO_MATCH) { 13617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (ceMatch == CE_SKIP_TARG) { 13627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // redo with same patCE, next targCE 13637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert patIx++; 13647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert targetIxOffset++; 13657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { // ceMatch == CE_SKIP_PATN 13667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // redo with same targCE, next patCE 13677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert targetIxOffset--; 13687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 13727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (!found && ((targetCEI == null) || (targetCEI.ce_ != CollationPCE.PROCESSED_NULLORDER))) { 13737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // No match at this targetIx. Try again at the next. 13747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue; 13757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 13777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (!found) { 13787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // No match at all, we have run off the end of the target text. 
13797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 13807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 13827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // We have found a match in CE space. 13837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Now determine the bounds in string index space. 13847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // There still is a chance of match failure if the CE range not correspond to 13857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // an acceptable character range. 13867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 13877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CEI firstCEI = ceb.getPrevious(targetIx + pattern_.PCELength_ - 1 + targetIxOffset); 13887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert mStart = firstCEI.lowIndex_; 13897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 13907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Check for the start of the match being within a combining sequence. 13917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // This can happen if the pattern itself begins with a combining char, and 13927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // the match found combining marks in the target text that were attached 13937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // to something else. 13947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // This type of match should be rejected for not completely consuming a 13957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // combining sequence. 
13967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (!isBreakBoundary(mStart)) { 13977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert found = false; 13987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 14007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Look at the high index of the first CE in the match. If it's the same as the 14017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // low index, the first CE in the match is in the middle of an expansion. 14027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (mStart == firstCEI.highIndex_) { 14037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert found = false; 14047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 14067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert minLimit = lastCEI.lowIndex_; 14077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 14087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (targetIx > 0) { 14097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Look at the CE following the match. If it is UCOL_NULLORDER the match 14107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // extended to the end of input, and the match is good. 14117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 14127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Look at the high and low indices of the CE following the match. If 14137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // they are the same it means one of two things: 14147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 1. The match extended to the last CE from the target text, which is OK, or 14157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 2. 
The last CE that was part of the match is in an expansion that extends 14167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // to the first CE after the match. In this case, we reject the match. 14177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CEI nextCEI = ceb.getPrevious(targetIx - 1); 14187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 14197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (nextCEI.lowIndex_ == nextCEI.highIndex_ && nextCEI.ce_ != CollationPCE.PROCESSED_NULLORDER) { 14207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert found = false; 14217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 14237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert mLimit = maxLimit = nextCEI.lowIndex_; 14247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1425bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // Allow matches to end in the middle of a grapheme cluster if the following 1426bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // conditions are met; this is needed to make prefix search work properly in 1427bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // Indic, see #11750 1428bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // * the default breakIter is being used 1429bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // * the next collation element after this combining sequence 1430bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // - has non-zero primary weight 1431bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // - corresponds to a separate character following the one at end of the current match 1432bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // (the second of these conditions, and perhaps both, may be redundant given the 1433bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // subsequent check for normalization boundary; however they are likely much faster 
1434bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // tests in any case) 1435bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // * the match limit is a normalization boundary 1436bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert boolean allowMidclusterMatch = 1437bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert breakIterator == null && 14382d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert (((nextCEI.ce_) >>> 32) & 0xFFFF0000L) != 0 && 1439bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert maxLimit >= lastCEI.highIndex_ && nextCEI.highIndex_ > maxLimit && 1440bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert (nfd_.hasBoundaryBefore(codePointAt(targetText, maxLimit)) || 1441bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert nfd_.hasBoundaryAfter(codePointBefore(targetText, maxLimit))); 1442bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert 1443bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // If those conditions are met, then: 1444bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // * do NOT advance the candidate match limit (mLimit) to a break boundary; however 1445bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // the match limit may be backed off to a previous break boundary. This handles 1446bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // cases in which mLimit includes target characters that are ignorable with current 1447bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // settings (such as space) and which extend beyond the pattern match. 
1448bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // * do NOT require that end of the combining sequence not extend beyond the match in CE space 1449bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // * do NOT require that match limit be on a breakIter boundary 1450bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert 14517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Advance the match end position to the first acceptable match boundary. 14527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // This advances the index over any combining charcters. 14537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (minLimit < maxLimit) { 14547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int nba = nextBoundaryAfter(minLimit); 1455bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // Note that we can have nba < maxLimit && nba >= minLImit, in which 1456bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // case we want to set mLimit to nba regardless of allowMidclusterMatch 1457bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // (i.e. we back off mLimit to the previous breakIterator boundary). 1458bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert if (nba >= lastCEI.highIndex_ && (!allowMidclusterMatch || nba < maxLimit)) { 14597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert mLimit = nba; 14607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1463bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert if (!allowMidclusterMatch) { 1464bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // If advancing to the end of a combining sequence in character indexing space 1465bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // advanced us beyond the end of the match in CE space, reject this match. 
1466bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert if (mLimit > maxLimit) { 1467bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert found = false; 1468bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 14697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1470bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // Make sure the end of the match is on a break boundary 1471bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert if (!isBreakBoundary(mLimit)) { 1472bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert found = false; 1473bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 14747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 14767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 14777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // No non-ignorable CEs after this point. 14787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The maximum position is detected by boundary after 14797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // the last non-ignorable CE. Combining sequence 14807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // across the start index will be truncated. 14817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int nba = nextBoundaryAfter(minLimit); 14827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert mLimit = maxLimit = (nba > 0) && (startIdx > nba) ? 
nba : startIdx; 14837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 14857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (!checkIdentical(mStart, mLimit)) { 14867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert found = false; 14877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 14897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (found) { 14907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 14917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 14947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // All Done. Store back the match bounds to the caller. 14957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 14967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (found == false) { 14977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert mLimit = -1; 14987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert mStart = -1; 14997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 15007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 15017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (m != null) { 15027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert m.start_ = mStart; 15037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert m.limit_ = mLimit; 15047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 15057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 15067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return found; 15077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 15087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 15097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Java porting note: 15107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 
15117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // ICU4C usearch_handleNextExact() is identical to usearch_handleNextCanonical() 15127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // for the linear search implementation. The differences are addressed in search(). 15137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 15147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private boolean handleNextExact() { 15157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return handleNextCommonImpl(); 15167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 15177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 15187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private boolean handleNextCanonical() { 15197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return handleNextCommonImpl(); 15207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 15217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 15227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private boolean handleNextCommonImpl() { 15237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int textOffset = textIter_.getOffset(); 15247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Match match = new Match(); 15257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 15267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (search(textOffset, match)) { 15277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert search_.matchedIndex_ = match.start_; 15287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert search_.setMatchedLength(match.limit_ - match.start_); 15297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return true; 15307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 15317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert setMatchNotFound(); 15327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return false; 15337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 
15347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 15357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 15367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Java porting note: 15377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 15387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // ICU4C usearch_handlePreviousExact() is identical to usearch_handlePreviousCanonical() 15397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // for the linear search implementation. The differences are addressed in searchBackwards(). 15407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 15417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private boolean handlePreviousExact() { 15427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return handlePreviousCommonImpl(); 15437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 15447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 15457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private boolean handlePreviousCanonical() { 15467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return handlePreviousCommonImpl(); 15477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 15487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 15497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private boolean handlePreviousCommonImpl() { 15507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int textOffset; 15517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 15527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (search_.isOverlap_) { 15537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (search_.matchedIndex_ != DONE) { 15547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert textOffset = search_.matchedIndex_ + search_.matchedLength() - 1; 15557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 15567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // move the start position at the end of possible 
match 15577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert initializePatternPCETable(); 15587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (!initTextProcessedIter()) { 15597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert setMatchNotFound(); 15607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return false; 15617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 15627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for (int nPCEs = 0; nPCEs < pattern_.PCELength_ - 1; nPCEs++) { 15637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long pce = textProcessedIter_.nextProcessed(null); 15647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (pce == CollationPCE.PROCESSED_NULLORDER) { 15657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // at the end of the text 15667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 15677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 15687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 15697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert textOffset = textIter_.getOffset(); 15707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 15717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 15727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert textOffset = textIter_.getOffset(); 15737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 15747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 15757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Match match = new Match(); 15767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (searchBackwards(textOffset, match)) { 15777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert search_.matchedIndex_ = match.start_; 15787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert search_.setMatchedLength(match.limit_ - match.start_); 15797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return true; 15807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik 
Roubert } else { 15817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert setMatchNotFound(); 15827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return false; 15837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 15847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 15857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 15867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 15877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Gets a substring out of a CharacterIterator 15887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 15897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Java porting note: Not available in ICU4C 15907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 15917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param text CharacterIterator 15927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param start start offset 15937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param length of substring 15947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return substring from text starting at start and length length 15957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 15967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final String getString(CharacterIterator text, int start, int length) { 15977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert StringBuilder result = new StringBuilder(length); 15987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int offset = text.getIndex(); 15997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert text.setIndex(start); 16007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for (int i = 0; i < length; i++) { 16017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result.append(text.current()); 16027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert text.next(); 16037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 
16047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert text.setIndex(offset); 16057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return result.toString(); 16067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 16077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 16087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 16097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Java port of ICU4C struct UPattern (usrchimp.h) 16107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 16117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final class Pattern { 16127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** Pattern string */ 16137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String text_; 16147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 16157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long[] PCE_; 16167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int PCELength_ = 0; 16177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 16187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // TODO: We probably do not need CE_ / CELength_ 16197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert @SuppressWarnings("unused") 16207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int[] CE_; 16217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int CELength_ = 0; 16227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 16237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // *** Boyer-Moore *** 16247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // boolean hasPrefixAccents_ = false; 16257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // boolean hasSuffixAccents_ = false; 16267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // int defaultShiftSize_; 16277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // char[] shift_; 16287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // char[] backShift_; 
16297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 16307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert protected Pattern(String pattern) { 16317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert text_ = pattern; 16327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 16337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 16347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 16357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 16367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Java port of ICU4C UCollationPCE (usrchimp.h) 16377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 16387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static class CollationPCE { 16397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final long PROCESSED_NULLORDER = -1; 16407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 16417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final int DEFAULT_BUFFER_SIZE = 16; 16427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final int BUFFER_GROW = 8; 16437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 16447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Note: PRIMARYORDERMASK is also duplicated in StringSearch class 16457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final int PRIMARYORDERMASK = 0xffff0000; 16467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final int CONTINUATION_MARKER = 0xc0; 16477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 16487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private PCEBuffer pceBuffer_ = new PCEBuffer(); 16497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private CollationElementIterator cei_; 16507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int strength_; 16517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private boolean toShift_; 
16527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private boolean isShifted_; 16537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int variableTop_; 16547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 16557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public CollationPCE(CollationElementIterator iter) { 16567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert init(iter); 16577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 16587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 16597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public void init(CollationElementIterator iter) { 16607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert cei_ = iter; 16617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert init(iter.getRuleBasedCollator()); 16627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 16637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 16647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void init(RuleBasedCollator coll) { 16657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert strength_ = coll.getStrength(); 16667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert toShift_ = coll.isAlternateHandlingShifted(); 16677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert isShifted_ = false; 16687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert variableTop_ = coll.getVariableTop(); 16697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 16707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 16717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert @SuppressWarnings("fallthrough") 16727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private long processCE(int ce) { 16737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long primary = 0, secondary = 0, tertiary = 0, quaternary = 0; 16747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 16757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // This is clean, but 
somewhat slow... 16767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // We could apply the mask to ce and then 16777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // just get all three orders... 16787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert switch (strength_) { 16797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert default: 16807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert tertiary = CollationElementIterator.tertiaryOrder(ce); 16817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* note fall-through */ 16827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 16837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case Collator.SECONDARY: 16847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert secondary = CollationElementIterator.secondaryOrder(ce); 16857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* note fall-through */ 16867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 16877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case Collator.PRIMARY: 16887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert primary = CollationElementIterator.primaryOrder(ce); 16897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 16907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 16917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // **** This should probably handle continuations too. **** 16927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // **** That means that we need 24 bits for the primary **** 16937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // **** instead of the 16 that we're currently using. **** 16947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // **** So we can lay out the 64 bits as: 24.12.12.16. 
**** 16957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // **** Another complication with continuations is that **** 16967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // **** the *second* CE is marked as a continuation, so **** 16977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // **** we always have to peek ahead to know how long **** 16987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // **** the primary is... **** 16997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if ((toShift_ && variableTop_ > ce && primary != 0) || (isShifted_ && primary == 0)) { 17007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 17017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (primary == 0) { 17027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return CollationElementIterator.IGNORABLE; 17037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 17047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 17057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (strength_ >= Collator.QUATERNARY) { 17067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert quaternary = primary; 17077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 17087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 17097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert primary = secondary = tertiary = 0; 17107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert isShifted_ = true; 17117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 17127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (strength_ >= Collator.QUATERNARY) { 17137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert quaternary = 0xFFFF; 17147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 17157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 17167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert isShifted_ = false; 17177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 
17187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 17197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return primary << 48 | secondary << 32 | tertiary << 16 | quaternary; 17207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 17217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 17227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 17237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Get the processed ordering priority of the next collation element in the text. 17247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * A single character may contain more than one collation element. 17257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 17267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Note: This is equivalent to 17277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * UCollationPCE::nextProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status); 17287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 17297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param range receiving the iterator index before/after fetching the CE. 
17307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return The next collation elements ordering, otherwise returns PROCESSED_NULLORDER 17317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * if an error has occurred or if the end of string has been reached 17327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 17337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public long nextProcessed(Range range) { 17347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long result = CollationElementIterator.IGNORABLE; 17357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int low = 0, high = 0; 17367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 17377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert pceBuffer_.reset(); 17387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 17397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert do { 17407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert low = cei_.getOffset(); 17417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int ce = cei_.next(); 17427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert high = cei_.getOffset(); 17437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 17447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (ce == CollationElementIterator.NULLORDER) { 17457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result = PROCESSED_NULLORDER; 17467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 17477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 17487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 17497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result = processCE(ce); 17507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } while (result == CollationElementIterator.IGNORABLE); 17517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 17527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (range != null) { 17537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 
range.ixLow_ = low; 17547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert range.ixHigh_ = high; 17557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 17567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 17577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return result; 17587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 17597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 17607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 17617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Get the processed ordering priority of the previous collation element in the text. 17627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * A single character may contain more than one collation element. 17637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 17647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Note: This is equivalent to 17657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * UCollationPCE::previousProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status); 17667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 17677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param range receiving the iterator index before/after fetching the CE. 17687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return The previous collation elements ordering, otherwise returns 17697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * PROCESSED_NULLORDER if an error has occurred or if the start of 17707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * string has been reached. 
17717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 17727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public long previousProcessed(Range range) { 17737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long result = CollationElementIterator.IGNORABLE; 17747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int low = 0, high = 0; 17757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 17767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // pceBuffer_.reset(); 17777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 17787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while (pceBuffer_.empty()) { 17797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // buffer raw CEs up to non-ignorable primary 17807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert RCEBuffer rceb = new RCEBuffer(); 17817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int ce; 17827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 17837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert boolean finish = false; 17847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 17857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // **** do we need to reset rceb, or will it always be empty at this point **** 17867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert do { 17877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert high = cei_.getOffset(); 17887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce = cei_.previous(); 17897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert low = cei_.getOffset(); 17907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 17917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (ce == CollationElementIterator.NULLORDER) { 17927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (!rceb.empty()) { 17937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 17947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 
17957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 17967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert finish = true; 17977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 17987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 17997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 18007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert rceb.put(ce, low, high); 18017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } while ((ce & PRIMARYORDERMASK) == 0 || isContinuation(ce)); 18027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 18037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (finish) { 18047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 18057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 18067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 18077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // process the raw CEs 18087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while (!rceb.empty()) { 18097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert RCEI rcei = rceb.get(); 18107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 18117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result = processCE(rcei.ce_); 18127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 18137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (result != CollationElementIterator.IGNORABLE) { 18147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert pceBuffer_.put(result, rcei.low_, rcei.high_); 18157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 18167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 18177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 18187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 18197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (pceBuffer_.empty()) { 18207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // **** Is -1 the right value for ixLow, ixHigh? 
**** 18217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (range != null) { 18227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert range.ixLow_ = -1; 18237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert range.ixHigh_ = -1; 18247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 18257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return CollationElementIterator.NULLORDER; 18267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 18277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 18287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert PCEI pcei = pceBuffer_.get(); 18297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 18307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (range != null) { 18317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert range.ixLow_ = pcei.low_; 18327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert range.ixHigh_ = pcei.high_; 18337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 18347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 18357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return pcei.ce_; 18367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 18377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 18387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static boolean isContinuation(int ce) { 18397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return ((ce & CONTINUATION_MARKER) == CONTINUATION_MARKER); 18407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 18417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 18427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final class Range { 18437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int ixLow_; 18447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int ixHigh_; 18457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 18467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 
/** Processed collation element buffer stuff ported from ICU4C ucoleitr.cpp */
        private static final class PCEI {
            /** Processed (64-bit) collation element. */
            long ce_;
            /** Low offset stored with the element. */
            int low_;
            /** High offset stored with the element. */
            int high_;
        }

        /**
         * Growable last-in first-out store of processed collation elements.
         */
        private static final class PCEBuffer {
            private PCEI[] buffer_ = new PCEI[DEFAULT_BUFFER_SIZE];
            private int bufferIndex_ = 0;

            /** Discards all stored entries by rewinding the fill index. */
            void reset() {
                bufferIndex_ = 0;
            }

            /** Returns true when no entry is available to get(). */
            boolean empty() {
                return bufferIndex_ <= 0;
            }

            /**
             * Appends an entry, growing the backing array by BUFFER_GROW slots
             * when it is full.
             */
            void put(long ce, int ixLow, int ixHigh) {
                if (bufferIndex_ >= buffer_.length) {
                    PCEI[] grown = new PCEI[buffer_.length + BUFFER_GROW];
                    System.arraycopy(buffer_, 0, grown, 0, buffer_.length);
                    buffer_ = grown;
                }

                PCEI entry = new PCEI();
                entry.ce_ = ce;
                entry.low_ = ixLow;
                entry.high_ = ixHigh;

                buffer_[bufferIndex_] = entry;
                bufferIndex_ += 1;
            }

            /** Removes and returns the most recently stored entry, or null if empty. */
            PCEI get() {
                if (bufferIndex_ > 0) {
                    return buffer_[--bufferIndex_];
                }
                return null;
            }
        }

        /** Raw collation element buffer stuff ported from ICU4C ucoleitr.cpp */
        private static final class RCEI {
            /** Raw (32-bit) collation element. */
            int ce_;
            /** Low offset stored with the element. */
            int low_;
            /** High offset stored with the element. */
            int high_;
        }

        /**
         * Growable last-in first-out store of raw collation elements.
         */
        private static final class RCEBuffer {
            private RCEI[] buffer_ = new RCEI[DEFAULT_BUFFER_SIZE];
            private int bufferIndex_ = 0;

            /** Returns true when no entry is available to get(). */
            boolean empty() {
                return bufferIndex_ <= 0;
            }

            /**
             * Appends an entry, growing the backing array by BUFFER_GROW slots
             * when it is full.
             */
            void put(int ce, int ixLow, int ixHigh) {
                if (bufferIndex_ >= buffer_.length) {
                    RCEI[] grown = new RCEI[buffer_.length + BUFFER_GROW];
                    System.arraycopy(buffer_, 0, grown, 0, buffer_.length);
                    buffer_ = grown;
                }

                RCEI entry = new RCEI();
                entry.ce_ = ce;
                entry.low_ = ixLow;
                entry.high_ = ixHigh;

                buffer_[bufferIndex_] = entry;
                bufferIndex_ += 1;
            }

            /** Removes and returns the most recently stored entry, or null if empty. */
            RCEI get() {
                if (bufferIndex_ > 0) {
                    return buffer_[--bufferIndex_];
                }
                return null;
            }
        }
    }

    /**
     * Java port of ICU4C CEI (usearch.cpp)
     *
     * CEI  Collation Element + source text index.
     *      These structs are kept in the circular buffer.
     */
    private static class CEI {
        long ce_;
        int lowIndex_;
        int highIndex_;
    }

    /**
     * CEBuffer   A circular buffer of CEs from the text being searched
     */
    private static class CEBuffer {
        // Java porting note: ICU4C uses the size for stack buffer
        // static final int DEFAULT_CEBUFFER_SIZE = 96;

        static final int CEBUFFER_EXTRA = 32;
        static final int MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L = 8;
        static final int MAX_TARGET_IGNORABLES_PER_PAT_OTHER = 3;

        CEI[] buf_;
        int bufSize_;
        int firstIx_;
        int limitIx_;

        // Java porting note: No references in ICU4C implementation
        // CollationElementIterator ceIter_;

        StringSearch strSearch_;

        /**
         * Sizes the buffer from the pattern and prepares the text iterator.
         *
         * The capacity starts at the pattern's PCE count plus CEBUFFER_EXTRA. When
         * the element comparison type is not the standard one, extra slots are
         * reserved for every pattern char: 8 for a possible Jamo L, 3 otherwise.
         * If initTextProcessedIter() reports failure, buf_ is left unallocated.
         *
         * @param ss the search object that owns the pattern and the text iterator
         */
        CEBuffer(StringSearch ss) {
            strSearch_ = ss;
            bufSize_ = ss.pattern_.PCELength_ + CEBUFFER_EXTRA;
            if (ss.search_.elementComparisonType_ != ElementComparisonType.STANDARD_ELEMENT_COMPARISON) {
                String patText = ss.pattern_.text_;
                if (patText != null) {
                    for (int i = 0; i < patText.length(); i++) {
                        // No check for surrogates, we might allocate slightly more buffer than necessary.
                        bufSize_ += MIGHT_BE_JAMO_L(patText.charAt(i))
                                ? MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L
                                : MAX_TARGET_IGNORABLES_PER_PAT_OTHER;
                    }
                }
            }

            // Not used - see above
            // ceIter_ = ss.textIter_;

            firstIx_ = 0;
            limitIx_ = 0;

            if (!ss.initTextProcessedIter()) {
                return;
            }

            buf_ = new CEI[bufSize_];
        }

        /**
         * Get the CE with the specified index, reading forward through the text
         * when the entry has not been fetched yet.
         *   Index must be in the range
         *             n-history_size < index < n+1
         *   where n is the largest index to have been fetched by some previous call.
         *   The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
         *
         * @param index logical index of the requested CE
         * @return the buffered entry, or null when the index is out of sequence
         */
        CEI get(int index) {
            return fetch(index, true);
        }

        /**
         * Get the CE with the specified index, reading backward through the text
         * when the entry has not been fetched yet.
         *   Index must be in the range
         *             n-history_size < index < n+1
         *   where n is the largest index to have been fetched by some previous call.
         *   The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
         *
         * @param index logical index of the requested CE
         * @return the buffered entry, or null when the index is out of sequence
         */
        CEI getPrevious(int index) {
            return fetch(index, false);
        }

        /**
         * Shared implementation of get() and getPrevious(): returns the entry when
         * it is already buffered, otherwise pulls exactly one new processed CE
         * from the text iterator in the requested direction and stores it.
         *
         * @param index   logical index of the requested CE
         * @param forward true to read with nextProcessed(), false to read with
         *                previousProcessed()
         * @return the buffered entry, or null when the index is neither buffered
         *         nor the next one in sequence
         */
        private CEI fetch(int index, boolean forward) {
            int slot = index % bufSize_;

            if (index >= firstIx_ && index < limitIx_) {
                // The request was for an entry already in our buffer.
                // Just return it.
                return buf_[slot];
            }

            // Caller is requesting a new, never accessed before, CE.
            // Verify that it is the next one in sequence, which is all
            // that is allowed.
            if (index != limitIx_) {
                assert(false);
                return null;
            }

            // Manage the circular CE buffer indexing
            limitIx_++;

            if (limitIx_ - firstIx_ >= bufSize_) {
                // The buffer is full, knock out the lowest-indexed entry.
                firstIx_++;
            }

            CollationPCE.Range range = new CollationPCE.Range();
            // Slots are filled lazily and then reused on later laps around the ring.
            if (buf_[slot] == null) {
                buf_[slot] = new CEI();
            }

            CEI cei = buf_[slot];
            if (forward) {
                cei.ce_ = strSearch_.textProcessedIter_.nextProcessed(range);
            } else {
                cei.ce_ = strSearch_.textProcessedIter_.previousProcessed(range);
            }
            cei.lowIndex_ = range.ixLow_;
            cei.highIndex_ = range.ixHigh_;

            return cei;
        }

        /**
         * Returns true when c lies in one of the three code point ranges
         * U+1100..U+115E, U+3131..U+314E or U+3165..U+3186.
         */
        static boolean MIGHT_BE_JAMO_L(char c) {
            return (c >= 0x1100 && c <= 0x115E)
                    || (c >= 0x3131 && c <= 0x314E)
                    || (c >= 0x3165 && c <= 0x3186);
        }
    }
}