1ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 2ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru****************************************************************************** 3ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* 4ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* Copyright (C) 2007, International Business Machines 5ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* Corporation and others. All Rights Reserved. 6ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* 7ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru****************************************************************************** 8ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* file name: unisetspan.cpp 9ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* encoding: US-ASCII 10ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* tab size: 8 (not used) 11ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* indentation:4 12ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* 13ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* created on: 2007mar01 14ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* created by: Markus W. Scherer 15ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*/ 16ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 17ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/utypes.h" 18ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/uniset.h" 19ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/ustring.h" 20ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "cmemory.h" 21ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "uvector.h" 22ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unisetspan.h" 23ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 24ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_BEGIN 25ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 26ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 27ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * List of offsets from the current position from where to try matching 28ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * a code point or a string. 29ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Store offsets rather than indexes to simplify the code and use the same list 30ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * for both increments (in span()) and decrements (in spanBack()). 31ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 32ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Assumption: The maximum offset is limited, and the offsets that are stored 33ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * at any one time are relatively dense, that is, there are normally no gaps of 34ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * hundreds or thousands of offset values. 35ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 36ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * The implementation uses a circular buffer of byte flags, 37ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * each indicating whether the corresponding offset is in the list. 38ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * This avoids inserting into a sorted list of offsets (or absolute indexes) and 39ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * physically moving part of the list. 40ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 41ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Note: In principle, the caller should setMaxLength() to the maximum of the 42ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * max string length and U16_LENGTH/U8_LENGTH to account for 43ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * "long" single code points. 44ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * However, this implementation uses at least a staticList with more than 45ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * U8_LENGTH entries anyway. 46ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 47ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Note: If maxLength were guaranteed to be no more than 32 or 64, 48ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * the list could be stored as bit flags in a single integer. 49ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Rather than handling a circular buffer with a start list index, 50ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * the integer would simply be shifted when lower offsets are removed. 51ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * UnicodeSet does not have a limit on the lengths of strings. 52ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 53ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass OffsetList { // Only ever stack-allocated, does not need to inherit UMemory. 54ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querupublic: 55ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru OffsetList() : list(staticList), capacity(0), length(0), start(0) {} 56ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 57ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ~OffsetList() { 58ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(list!=staticList) { 59ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uprv_free(list); 60ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 61ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 62ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 63ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Call exactly once if the list is to be used. 64ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru void setMaxLength(int32_t maxLength) { 65ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(maxLength<=(int32_t)sizeof(staticList)) { 66ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru capacity=(int32_t)sizeof(staticList); 67ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 68ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool *l=(UBool *)uprv_malloc(maxLength); 69ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(l!=NULL) { 70ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru list=l; 71ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru capacity=maxLength; 72ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 73ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 74ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uprv_memset(list, 0, capacity); 75ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 76ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 77ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru void clear() { 78ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uprv_memset(list, 0, capacity); 79ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru start=length=0; 80ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 81ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 82ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool isEmpty() const { 83ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return (UBool)(length==0); 84ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 85ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 86ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Reduce all stored offsets by delta, used when the current position 87ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // moves by delta. 88ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // There must not be any offsets lower than delta. 89ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // If there is an offset equal to delta, it is removed. 90ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // delta=[1..maxLength] 91ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru void shift(int32_t delta) { 92ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t i=start+delta; 93ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(i>=capacity) { 94ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru i-=capacity; 95ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 96ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(list[i]) { 97ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru list[i]=FALSE; 98ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru --length; 99ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 100ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru start=i; 101ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 102ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 103ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Add an offset. The list must not contain it yet. 104ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // offset=[1..maxLength] 105ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru void addOffset(int32_t offset) { 106ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t i=start+offset; 107ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(i>=capacity) { 108ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru i-=capacity; 109ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 110ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru list[i]=TRUE; 111ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ++length; 112ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 113ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 114ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // offset=[1..maxLength] 115ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool containsOffset(int32_t offset) const { 116ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t i=start+offset; 117ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(i>=capacity) { 118ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru i-=capacity; 119ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 120ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return list[i]; 121ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 122ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 123ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Find the lowest stored offset from a non-empty list, remove it, 124ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // and reduce all other offsets by this minimum. 125ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Returns [1..maxLength]. 126ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t popMinimum() { 127ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Look for the next offset in list[start+1..capacity-1]. 128ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t i=start, result; 129ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru while(++i<capacity) { 130ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(list[i]) { 131ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru list[i]=FALSE; 132ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru --length; 133ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru result=i-start; 134ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru start=i; 135ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return result; 136ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 137ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 138ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // i==capacity 139ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 140ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Wrap around and look for the next offset in list[0..start]. 141ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Since the list is not empty, there will be one. 142ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru result=capacity-start; 143ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru i=0; 144ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru while(!list[i]) { 145ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ++i; 146ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 147ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru list[i]=FALSE; 148ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru --length; 149ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru start=i; 150ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return result+=i; 151ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 152ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 153ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruprivate: 154ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool *list; 155ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t capacity; 156ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t length; 157ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t start; 158ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 159ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool staticList[16]; 160ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}; 161ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 162ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Get the number of UTF-8 bytes for a UTF-16 (sub)string. 163ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic int32_t 164ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QuerugetUTF8Length(const UChar *s, int32_t length) { 165ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UErrorCode errorCode=U_ZERO_ERROR; 166ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t length8=0; 167ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru u_strToUTF8(NULL, 0, &length8, s, length, &errorCode); 168ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) { 169ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return length8; 170ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 171ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // The string contains an unpaired surrogate. 172ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Ignore this string. 173ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; 174ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 175ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 176ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 177ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Append the UTF-8 version of the string to t and return the appended UTF-8 length. 178ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic int32_t 179ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruappendUTF8(const UChar *s, int32_t length, uint8_t *t, int32_t capacity) { 180ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UErrorCode errorCode=U_ZERO_ERROR; 181ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t length8=0; 182ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru u_strToUTF8((char *)t, capacity, &length8, s, length, &errorCode); 183ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(U_SUCCESS(errorCode)) { 184ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return length8; 185ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 186ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // The string contains an unpaired surrogate. 187ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Ignore this string. 188ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; 189ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 190ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 191ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 192ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic inline uint8_t 193ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QuerumakeSpanLengthByte(int32_t spanLength) { 194ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // 0xfe==UnicodeSetStringSpan::LONG_SPAN 195ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe; 196ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 197ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 198ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Construct for all variants of span(), or only for any one variant. 199ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Initialize as little as possible, for single use. 200ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set, 201ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const UVector &setStrings, 202ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint32_t which) 203ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru : spanSet(0, 0x10ffff), pSpanNotSet(NULL), strings(setStrings), 204ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utf8Lengths(NULL), spanLengths(NULL), utf8(NULL), 205ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utf8Length(0), 206ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru maxLength16(0), maxLength8(0), 207ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru all((UBool)(which==ALL)) { 208ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanSet.retainAll(set); 209ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(which&NOT_CONTAINED) { 210ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Default to the same sets. 211ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // addToSpanNotSet() will create a separate set if necessary. 212ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pSpanNotSet=&spanSet; 213ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 214ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 215ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Determine if the strings even need to be taken into account at all for span() etc. 216ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // If any string is relevant, then all strings need to be used for 217ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // span(longest match) but only the relevant ones for span(while contained). 218ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH 219ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // and do not store UTF-8 strings if !thisRelevant and CONTAINED. 220ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.) 221ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Also count the lengths of the UTF-8 versions of the strings for memory allocation. 222ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t stringsLength=strings.size(); 223ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 224ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t i, spanLength; 225ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool someRelevant=FALSE; 226ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(i=0; i<stringsLength; ++i) { 227ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 228ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const UChar *s16=string.getBuffer(); 229ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t length16=string.length(); 230ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool thisRelevant; 231ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED); 232ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanLength<length16) { // Relevant string. 233ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru someRelevant=thisRelevant=TRUE; 234ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 235ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru thisRelevant=FALSE; 236ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 237ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if((which&UTF16) && length16>maxLength16) { 238ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru maxLength16=length16; 239ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 240ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if((which&UTF8) && (thisRelevant || (which&CONTAINED))) { 241ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t length8=getUTF8Length(s16, length16); 242ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utf8Length+=length8; 243ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(length8>maxLength8) { 244ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru maxLength8=length8; 245ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 246ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 247ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 248ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(!someRelevant) { 249ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru maxLength16=maxLength8=0; 250ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return; 251ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 252ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 253ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Freeze after checking for the need to use strings at all because freezing 254ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // a set takes some time and memory which are wasted if there are no relevant strings. 255ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(all) { 256ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanSet.freeze(); 257ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 258ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 259ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint8_t *spanBackLengths; 260ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint8_t *spanUTF8Lengths; 261ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint8_t *spanBackUTF8Lengths; 262ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 263ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Allocate a block of meta data. 264ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t allocSize; 265ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(all) { 266ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings. 267ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru allocSize=stringsLength*(4+1+1+1+1)+utf8Length; 268ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 269ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru allocSize=stringsLength; // One set of span lengths. 270ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(which&UTF8) { 271ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // UTF-8 lengths and UTF-8 strings. 272ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru allocSize+=stringsLength*4+utf8Length; 273ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 274ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 275ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(allocSize<=(int32_t)sizeof(staticLengths)) { 276ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utf8Lengths=staticLengths; 277ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 278ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utf8Lengths=(int32_t *)uprv_malloc(allocSize); 279ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(utf8Lengths==NULL) { 280ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE. 281ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return; // Out of memory. 282ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 283ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 284ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 285ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(all) { 286ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Store span lengths for all span() variants. 287ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLengths=(uint8_t *)(utf8Lengths+stringsLength); 288ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanBackLengths=spanLengths+stringsLength; 289ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanUTF8Lengths=spanBackLengths+stringsLength; 290ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanBackUTF8Lengths=spanUTF8Lengths+stringsLength; 291ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utf8=spanBackUTF8Lengths+stringsLength; 292ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 293ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Store span lengths for only one span() variant. 294ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(which&UTF8) { 295ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLengths=(uint8_t *)(utf8Lengths+stringsLength); 296ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utf8=spanLengths+stringsLength; 297ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 298ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLengths=(uint8_t *)utf8Lengths; 299ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 300ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths; 301ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 302ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 303ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Set the meta data and pSpanNotSet and write the UTF-8 strings. 304ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t utf8Count=0; // Count UTF-8 bytes written so far. 305ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 306ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(i=0; i<stringsLength; ++i) { 307ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 308ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const UChar *s16=string.getBuffer(); 309ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t length16=string.length(); 310ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED); 311ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanLength<length16) { // Relevant string. 312ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(which&UTF16) { 313ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(which&CONTAINED) { 314ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(which&FWD) { 315ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLengths[i]=makeSpanLengthByte(spanLength); 316ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 317ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(which&BACK) { 318ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED); 319ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanBackLengths[i]=makeSpanLengthByte(spanLength); 320ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 321ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else /* not CONTAINED, not all, but NOT_CONTAINED */ { 322ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLengths[i]=spanBackLengths[i]=0; // Only store a relevant/irrelevant flag. 323ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 324ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 325ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(which&UTF8) { 326ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint8_t *s8=utf8+utf8Count; 327ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count); 328ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utf8Count+=utf8Lengths[i]=length8; 329ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(length8==0) { // Irrelevant for UTF-8 because not representable in UTF-8. 330ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CONTAINED; 331ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { // Relevant for UTF-8. 332ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(which&CONTAINED) { 333ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(which&FWD) { 334ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength=spanSet.spanUTF8((const char *)s8, length8, USET_SPAN_CONTAINED); 335ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanUTF8Lengths[i]=makeSpanLengthByte(spanLength); 336ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 337ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(which&BACK) { 338ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED); 339ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength); 340ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 341ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else /* not CONTAINED, not all, but NOT_CONTAINED */ { 342ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0; // Only store a relevant/irrelevant flag. 343ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 344ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 345ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 346ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(which&NOT_CONTAINED) { 347ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Add string start and end code points to the spanNotSet so that 348ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // a span(while not contained) stops before any string. 349ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UChar32 c; 350ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(which&FWD) { 351ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t len=0; 352ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru U16_NEXT(s16, len, length16, c); 353ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru addToSpanNotSet(c); 354ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 355ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(which&BACK) { 356ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t len=length16; 357ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru U16_PREV(s16, 0, len, c); 358ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru addToSpanNotSet(c); 359ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 360ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 361ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { // Irrelevant string. 362ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(which&UTF8) { 363ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(which&CONTAINED) { // Only necessary for LONGEST_MATCH. 364ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint8_t *s8=utf8+utf8Count; 365ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count); 366ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utf8Count+=utf8Lengths[i]=length8; 367ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 368ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utf8Lengths[i]=0; 369ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 370ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 371ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(all) { 372ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLengths[i]=spanBackLengths[i]= 373ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanUTF8Lengths[i]=spanBackUTF8Lengths[i]= 374ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru (uint8_t)ALL_CP_CONTAINED; 375ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 376ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // All spanXYZLengths pointers contain the same address. 377ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLengths[i]=(uint8_t)ALL_CP_CONTAINED; 378ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 379ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 380ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 381ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 382ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Finish. 383ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(all) { 384ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pSpanNotSet->freeze(); 385ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 386ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 387ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 388ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Copy constructor. Assumes which==ALL for a frozen set. 389ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan, 390ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const UVector &newParentSetStrings) 391ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru : spanSet(otherStringSpan.spanSet), pSpanNotSet(NULL), strings(newParentSetStrings), 392ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utf8Lengths(NULL), spanLengths(NULL), utf8(NULL), 393ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utf8Length(otherStringSpan.utf8Length), 394ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8), 395ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru all(TRUE) { 396ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) { 397ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pSpanNotSet=&spanSet; 398ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 399ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pSpanNotSet=(UnicodeSet *)otherStringSpan.pSpanNotSet->clone(); 400ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 401ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 402ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Allocate a block of meta data. 403ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings. 404ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t stringsLength=strings.size(); 405ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length; 406ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(allocSize<=(int32_t)sizeof(staticLengths)) { 407ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utf8Lengths=staticLengths; 408ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 409ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utf8Lengths=(int32_t *)uprv_malloc(allocSize); 410ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(utf8Lengths==NULL) { 411ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE. 412ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return; // Out of memory. 413ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 414ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 415ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 416ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLengths=(uint8_t *)(utf8Lengths+stringsLength); 417ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utf8=spanLengths+stringsLength*4; 418ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize); 419ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 420ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 421ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUnicodeSetStringSpan::~UnicodeSetStringSpan() { 422ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(pSpanNotSet!=NULL && pSpanNotSet!=&spanSet) { 423ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru delete pSpanNotSet; 424ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 425ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(utf8Lengths!=NULL && utf8Lengths!=staticLengths) { 426ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uprv_free(utf8Lengths); 427ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 428ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 429ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 430ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruvoid UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) { 431ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(pSpanNotSet==NULL || pSpanNotSet==&spanSet) { 432ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanSet.contains(c)) { 433ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return; // Nothing to do. 434ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 435ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UnicodeSet *newSet=(UnicodeSet *)spanSet.cloneAsThawed(); 436ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(newSet==NULL) { 437ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return; // Out of memory. 438ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 439ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pSpanNotSet=newSet; 440ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 441ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 442ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pSpanNotSet->add(c); 443ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 444ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 445ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Compare strings without any argument checks. Requires length>0. 446ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic inline UBool 447ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querumatches16(const UChar *s, const UChar *t, int32_t length) { 448ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru do { 449ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(*s++!=*t++) { 450ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return FALSE; 451ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 452ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } while(--length>0); 453ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return TRUE; 454ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 455ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 456ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic inline UBool 457ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querumatches8(const uint8_t *s, const uint8_t *t, int32_t length) { 458ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru do { 459ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(*s++!=*t++) { 460ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return FALSE; 461ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 462ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } while(--length>0); 463ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return TRUE; 464ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 465ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 466ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Compare 16-bit Unicode strings (which may be malformed UTF-16) 467ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// at code point boundaries. 468ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// That is, each edge of a match must not be in the middle of a surrogate pair. 469ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic inline UBool 470ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querumatches16CPB(const UChar *s, int32_t start, int32_t limit, const UChar *t, int32_t length) { 471ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru s+=start; 472ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru limit-=start; 473ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return matches16(s, t, length) && 474ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) && 475ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length])); 476ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 477ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 478ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Does the set contain the next code point? 479ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// If so, return its length; otherwise return its negative length. 480ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic inline int32_t 481ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruspanOne(const UnicodeSet &set, const UChar *s, int32_t length) { 482ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UChar c=*s, c2; 483ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) { 484ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2; 485ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 486ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return set.contains(c) ? 1 : -1; 487ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 488ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 489ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic inline int32_t 490ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruspanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) { 491ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UChar c=s[length-1], c2; 492ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) { 493ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2; 494ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 495ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return set.contains(c) ? 1 : -1; 496ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 497ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 498ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic inline int32_t 499ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruspanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) { 500ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UChar32 c=*s; 501ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if((int8_t)c>=0) { 502ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return set.contains(c) ? 1 : -1; 503ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 504ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Take advantage of non-ASCII fastpaths in U8_NEXT(). 505ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t i=0; 506ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru U8_NEXT(s, i, length, c); 507ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return set.contains(c) ? i : -i; 508ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 509ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 510ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic inline int32_t 511ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruspanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) { 512ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UChar32 c=s[length-1]; 513ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if((int8_t)c>=0) { 514ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return set.contains(c) ? 1 : -1; 515ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 516ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t i=length-1; 517ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru c=utf8_prevCharSafeBody(s, 0, &i, c, -1); 518ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru length-=i; 519ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return set.contains(c) ? length : -length; 520ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 521ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 522ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 523ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Note: In span() when spanLength==0 (after a string match, or at the beginning 524ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * after an empty code point span) and in spanNot() and spanNotUTF8(), 525ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * string matching could use a binary search 526ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * because all string matches are done from the same start index. 527ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 528ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * For UTF-8, this would require a comparison function that returns UTF-16 order. 529ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 530ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * This optimization should not be necessary for normal UnicodeSets because 531ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * most sets have no strings, and most sets with strings have 532ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * very few very short strings. 533ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * For cases with many strings, it might be better to use a different API 534ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * and implementation with a DFA (state machine). 535ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 536ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 537ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 538ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Algorithm for span(USET_SPAN_CONTAINED) 539ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 540ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Theoretical algorithm: 541ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * - Iterate through the string, and at each code point boundary: 542ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * + If the code point there is in the set, then remember to continue after it. 543ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * + If a set string matches at the current position, then remember to continue after it. 544ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * + Either recursively span for each code point or string match, 545ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * or recursively span for all but the shortest one and 546ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * iteratively continue the span with the shortest local match. 547ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * + Remember the longest recursive span (the farthest end point). 548ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * + If there is no match at the current position, neither for the code point there 549ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * nor for any set string, then stop and return the longest recursive span length. 550ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 551ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Optimized implementation: 552ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 553ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * (We assume that most sets will have very few very short strings. 554ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * A span using a string-less set is extremely fast.) 555ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 556ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Create and cache a spanSet which contains all of the single code points 557ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * of the original set but none of its strings. 558ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 559ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED). 560ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * - Loop: 561ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * + Try to match each set string at the end of the spanLength. 562ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * ~ Set strings that start with set-contained code points must be matched 563ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * with a partial overlap because the recursive algorithm would have tried 564ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * to match them at every position. 565ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * ~ Set strings that entirely consist of set-contained code points 566ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * are irrelevant for span(USET_SPAN_CONTAINED) because the 567ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * recursive algorithm would continue after them anyway 568ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * and find the longest recursive match from their end. 569ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * ~ Rather than recursing, note each end point of a set string match. 570ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * + If no set string matched after spanSet.span(), then return 571ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * with where the spanSet.span() ended. 572ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * + If at least one set string matched after spanSet.span(), then 573ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * pop the shortest string match end point and continue 574ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * the loop, trying to match all set strings from there. 575ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * + If at least one more set string matched after a previous string match, 576ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * then test if the code point after the previous string match is also 577ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * contained in the set. 578ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Continue the loop with the shortest end point of either this code point 579ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * or a matching set string. 580ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * + If no more set string matched after a previous string match, 581ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED). 582ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Stop if spanLength==0, otherwise continue the loop. 583ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 584ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * By noting each end point of a set string match, 585ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * the function visits each string position at most once and finishes 586ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * in linear time. 587ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 588ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * The recursive algorithm may visit the same string position many times 589ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * if multiple paths lead to it and finishes in exponential time. 590ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 591ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 592ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 593ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Algorithm for span(USET_SPAN_SIMPLE) 594ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 595ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Theoretical algorithm: 596ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * - Iterate through the string, and at each code point boundary: 597ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * + If the code point there is in the set, then remember to continue after it. 598ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * + If a set string matches at the current position, then remember to continue after it. 599ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * + Continue from the farthest match position and ignore all others. 600ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * + If there is no match at the current position, 601ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * then stop and return the current position. 602ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 603ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Optimized implementation: 604ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 605ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * (Same assumption and spanSet as above.) 606ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 607ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED). 608ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * - Loop: 609ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * + Try to match each set string at the end of the spanLength. 610ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * ~ Set strings that start with set-contained code points must be matched 611ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * with a partial overlap because the standard algorithm would have tried 612ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * to match them earlier. 613ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * ~ Set strings that entirely consist of set-contained code points 614ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * must be matched with a full overlap because the longest-match algorithm 615ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * would hide set string matches that end earlier. 616ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Such set strings need not be matched earlier inside the code point span 617ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * because the standard algorithm would then have continued after 618ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * the set string match anyway. 619ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * ~ Remember the longest set string match (farthest end point) from the earliest 620ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * starting point. 621ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * + If no set string matched after spanSet.span(), then return 622ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * with where the spanSet.span() ended. 623ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * + If at least one set string matched, then continue the loop after the 624ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * longest match from the earliest position. 625ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * + If no more set string matched after a previous string match, 626ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED). 627ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Stop if spanLength==0, otherwise continue the loop. 628ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 629ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 630ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const { 631ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanCondition==USET_SPAN_NOT_CONTAINED) { 632ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return spanNot(s, length); 633ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 634ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED); 635ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanLength==length) { 636ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return length; 637ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 638ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 639ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Consider strings; they may overlap with the span. 640ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru OffsetList offsets; 641ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanCondition==USET_SPAN_CONTAINED) { 642ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Use offset list to try all possibilities. 643ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru offsets.setMaxLength(maxLength16); 644ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 645ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t pos=spanLength, rest=length-pos; 646ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t i, stringsLength=strings.size(); 647ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(;;) { 648ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanCondition==USET_SPAN_CONTAINED) { 649ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(i=0; i<stringsLength; ++i) { 650ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t overlap=spanLengths[i]; 651ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(overlap==ALL_CP_CONTAINED) { 652ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru continue; // Irrelevant string. 653ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 654ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 655ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const UChar *s16=string.getBuffer(); 656ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t length16=string.length(); 657ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 658ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try to match this string at pos-overlap..pos. 659ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(overlap>=LONG_SPAN) { 660ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru overlap=length16; 661ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // While contained: No point matching fully inside the code point span. 662ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru U16_BACK_1(s16, 0, overlap); // Length of the string minus the last code point. 663ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 664ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(overlap>spanLength) { 665ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru overlap=spanLength; 666ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 667ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t inc=length16-overlap; // Keep overlap+inc==length16. 668ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(;;) { 669ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(inc>rest) { 670ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 671ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 672ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try to match if the increment is not listed already. 673ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) { 674ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(inc==rest) { 675ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return length; // Reached the end of the string. 676ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 677ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru offsets.addOffset(inc); 678ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 679ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(overlap==0) { 680ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 681ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 682ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru --overlap; 683ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ++inc; 684ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 685ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 686ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else /* USET_SPAN_SIMPLE */ { 687ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t maxInc=0, maxOverlap=0; 688ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(i=0; i<stringsLength; ++i) { 689ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t overlap=spanLengths[i]; 690ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // For longest match, we do need to try to match even an all-contained string 691ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // to find the match from the earliest start. 692ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 693ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 694ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const UChar *s16=string.getBuffer(); 695ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t length16=string.length(); 696ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 697ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try to match this string at pos-overlap..pos. 698ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(overlap>=LONG_SPAN) { 699ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru overlap=length16; 700ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Longest match: Need to match fully inside the code point span 701ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // to find the match from the earliest start. 702ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 703ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(overlap>spanLength) { 704ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru overlap=spanLength; 705ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 706ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t inc=length16-overlap; // Keep overlap+inc==length16. 707ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(;;) { 708ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(inc>rest || overlap<maxOverlap) { 709ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 710ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 711ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try to match if the string is longer or starts earlier. 712ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) && 713ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru matches16CPB(s, pos-overlap, length, s16, length16) 714ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ) { 715ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru maxInc=inc; // Longest match from earliest start. 716ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru maxOverlap=overlap; 717ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 718ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 719ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru --overlap; 720ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ++inc; 721ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 722ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 723ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 724ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(maxInc!=0 || maxOverlap!=0) { 725ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Longest-match algorithm, and there was a string match. 726ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Simply continue after it. 727ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pos+=maxInc; 728ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru rest-=maxInc; 729ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(rest==0) { 730ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return length; // Reached the end of the string. 731ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 732ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength=0; // Match strings from after a string match. 733ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru continue; 734ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 735ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 736ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Finished trying to match all strings at pos. 737ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 738ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanLength!=0 || pos==0) { 739ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // The position is after an unlimited code point span (spanLength!=0), 740ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // not after a string match. 741ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // The only position where spanLength==0 after a span is pos==0. 742ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Otherwise, an unlimited code point span is only tried again when no 743ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // strings match, and if such a non-initial span fails we stop. 744ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(offsets.isEmpty()) { 745ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return pos; // No strings matched after a span. 746ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 747ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Match strings from after the next string match. 748ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 749ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // The position is after a string match (or a single code point). 750ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(offsets.isEmpty()) { 751ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // No more strings matched after a previous string match. 752ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try another code point span from after the last string match. 753ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED); 754ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if( spanLength==rest || // Reached the end of the string, or 755ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength==0 // neither strings nor span progressed. 756ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ) { 757ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return pos+spanLength; 758ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 759ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pos+=spanLength; 760ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru rest-=spanLength; 761ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru continue; // spanLength>0: Match strings from after a span. 762ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 763ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try to match only one code point from after a string match if some 764ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // string matched beyond it, so that we try all possible positions 765ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // and don't overshoot. 766ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength=spanOne(spanSet, s+pos, rest); 767ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanLength>0) { 768ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanLength==rest) { 769ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return length; // Reached the end of the string. 770ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 771ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Match strings after this code point. 772ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // There cannot be any increments below it because UnicodeSet strings 773ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // contain multiple code points. 774ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pos+=spanLength; 775ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru rest-=spanLength; 776ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru offsets.shift(spanLength); 777ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength=0; 778ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru continue; // Match strings from after a single code point. 779ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 780ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Match strings from after the next string match. 781ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 782ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 783ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t minOffset=offsets.popMinimum(); 784ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pos+=minOffset; 785ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru rest-=minOffset; 786ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength=0; // Match strings from after a string match. 787ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 788ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 789ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 790ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const { 791ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanCondition==USET_SPAN_NOT_CONTAINED) { 792ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return spanNotBack(s, length); 793ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 794ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED); 795ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(pos==0) { 796ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; 797ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 798ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t spanLength=length-pos; 799ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 800ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Consider strings; they may overlap with the span. 801ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru OffsetList offsets; 802ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanCondition==USET_SPAN_CONTAINED) { 803ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Use offset list to try all possibilities. 804ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru offsets.setMaxLength(maxLength16); 805ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 806ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t i, stringsLength=strings.size(); 807ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint8_t *spanBackLengths=spanLengths; 808ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(all) { 809ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanBackLengths+=stringsLength; 810ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 811ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(;;) { 812ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanCondition==USET_SPAN_CONTAINED) { 813ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(i=0; i<stringsLength; ++i) { 814ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t overlap=spanBackLengths[i]; 815ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(overlap==ALL_CP_CONTAINED) { 816ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru continue; // Irrelevant string. 817ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 818ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 819ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const UChar *s16=string.getBuffer(); 820ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t length16=string.length(); 821ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 822ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try to match this string at pos-(length16-overlap)..pos-length16. 823ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(overlap>=LONG_SPAN) { 824ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru overlap=length16; 825ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // While contained: No point matching fully inside the code point span. 826ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t len1=0; 827ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru U16_FWD_1(s16, len1, overlap); 828ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru overlap-=len1; // Length of the string minus the first code point. 829ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 830ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(overlap>spanLength) { 831ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru overlap=spanLength; 832ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 833ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t dec=length16-overlap; // Keep dec+overlap==length16. 834ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(;;) { 835ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(dec>pos) { 836ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 837ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 838ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try to match if the decrement is not listed already. 839ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) { 840ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(dec==pos) { 841ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; // Reached the start of the string. 842ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 843ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru offsets.addOffset(dec); 844ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 845ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(overlap==0) { 846ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 847ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 848ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru --overlap; 849ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ++dec; 850ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 851ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 852ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else /* USET_SPAN_SIMPLE */ { 853ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t maxDec=0, maxOverlap=0; 854ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(i=0; i<stringsLength; ++i) { 855ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t overlap=spanBackLengths[i]; 856ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // For longest match, we do need to try to match even an all-contained string 857ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // to find the match from the latest end. 858ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 859ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 860ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const UChar *s16=string.getBuffer(); 861ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t length16=string.length(); 862ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 863ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try to match this string at pos-(length16-overlap)..pos-length16. 864ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(overlap>=LONG_SPAN) { 865ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru overlap=length16; 866ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Longest match: Need to match fully inside the code point span 867ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // to find the match from the latest end. 868ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 869ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(overlap>spanLength) { 870ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru overlap=spanLength; 871ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 872ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t dec=length16-overlap; // Keep dec+overlap==length16. 873ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(;;) { 874ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(dec>pos || overlap<maxOverlap) { 875ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 876ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 877ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try to match if the string is longer or ends later. 878ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) && 879ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru matches16CPB(s, pos-dec, length, s16, length16) 880ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ) { 881ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru maxDec=dec; // Longest match from latest end. 882ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru maxOverlap=overlap; 883ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 884ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 885ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru --overlap; 886ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ++dec; 887ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 888ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 889ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 890ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(maxDec!=0 || maxOverlap!=0) { 891ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Longest-match algorithm, and there was a string match. 892ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Simply continue before it. 893ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pos-=maxDec; 894ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(pos==0) { 895ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; // Reached the start of the string. 896ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 897ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength=0; // Match strings from before a string match. 898ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru continue; 899ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 900ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 901ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Finished trying to match all strings at pos. 902ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 903ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanLength!=0 || pos==length) { 904ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // The position is before an unlimited code point span (spanLength!=0), 905ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // not before a string match. 906ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // The only position where spanLength==0 before a span is pos==length. 907ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Otherwise, an unlimited code point span is only tried again when no 908ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // strings match, and if such a non-initial span fails we stop. 909ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(offsets.isEmpty()) { 910ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return pos; // No strings matched before a span. 911ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 912ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Match strings from before the next string match. 913ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 914ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // The position is before a string match (or a single code point). 915ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(offsets.isEmpty()) { 916ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // No more strings matched before a previous string match. 917ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try another code point span from before the last string match. 918ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t oldPos=pos; 919ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED); 920ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength=oldPos-pos; 921ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if( pos==0 || // Reached the start of the string, or 922ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength==0 // neither strings nor span progressed. 923ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ) { 924ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return pos; 925ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 926ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru continue; // spanLength>0: Match strings from before a span. 927ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 928ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try to match only one code point from before a string match if some 929ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // string matched beyond it, so that we try all possible positions 930ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // and don't overshoot. 931ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength=spanOneBack(spanSet, s, pos); 932ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanLength>0) { 933ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanLength==pos) { 934ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; // Reached the start of the string. 935ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 936ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Match strings before this code point. 937ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // There cannot be any decrements below it because UnicodeSet strings 938ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // contain multiple code points. 939ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pos-=spanLength; 940ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru offsets.shift(spanLength); 941ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength=0; 942ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru continue; // Match strings from before a single code point. 943ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 944ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Match strings from before the next string match. 945ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 946ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 947ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pos-=offsets.popMinimum(); 948ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength=0; // Match strings from before a string match. 949ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 950ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 951ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 952ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { 953ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanCondition==USET_SPAN_NOT_CONTAINED) { 954ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return spanNotUTF8(s, length); 955ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 956ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED); 957ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanLength==length) { 958ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return length; 959ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 960ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 961ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Consider strings; they may overlap with the span. 962ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru OffsetList offsets; 963ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanCondition==USET_SPAN_CONTAINED) { 964ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Use offset list to try all possibilities. 965ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru offsets.setMaxLength(maxLength8); 966ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 967ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t pos=spanLength, rest=length-pos; 968ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t i, stringsLength=strings.size(); 969ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint8_t *spanUTF8Lengths=spanLengths; 970ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(all) { 971ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanUTF8Lengths+=2*stringsLength; 972ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 973ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(;;) { 974ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const uint8_t *s8=utf8; 975ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t length8; 976ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanCondition==USET_SPAN_CONTAINED) { 977ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(i=0; i<stringsLength; ++i) { 978ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru length8=utf8Lengths[i]; 979ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(length8==0) { 980ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru continue; // String not representable in UTF-8. 981ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 982ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t overlap=spanUTF8Lengths[i]; 983ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(overlap==ALL_CP_CONTAINED) { 984ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru s8+=length8; 985ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru continue; // Irrelevant string. 986ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 987ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 988ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try to match this string at pos-overlap..pos. 989ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(overlap>=LONG_SPAN) { 990ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru overlap=length8; 991ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // While contained: No point matching fully inside the code point span. 992ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru U8_BACK_1(s8, 0, overlap); // Length of the string minus the last code point. 993ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 994ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(overlap>spanLength) { 995ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru overlap=spanLength; 996ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 997ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t inc=length8-overlap; // Keep overlap+inc==length8. 998ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(;;) { 999ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(inc>rest) { 1000ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 1001ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1002ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try to match if the increment is not listed already. 1003ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Match at code point boundaries. (The UTF-8 strings were converted 1004ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // from UTF-16 and are guaranteed to be well-formed.) 1005ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if( !U8_IS_TRAIL(s[pos-overlap]) && 1006ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru !offsets.containsOffset(inc) && 1007ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru matches8(s+pos-overlap, s8, length8) 1008ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1009ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ) { 1010ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(inc==rest) { 1011ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return length; // Reached the end of the string. 1012ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1013ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru offsets.addOffset(inc); 1014ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1015ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(overlap==0) { 1016ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 1017ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1018ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru --overlap; 1019ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ++inc; 1020ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1021ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru s8+=length8; 1022ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1023ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else /* USET_SPAN_SIMPLE */ { 1024ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t maxInc=0, maxOverlap=0; 1025ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(i=0; i<stringsLength; ++i) { 1026ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru length8=utf8Lengths[i]; 1027ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(length8==0) { 1028ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru continue; // String not representable in UTF-8. 1029ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1030ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t overlap=spanUTF8Lengths[i]; 1031ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // For longest match, we do need to try to match even an all-contained string 1032ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // to find the match from the earliest start. 1033ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1034ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try to match this string at pos-overlap..pos. 1035ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(overlap>=LONG_SPAN) { 1036ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru overlap=length8; 1037ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Longest match: Need to match fully inside the code point span 1038ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // to find the match from the earliest start. 1039ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1040ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(overlap>spanLength) { 1041ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru overlap=spanLength; 1042ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1043ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t inc=length8-overlap; // Keep overlap+inc==length8. 1044ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(;;) { 1045ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(inc>rest || overlap<maxOverlap) { 1046ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 1047ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1048ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try to match if the string is longer or starts earlier. 1049ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Match at code point boundaries. (The UTF-8 strings were converted 1050ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // from UTF-16 and are guaranteed to be well-formed.) 1051ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if( !U8_IS_TRAIL(s[pos-overlap]) && 1052ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) && 1053ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru matches8(s+pos-overlap, s8, length8) 1054ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1055ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ) { 1056ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru maxInc=inc; // Longest match from earliest start. 1057ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru maxOverlap=overlap; 1058ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 1059ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1060ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru --overlap; 1061ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ++inc; 1062ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1063ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru s8+=length8; 1064ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1065ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1066ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(maxInc!=0 || maxOverlap!=0) { 1067ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Longest-match algorithm, and there was a string match. 1068ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Simply continue after it. 1069ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pos+=maxInc; 1070ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru rest-=maxInc; 1071ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(rest==0) { 1072ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return length; // Reached the end of the string. 1073ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1074ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength=0; // Match strings from after a string match. 1075ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru continue; 1076ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1077ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1078ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Finished trying to match all strings at pos. 1079ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1080ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanLength!=0 || pos==0) { 1081ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // The position is after an unlimited code point span (spanLength!=0), 1082ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // not after a string match. 1083ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // The only position where spanLength==0 after a span is pos==0. 1084ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Otherwise, an unlimited code point span is only tried again when no 1085ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // strings match, and if such a non-initial span fails we stop. 1086ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(offsets.isEmpty()) { 1087ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return pos; // No strings matched after a span. 1088ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1089ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Match strings from after the next string match. 1090ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 1091ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // The position is after a string match (or a single code point). 1092ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(offsets.isEmpty()) { 1093ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // No more strings matched after a previous string match. 1094ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try another code point span from after the last string match. 1095ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED); 1096ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if( spanLength==rest || // Reached the end of the string, or 1097ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength==0 // neither strings nor span progressed. 1098ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ) { 1099ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return pos+spanLength; 1100ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1101ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pos+=spanLength; 1102ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru rest-=spanLength; 1103ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru continue; // spanLength>0: Match strings from after a span. 1104ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 1105ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try to match only one code point from after a string match if some 1106ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // string matched beyond it, so that we try all possible positions 1107ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // and don't overshoot. 1108ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength=spanOneUTF8(spanSet, s+pos, rest); 1109ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanLength>0) { 1110ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanLength==rest) { 1111ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return length; // Reached the end of the string. 1112ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1113ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Match strings after this code point. 1114ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // There cannot be any increments below it because UnicodeSet strings 1115ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // contain multiple code points. 1116ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pos+=spanLength; 1117ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru rest-=spanLength; 1118ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru offsets.shift(spanLength); 1119ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength=0; 1120ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru continue; // Match strings from after a single code point. 1121ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1122ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Match strings from after the next string match. 1123ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1124ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1125ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t minOffset=offsets.popMinimum(); 1126ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pos+=minOffset; 1127ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru rest-=minOffset; 1128ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength=0; // Match strings from after a string match. 1129ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1130ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 1131ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1132ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { 1133ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanCondition==USET_SPAN_NOT_CONTAINED) { 1134ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return spanNotBackUTF8(s, length); 1135ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1136ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED); 1137ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(pos==0) { 1138ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; 1139ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1140ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t spanLength=length-pos; 1141ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1142ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Consider strings; they may overlap with the span. 1143ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru OffsetList offsets; 1144ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanCondition==USET_SPAN_CONTAINED) { 1145ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Use offset list to try all possibilities. 1146ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru offsets.setMaxLength(maxLength8); 1147ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1148ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t i, stringsLength=strings.size(); 1149ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint8_t *spanBackUTF8Lengths=spanLengths; 1150ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(all) { 1151ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanBackUTF8Lengths+=3*stringsLength; 1152ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1153ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(;;) { 1154ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const uint8_t *s8=utf8; 1155ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t length8; 1156ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanCondition==USET_SPAN_CONTAINED) { 1157ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(i=0; i<stringsLength; ++i) { 1158ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru length8=utf8Lengths[i]; 1159ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(length8==0) { 1160ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru continue; // String not representable in UTF-8. 1161ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1162ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t overlap=spanBackUTF8Lengths[i]; 1163ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(overlap==ALL_CP_CONTAINED) { 1164ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru s8+=length8; 1165ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru continue; // Irrelevant string. 1166ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1167ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1168ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try to match this string at pos-(length8-overlap)..pos-length8. 1169ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(overlap>=LONG_SPAN) { 1170ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru overlap=length8; 1171ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // While contained: No point matching fully inside the code point span. 1172ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t len1=0; 1173ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru U8_FWD_1(s8, len1, overlap); 1174ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru overlap-=len1; // Length of the string minus the first code point. 1175ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1176ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(overlap>spanLength) { 1177ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru overlap=spanLength; 1178ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1179ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t dec=length8-overlap; // Keep dec+overlap==length8. 1180ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(;;) { 1181ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(dec>pos) { 1182ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 1183ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1184ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try to match if the decrement is not listed already. 1185ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Match at code point boundaries. (The UTF-8 strings were converted 1186ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // from UTF-16 and are guaranteed to be well-formed.) 1187ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if( !U8_IS_TRAIL(s[pos-dec]) && 1188ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru !offsets.containsOffset(dec) && 1189ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru matches8(s+pos-dec, s8, length8) 1190ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ) { 1191ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(dec==pos) { 1192ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; // Reached the start of the string. 1193ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1194ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru offsets.addOffset(dec); 1195ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1196ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(overlap==0) { 1197ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 1198ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1199ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru --overlap; 1200ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ++dec; 1201ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1202ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru s8+=length8; 1203ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1204ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else /* USET_SPAN_SIMPLE */ { 1205ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t maxDec=0, maxOverlap=0; 1206ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(i=0; i<stringsLength; ++i) { 1207ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru length8=utf8Lengths[i]; 1208ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(length8==0) { 1209ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru continue; // String not representable in UTF-8. 1210ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1211ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t overlap=spanBackUTF8Lengths[i]; 1212ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // For longest match, we do need to try to match even an all-contained string 1213ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // to find the match from the latest end. 1214ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1215ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try to match this string at pos-(length8-overlap)..pos-length8. 1216ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(overlap>=LONG_SPAN) { 1217ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru overlap=length8; 1218ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Longest match: Need to match fully inside the code point span 1219ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // to find the match from the latest end. 1220ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1221ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(overlap>spanLength) { 1222ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru overlap=spanLength; 1223ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1224ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t dec=length8-overlap; // Keep dec+overlap==length8. 1225ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(;;) { 1226ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(dec>pos || overlap<maxOverlap) { 1227ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 1228ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1229ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try to match if the string is longer or ends later. 1230ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Match at code point boundaries. (The UTF-8 strings were converted 1231ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // from UTF-16 and are guaranteed to be well-formed.) 1232ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if( !U8_IS_TRAIL(s[pos-dec]) && 1233ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) && 1234ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru matches8(s+pos-dec, s8, length8) 1235ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ) { 1236ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru maxDec=dec; // Longest match from latest end. 1237ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru maxOverlap=overlap; 1238ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 1239ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1240ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru --overlap; 1241ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ++dec; 1242ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1243ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru s8+=length8; 1244ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1245ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1246ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(maxDec!=0 || maxOverlap!=0) { 1247ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Longest-match algorithm, and there was a string match. 1248ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Simply continue before it. 1249ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pos-=maxDec; 1250ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(pos==0) { 1251ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; // Reached the start of the string. 1252ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1253ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength=0; // Match strings from before a string match. 1254ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru continue; 1255ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1256ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1257ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Finished trying to match all strings at pos. 1258ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1259ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanLength!=0 || pos==length) { 1260ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // The position is before an unlimited code point span (spanLength!=0), 1261ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // not before a string match. 1262ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // The only position where spanLength==0 before a span is pos==length. 1263ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Otherwise, an unlimited code point span is only tried again when no 1264ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // strings match, and if such a non-initial span fails we stop. 1265ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(offsets.isEmpty()) { 1266ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return pos; // No strings matched before a span. 1267ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1268ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Match strings from before the next string match. 1269ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 1270ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // The position is before a string match (or a single code point). 1271ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(offsets.isEmpty()) { 1272ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // No more strings matched before a previous string match. 1273ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try another code point span from before the last string match. 1274ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t oldPos=pos; 1275ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED); 1276ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength=oldPos-pos; 1277ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if( pos==0 || // Reached the start of the string, or 1278ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength==0 // neither strings nor span progressed. 1279ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ) { 1280ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return pos; 1281ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1282ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru continue; // spanLength>0: Match strings from before a span. 1283ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 1284ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try to match only one code point from before a string match if some 1285ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // string matched beyond it, so that we try all possible positions 1286ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // and don't overshoot. 1287ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength=spanOneBackUTF8(spanSet, s, pos); 1288ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanLength>0) { 1289ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanLength==pos) { 1290ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; // Reached the start of the string. 1291ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1292ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Match strings before this code point. 1293ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // There cannot be any decrements below it because UnicodeSet strings 1294ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // contain multiple code points. 1295ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pos-=spanLength; 1296ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru offsets.shift(spanLength); 1297ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength=0; 1298ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru continue; // Match strings from before a single code point. 1299ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1300ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Match strings from before the next string match. 1301ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1302ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1303ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pos-=offsets.popMinimum(); 1304ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength=0; // Match strings from before a string match. 1305ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1306ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 1307ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1308ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 1309ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED) 1310ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 1311ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Theoretical algorithm: 1312ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * - Iterate through the string, and at each code point boundary: 1313ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * + If the code point there is in the set, then return with the current position. 1314ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * + If a set string matches at the current position, then return with the current position. 1315ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 1316ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Optimized implementation: 1317ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 1318ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * (Same assumption as for span() above.) 1319ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 1320ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Create and cache a spanNotSet which contains all of the single code points 1321ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * of the original set but none of its strings. 1322ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * For each set string add its initial code point to the spanNotSet. 1323ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * (Also add its final code point for spanNotBack().) 1324ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 1325ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * - Loop: 1326ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED). 1327ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * + If the current code point is in the original set, then 1328ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * return the current position. 1329ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * + If any set string matches at the current position, then 1330ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * return the current position. 1331ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * + If there is no match at the current position, neither for the code point there 1332ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * nor for any set string, then skip this code point and continue the loop. 1333ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * This happens for set-string-initial code points that were added to spanNotSet 1334ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * when there is not actually a match for such a set string. 1335ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 1336ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1337ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const { 1338ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t pos=0, rest=length; 1339ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t i, stringsLength=strings.size(); 1340ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru do { 1341ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Span until we find a code point from the set, 1342ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // or a code point that starts or ends some string. 1343ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED); 1344ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(i==rest) { 1345ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return length; // Reached the end of the string. 1346ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1347ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pos+=i; 1348ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru rest-=i; 1349ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1350ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Check whether the current code point is in the original set, 1351ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // without the string starts and ends. 1352ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t cpLength=spanOne(spanSet, s+pos, rest); 1353ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(cpLength>0) { 1354ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return pos; // There is a set element at pos. 1355ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1356ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1357ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try to match the strings at pos. 1358ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(i=0; i<stringsLength; ++i) { 1359ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanLengths[i]==ALL_CP_CONTAINED) { 1360ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru continue; // Irrelevant string. 1361ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1362ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 1363ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const UChar *s16=string.getBuffer(); 1364ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t length16=string.length(); 1365ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) { 1366ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return pos; // There is a set element at pos. 1367ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1368ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1369ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1370ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // The span(while not contained) ended on a string start/end which is 1371ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // not in the original set. Skip this code point and continue. 1372ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // cpLength<0 1373ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pos-=cpLength; 1374ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru rest+=cpLength; 1375ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } while(rest!=0); 1376ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return length; // Reached the end of the string. 1377ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 1378ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1379ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t UnicodeSetStringSpan::spanNotBack(const UChar *s, int32_t length) const { 1380ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t pos=length; 1381ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t i, stringsLength=strings.size(); 1382ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru do { 1383ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Span until we find a code point from the set, 1384ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // or a code point that starts or ends some string. 1385ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED); 1386ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(pos==0) { 1387ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; // Reached the start of the string. 1388ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1389ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1390ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Check whether the current code point is in the original set, 1391ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // without the string starts and ends. 1392ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t cpLength=spanOneBack(spanSet, s, pos); 1393ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(cpLength>0) { 1394ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return pos; // There is a set element at pos. 1395ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1396ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1397ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try to match the strings at pos. 1398ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(i=0; i<stringsLength; ++i) { 1399ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Use spanLengths rather than a spanBackLengths pointer because 1400ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // it is easier and we only need to know whether the string is irrelevant 1401ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // which is the same in either array. 1402ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanLengths[i]==ALL_CP_CONTAINED) { 1403ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru continue; // Irrelevant string. 1404ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1405ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 1406ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const UChar *s16=string.getBuffer(); 1407ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t length16=string.length(); 1408ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) { 1409ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return pos; // There is a set element at pos. 1410ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1411ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1412ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1413ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // The span(while not contained) ended on a string start/end which is 1414ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // not in the original set. Skip this code point and continue. 1415ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // cpLength<0 1416ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pos+=cpLength; 1417ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } while(pos!=0); 1418ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; // Reached the start of the string. 1419ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 1420ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1421ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const { 1422ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t pos=0, rest=length; 1423ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t i, stringsLength=strings.size(); 1424ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint8_t *spanUTF8Lengths=spanLengths; 1425ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(all) { 1426ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanUTF8Lengths+=2*stringsLength; 1427ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1428ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru do { 1429ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Span until we find a code point from the set, 1430ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // or a code point that starts or ends some string. 1431ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED); 1432ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(i==rest) { 1433ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return length; // Reached the end of the string. 1434ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1435ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pos+=i; 1436ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru rest-=i; 1437ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1438ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Check whether the current code point is in the original set, 1439ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // without the string starts and ends. 1440ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest); 1441ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(cpLength>0) { 1442ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return pos; // There is a set element at pos. 1443ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1444ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1445ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try to match the strings at pos. 1446ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const uint8_t *s8=utf8; 1447ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t length8; 1448ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(i=0; i<stringsLength; ++i) { 1449ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru length8=utf8Lengths[i]; 1450ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // ALL_CP_CONTAINED: Irrelevant string. 1451ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) { 1452ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return pos; // There is a set element at pos. 1453ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1454ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru s8+=length8; 1455ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1456ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1457ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // The span(while not contained) ended on a string start/end which is 1458ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // not in the original set. Skip this code point and continue. 1459ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // cpLength<0 1460ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pos-=cpLength; 1461ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru rest+=cpLength; 1462ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } while(rest!=0); 1463ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return length; // Reached the end of the string. 1464ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 1465ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1466ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const { 1467ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t pos=length; 1468ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t i, stringsLength=strings.size(); 1469ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint8_t *spanBackUTF8Lengths=spanLengths; 1470ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(all) { 1471ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanBackUTF8Lengths+=3*stringsLength; 1472ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1473ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru do { 1474ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Span until we find a code point from the set, 1475ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // or a code point that starts or ends some string. 1476ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED); 1477ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(pos==0) { 1478ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; // Reached the start of the string. 1479ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1480ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1481ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Check whether the current code point is in the original set, 1482ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // without the string starts and ends. 1483ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t cpLength=spanOneBackUTF8(spanSet, s, pos); 1484ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(cpLength>0) { 1485ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return pos; // There is a set element at pos. 1486ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1487ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1488ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try to match the strings at pos. 1489ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const uint8_t *s8=utf8; 1490ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t length8; 1491ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(i=0; i<stringsLength; ++i) { 1492ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru length8=utf8Lengths[i]; 1493ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // ALL_CP_CONTAINED: Irrelevant string. 1494ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) { 1495ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return pos; // There is a set element at pos. 1496ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1497ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru s8+=length8; 1498ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1499ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1500ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // The span(while not contained) ended on a string start/end which is 1501ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // not in the original set. Skip this code point and continue. 1502ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // cpLength<0 1503ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pos+=cpLength; 1504ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } while(pos!=0); 1505ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; // Reached the start of the string. 1506ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 1507ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1508ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_END 1509