1ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 2ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru****************************************************************************** 3ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* 48393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius* Copyright (C) 2007-2012, International Business Machines 5ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* Corporation and others. All Rights Reserved. 6ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* 7ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru****************************************************************************** 8ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* file name: unisetspan.cpp 9ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* encoding: US-ASCII 10ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* tab size: 8 (not used) 11ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* indentation:4 12ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* 13ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* created on: 2007mar01 14ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* created by: Markus W. Scherer 15ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*/ 16ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 17ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/utypes.h" 18ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/uniset.h" 19ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/ustring.h" 20103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius#include "unicode/utf8.h" 21103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius#include "unicode/utf16.h" 22ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "cmemory.h" 23ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "uvector.h" 24ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unisetspan.h" 25ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 26ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_BEGIN 27ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 28ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 29ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * List of offsets from the current position from where to try matching 30ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * a code point or a string. 31ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Store offsets rather than indexes to simplify the code and use the same list 32ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * for both increments (in span()) and decrements (in spanBack()). 33ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 34ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Assumption: The maximum offset is limited, and the offsets that are stored 35ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * at any one time are relatively dense, that is, there are normally no gaps of 36ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * hundreds or thousands of offset values. 37ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 38ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * The implementation uses a circular buffer of byte flags, 39ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * each indicating whether the corresponding offset is in the list. 40ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * This avoids inserting into a sorted list of offsets (or absolute indexes) and 41ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * physically moving part of the list. 42ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 43ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Note: In principle, the caller should setMaxLength() to the maximum of the 44ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * max string length and U16_LENGTH/U8_LENGTH to account for 45ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * "long" single code points. 46ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * However, this implementation uses at least a staticList with more than 47ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * U8_LENGTH entries anyway. 48ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 49ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Note: If maxLength were guaranteed to be no more than 32 or 64, 50ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * the list could be stored as bit flags in a single integer. 51ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Rather than handling a circular buffer with a start list index, 52ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * the integer would simply be shifted when lower offsets are removed. 53ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * UnicodeSet does not have a limit on the lengths of strings. 54ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 55ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass OffsetList { // Only ever stack-allocated, does not need to inherit UMemory. 56ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querupublic: 57ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru OffsetList() : list(staticList), capacity(0), length(0), start(0) {} 58ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 59ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ~OffsetList() { 60ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(list!=staticList) { 61ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uprv_free(list); 62ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 63ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 64ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 65ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Call exactly once if the list is to be used. 66ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru void setMaxLength(int32_t maxLength) { 67ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(maxLength<=(int32_t)sizeof(staticList)) { 68ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru capacity=(int32_t)sizeof(staticList); 69ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 70ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool *l=(UBool *)uprv_malloc(maxLength); 71ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(l!=NULL) { 72ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru list=l; 73ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru capacity=maxLength; 74ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 75ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 76ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uprv_memset(list, 0, capacity); 77ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 78ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 79ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru void clear() { 80ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uprv_memset(list, 0, capacity); 81ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru start=length=0; 82ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 83ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 84ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool isEmpty() const { 85ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return (UBool)(length==0); 86ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 87ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 88ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Reduce all stored offsets by delta, used when the current position 89ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // moves by delta. 90ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // There must not be any offsets lower than delta. 91ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // If there is an offset equal to delta, it is removed. 92ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // delta=[1..maxLength] 93ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru void shift(int32_t delta) { 94ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t i=start+delta; 95ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(i>=capacity) { 96ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru i-=capacity; 97ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 98ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(list[i]) { 99ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru list[i]=FALSE; 100ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru --length; 101ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 102ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru start=i; 103ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 104ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 105ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Add an offset. The list must not contain it yet. 106ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // offset=[1..maxLength] 107ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru void addOffset(int32_t offset) { 108ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t i=start+offset; 109ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(i>=capacity) { 110ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru i-=capacity; 111ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 112ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru list[i]=TRUE; 113ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ++length; 114ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 115ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 116ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // offset=[1..maxLength] 117ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool containsOffset(int32_t offset) const { 118ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t i=start+offset; 119ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(i>=capacity) { 120ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru i-=capacity; 121ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 122ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return list[i]; 123ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 124ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 125ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Find the lowest stored offset from a non-empty list, remove it, 126ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // and reduce all other offsets by this minimum. 127ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Returns [1..maxLength]. 128ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t popMinimum() { 129ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Look for the next offset in list[start+1..capacity-1]. 130ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t i=start, result; 131ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru while(++i<capacity) { 132ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(list[i]) { 133ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru list[i]=FALSE; 134ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru --length; 135ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru result=i-start; 136ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru start=i; 137ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return result; 138ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 139ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 140ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // i==capacity 141ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 142ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Wrap around and look for the next offset in list[0..start]. 143ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Since the list is not empty, there will be one. 144ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru result=capacity-start; 145ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru i=0; 146ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru while(!list[i]) { 147ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ++i; 148ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 149ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru list[i]=FALSE; 150ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru --length; 151ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru start=i; 152ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return result+=i; 153ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 154ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 155ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruprivate: 156ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool *list; 157ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t capacity; 158ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t length; 159ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t start; 160ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 161ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool staticList[16]; 162ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}; 163ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 164ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Get the number of UTF-8 bytes for a UTF-16 (sub)string. 165ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic int32_t 166ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QuerugetUTF8Length(const UChar *s, int32_t length) { 167ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UErrorCode errorCode=U_ZERO_ERROR; 168ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t length8=0; 169ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru u_strToUTF8(NULL, 0, &length8, s, length, &errorCode); 170ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) { 171ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return length8; 172ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 173ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // The string contains an unpaired surrogate. 174ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Ignore this string. 175ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; 176ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 177ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 178ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 179ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Append the UTF-8 version of the string to t and return the appended UTF-8 length. 180ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic int32_t 181ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruappendUTF8(const UChar *s, int32_t length, uint8_t *t, int32_t capacity) { 182ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UErrorCode errorCode=U_ZERO_ERROR; 183ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t length8=0; 184ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru u_strToUTF8((char *)t, capacity, &length8, s, length, &errorCode); 185ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(U_SUCCESS(errorCode)) { 186ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return length8; 187ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 188ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // The string contains an unpaired surrogate. 189ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Ignore this string. 190ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; 191ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 192ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 193ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 194ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic inline uint8_t 195ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QuerumakeSpanLengthByte(int32_t spanLength) { 196ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // 0xfe==UnicodeSetStringSpan::LONG_SPAN 197ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe; 198ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 199ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 200ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Construct for all variants of span(), or only for any one variant. 201ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Initialize as little as possible, for single use. 202ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set, 203ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const UVector &setStrings, 204ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint32_t which) 205ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru : spanSet(0, 0x10ffff), pSpanNotSet(NULL), strings(setStrings), 206ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utf8Lengths(NULL), spanLengths(NULL), utf8(NULL), 207ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utf8Length(0), 208ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru maxLength16(0), maxLength8(0), 209ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru all((UBool)(which==ALL)) { 210ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanSet.retainAll(set); 211ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(which&NOT_CONTAINED) { 212ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Default to the same sets. 213ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // addToSpanNotSet() will create a separate set if necessary. 214ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pSpanNotSet=&spanSet; 215ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 216ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 217ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Determine if the strings even need to be taken into account at all for span() etc. 218ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // If any string is relevant, then all strings need to be used for 219ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // span(longest match) but only the relevant ones for span(while contained). 220ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH 221ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // and do not store UTF-8 strings if !thisRelevant and CONTAINED. 222ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.) 223ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Also count the lengths of the UTF-8 versions of the strings for memory allocation. 224ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t stringsLength=strings.size(); 225ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 226ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t i, spanLength; 227ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool someRelevant=FALSE; 228ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(i=0; i<stringsLength; ++i) { 229ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 230ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const UChar *s16=string.getBuffer(); 231ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t length16=string.length(); 232ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool thisRelevant; 233ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED); 234ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanLength<length16) { // Relevant string. 235ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru someRelevant=thisRelevant=TRUE; 236ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 237ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru thisRelevant=FALSE; 238ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 239ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if((which&UTF16) && length16>maxLength16) { 240ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru maxLength16=length16; 241ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 242ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if((which&UTF8) && (thisRelevant || (which&CONTAINED))) { 243ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t length8=getUTF8Length(s16, length16); 244ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utf8Length+=length8; 245ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(length8>maxLength8) { 246ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru maxLength8=length8; 247ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 248ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 249ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 250ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(!someRelevant) { 251ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru maxLength16=maxLength8=0; 252ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return; 253ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 254ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 255ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Freeze after checking for the need to use strings at all because freezing 256ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // a set takes some time and memory which are wasted if there are no relevant strings. 257ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(all) { 258ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanSet.freeze(); 259ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 260ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 261ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint8_t *spanBackLengths; 262ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint8_t *spanUTF8Lengths; 263ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint8_t *spanBackUTF8Lengths; 264ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 265ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Allocate a block of meta data. 266ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t allocSize; 267ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(all) { 268ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings. 269ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru allocSize=stringsLength*(4+1+1+1+1)+utf8Length; 270ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 271ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru allocSize=stringsLength; // One set of span lengths. 272ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(which&UTF8) { 273ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // UTF-8 lengths and UTF-8 strings. 274ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru allocSize+=stringsLength*4+utf8Length; 275ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 276ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 277ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(allocSize<=(int32_t)sizeof(staticLengths)) { 278ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utf8Lengths=staticLengths; 279ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 280ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utf8Lengths=(int32_t *)uprv_malloc(allocSize); 281ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(utf8Lengths==NULL) { 282ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE. 283ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return; // Out of memory. 284ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 285ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 286ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 287ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(all) { 288ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Store span lengths for all span() variants. 289ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLengths=(uint8_t *)(utf8Lengths+stringsLength); 290ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanBackLengths=spanLengths+stringsLength; 291ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanUTF8Lengths=spanBackLengths+stringsLength; 292ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanBackUTF8Lengths=spanUTF8Lengths+stringsLength; 293ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utf8=spanBackUTF8Lengths+stringsLength; 294ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 295ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Store span lengths for only one span() variant. 296ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(which&UTF8) { 297ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLengths=(uint8_t *)(utf8Lengths+stringsLength); 298ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utf8=spanLengths+stringsLength; 299ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 300ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLengths=(uint8_t *)utf8Lengths; 301ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 302ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths; 303ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 304ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 305ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Set the meta data and pSpanNotSet and write the UTF-8 strings. 306ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t utf8Count=0; // Count UTF-8 bytes written so far. 307ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 308ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(i=0; i<stringsLength; ++i) { 309ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 310ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const UChar *s16=string.getBuffer(); 311ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t length16=string.length(); 312ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED); 313ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanLength<length16) { // Relevant string. 314ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(which&UTF16) { 315ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(which&CONTAINED) { 316ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(which&FWD) { 317ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLengths[i]=makeSpanLengthByte(spanLength); 318ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 319ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(which&BACK) { 320ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED); 321ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanBackLengths[i]=makeSpanLengthByte(spanLength); 322ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 323ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else /* not CONTAINED, not all, but NOT_CONTAINED */ { 324ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLengths[i]=spanBackLengths[i]=0; // Only store a relevant/irrelevant flag. 325ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 326ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 327ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(which&UTF8) { 328ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint8_t *s8=utf8+utf8Count; 329ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count); 330ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utf8Count+=utf8Lengths[i]=length8; 331ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(length8==0) { // Irrelevant for UTF-8 because not representable in UTF-8. 332ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CONTAINED; 333ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { // Relevant for UTF-8. 334ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(which&CONTAINED) { 335ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(which&FWD) { 336ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength=spanSet.spanUTF8((const char *)s8, length8, USET_SPAN_CONTAINED); 337ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanUTF8Lengths[i]=makeSpanLengthByte(spanLength); 338ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 339ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(which&BACK) { 340ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED); 341ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength); 342ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 343ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else /* not CONTAINED, not all, but NOT_CONTAINED */ { 344ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0; // Only store a relevant/irrelevant flag. 345ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 346ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 347ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 348ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(which&NOT_CONTAINED) { 349ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Add string start and end code points to the spanNotSet so that 350ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // a span(while not contained) stops before any string. 351ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UChar32 c; 352ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(which&FWD) { 353ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t len=0; 354ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru U16_NEXT(s16, len, length16, c); 355ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru addToSpanNotSet(c); 356ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 357ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(which&BACK) { 358ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t len=length16; 359ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru U16_PREV(s16, 0, len, c); 360ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru addToSpanNotSet(c); 361ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 362ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 363ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { // Irrelevant string. 364ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(which&UTF8) { 365ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(which&CONTAINED) { // Only necessary for LONGEST_MATCH. 366ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint8_t *s8=utf8+utf8Count; 367ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count); 368ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utf8Count+=utf8Lengths[i]=length8; 369ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 370ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utf8Lengths[i]=0; 371ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 372ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 373ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(all) { 374ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLengths[i]=spanBackLengths[i]= 375ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanUTF8Lengths[i]=spanBackUTF8Lengths[i]= 376ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru (uint8_t)ALL_CP_CONTAINED; 377ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 378ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // All spanXYZLengths pointers contain the same address. 379ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLengths[i]=(uint8_t)ALL_CP_CONTAINED; 380ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 381ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 382ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 383ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 384ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Finish. 385ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(all) { 386ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pSpanNotSet->freeze(); 387ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 388ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 389ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 390ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Copy constructor. Assumes which==ALL for a frozen set. 391ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan, 392ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const UVector &newParentSetStrings) 393ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru : spanSet(otherStringSpan.spanSet), pSpanNotSet(NULL), strings(newParentSetStrings), 394ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utf8Lengths(NULL), spanLengths(NULL), utf8(NULL), 395ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utf8Length(otherStringSpan.utf8Length), 396ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8), 397ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru all(TRUE) { 398ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) { 399ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pSpanNotSet=&spanSet; 400ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 401ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pSpanNotSet=(UnicodeSet *)otherStringSpan.pSpanNotSet->clone(); 402ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 403ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 404ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Allocate a block of meta data. 405ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings. 406ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t stringsLength=strings.size(); 407ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length; 408ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(allocSize<=(int32_t)sizeof(staticLengths)) { 409ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utf8Lengths=staticLengths; 410ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 411ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utf8Lengths=(int32_t *)uprv_malloc(allocSize); 412ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(utf8Lengths==NULL) { 413ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE. 414ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return; // Out of memory. 415ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 416ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 417ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 418ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLengths=(uint8_t *)(utf8Lengths+stringsLength); 419ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utf8=spanLengths+stringsLength*4; 420ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize); 421ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 422ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 423ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUnicodeSetStringSpan::~UnicodeSetStringSpan() { 424ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(pSpanNotSet!=NULL && pSpanNotSet!=&spanSet) { 425ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru delete pSpanNotSet; 426ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 427ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(utf8Lengths!=NULL && utf8Lengths!=staticLengths) { 428ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uprv_free(utf8Lengths); 429ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 430ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 431ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 432ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruvoid UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) { 433ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(pSpanNotSet==NULL || pSpanNotSet==&spanSet) { 434ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanSet.contains(c)) { 435ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return; // Nothing to do. 436ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 437ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UnicodeSet *newSet=(UnicodeSet *)spanSet.cloneAsThawed(); 438ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(newSet==NULL) { 439ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return; // Out of memory. 440ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 441ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pSpanNotSet=newSet; 442ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 443ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 444ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pSpanNotSet->add(c); 445ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 446ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 447ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Compare strings without any argument checks. Requires length>0. 448ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic inline UBool 449ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querumatches16(const UChar *s, const UChar *t, int32_t length) { 450ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru do { 451ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(*s++!=*t++) { 452ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return FALSE; 453ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 454ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } while(--length>0); 455ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return TRUE; 456ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 457ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 458ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic inline UBool 459ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querumatches8(const uint8_t *s, const uint8_t *t, int32_t length) { 460ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru do { 461ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(*s++!=*t++) { 462ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return FALSE; 463ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 464ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } while(--length>0); 465ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return TRUE; 466ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 467ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 468ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Compare 16-bit Unicode strings (which may be malformed UTF-16) 469ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// at code point boundaries. 470ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// That is, each edge of a match must not be in the middle of a surrogate pair. 471ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic inline UBool 472ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querumatches16CPB(const UChar *s, int32_t start, int32_t limit, const UChar *t, int32_t length) { 473ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru s+=start; 474ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru limit-=start; 475ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return matches16(s, t, length) && 476ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) && 477ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length])); 478ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 479ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 480ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Does the set contain the next code point? 481ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// If so, return its length; otherwise return its negative length. 482ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic inline int32_t 483ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruspanOne(const UnicodeSet &set, const UChar *s, int32_t length) { 484ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UChar c=*s, c2; 485ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) { 486ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2; 487ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 488ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return set.contains(c) ? 1 : -1; 489ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 490ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 491ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic inline int32_t 492ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruspanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) { 493ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UChar c=s[length-1], c2; 494ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) { 495ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2; 496ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 497ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return set.contains(c) ? 1 : -1; 498ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 499ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 500ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic inline int32_t 501ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruspanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) { 502ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UChar32 c=*s; 503ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if((int8_t)c>=0) { 504ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return set.contains(c) ? 1 : -1; 505ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 5068393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius // Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD(). 507ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t i=0; 5088393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius U8_NEXT_OR_FFFD(s, i, length, c); 509ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return set.contains(c) ? i : -i; 510ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 511ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 512ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic inline int32_t 513ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruspanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) { 514ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UChar32 c=s[length-1]; 515ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if((int8_t)c>=0) { 516ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return set.contains(c) ? 1 : -1; 517ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 518ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t i=length-1; 5198393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius c=utf8_prevCharSafeBody(s, 0, &i, c, -3); 520ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru length-=i; 521ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return set.contains(c) ? length : -length; 522ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 523ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 524ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 525ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Note: In span() when spanLength==0 (after a string match, or at the beginning 526ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * after an empty code point span) and in spanNot() and spanNotUTF8(), 527ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * string matching could use a binary search 528ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * because all string matches are done from the same start index. 529ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 530ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * For UTF-8, this would require a comparison function that returns UTF-16 order. 531ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 532ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * This optimization should not be necessary for normal UnicodeSets because 533ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * most sets have no strings, and most sets with strings have 534ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * very few very short strings. 535ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * For cases with many strings, it might be better to use a different API 536ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * and implementation with a DFA (state machine). 537ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 538ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 539ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 540ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Algorithm for span(USET_SPAN_CONTAINED) 541ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 542ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Theoretical algorithm: 543ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * - Iterate through the string, and at each code point boundary: 544ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * + If the code point there is in the set, then remember to continue after it. 545ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * + If a set string matches at the current position, then remember to continue after it. 546ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * + Either recursively span for each code point or string match, 547ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * or recursively span for all but the shortest one and 548ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * iteratively continue the span with the shortest local match. 549ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * + Remember the longest recursive span (the farthest end point). 550ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * + If there is no match at the current position, neither for the code point there 551ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * nor for any set string, then stop and return the longest recursive span length. 552ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 553ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Optimized implementation: 554ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 555ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * (We assume that most sets will have very few very short strings. 556ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * A span using a string-less set is extremely fast.) 557ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 558ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Create and cache a spanSet which contains all of the single code points 559ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * of the original set but none of its strings. 560ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 561ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED). 562ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * - Loop: 563ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * + Try to match each set string at the end of the spanLength. 564ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * ~ Set strings that start with set-contained code points must be matched 565ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * with a partial overlap because the recursive algorithm would have tried 566ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * to match them at every position. 567ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * ~ Set strings that entirely consist of set-contained code points 568ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * are irrelevant for span(USET_SPAN_CONTAINED) because the 569ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * recursive algorithm would continue after them anyway 570ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * and find the longest recursive match from their end. 571ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * ~ Rather than recursing, note each end point of a set string match. 572ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * + If no set string matched after spanSet.span(), then return 573ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * with where the spanSet.span() ended. 574ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * + If at least one set string matched after spanSet.span(), then 575ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * pop the shortest string match end point and continue 576ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * the loop, trying to match all set strings from there. 577ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * + If at least one more set string matched after a previous string match, 578ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * then test if the code point after the previous string match is also 579ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * contained in the set. 580ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Continue the loop with the shortest end point of either this code point 581ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * or a matching set string. 582ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * + If no more set string matched after a previous string match, 583ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED). 584ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Stop if spanLength==0, otherwise continue the loop. 585ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 586ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * By noting each end point of a set string match, 587ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * the function visits each string position at most once and finishes 588ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * in linear time. 589ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 590ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * The recursive algorithm may visit the same string position many times 591ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * if multiple paths lead to it and finishes in exponential time. 592ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 593ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 594ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 595ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Algorithm for span(USET_SPAN_SIMPLE) 596ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 597ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Theoretical algorithm: 598ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * - Iterate through the string, and at each code point boundary: 599ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * + If the code point there is in the set, then remember to continue after it. 600ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * + If a set string matches at the current position, then remember to continue after it. 601ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * + Continue from the farthest match position and ignore all others. 602ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * + If there is no match at the current position, 603ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * then stop and return the current position. 604ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 605ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Optimized implementation: 606ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 607ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * (Same assumption and spanSet as above.) 608ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 609ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED). 610ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * - Loop: 611ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * + Try to match each set string at the end of the spanLength. 612ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * ~ Set strings that start with set-contained code points must be matched 613ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * with a partial overlap because the standard algorithm would have tried 614ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * to match them earlier. 615ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * ~ Set strings that entirely consist of set-contained code points 616ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * must be matched with a full overlap because the longest-match algorithm 617ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * would hide set string matches that end earlier. 618ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Such set strings need not be matched earlier inside the code point span 619ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * because the standard algorithm would then have continued after 620ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * the set string match anyway. 621ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * ~ Remember the longest set string match (farthest end point) from the earliest 622ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * starting point. 623ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * + If no set string matched after spanSet.span(), then return 624ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * with where the spanSet.span() ended. 625ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * + If at least one set string matched, then continue the loop after the 626ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * longest match from the earliest position. 627ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * + If no more set string matched after a previous string match, 628ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED). 629ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Stop if spanLength==0, otherwise continue the loop. 630ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 631ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 632ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const { 633ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanCondition==USET_SPAN_NOT_CONTAINED) { 634ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return spanNot(s, length); 635ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 636ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED); 637ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanLength==length) { 638ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return length; 639ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 640ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 641ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Consider strings; they may overlap with the span. 642ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru OffsetList offsets; 643ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanCondition==USET_SPAN_CONTAINED) { 644ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Use offset list to try all possibilities. 645ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru offsets.setMaxLength(maxLength16); 646ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 647ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t pos=spanLength, rest=length-pos; 648ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t i, stringsLength=strings.size(); 649ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(;;) { 650ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanCondition==USET_SPAN_CONTAINED) { 651ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(i=0; i<stringsLength; ++i) { 652ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t overlap=spanLengths[i]; 653ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(overlap==ALL_CP_CONTAINED) { 654ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru continue; // Irrelevant string. 655ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 656ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 657ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const UChar *s16=string.getBuffer(); 658ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t length16=string.length(); 659ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 660ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try to match this string at pos-overlap..pos. 661ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(overlap>=LONG_SPAN) { 662ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru overlap=length16; 663ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // While contained: No point matching fully inside the code point span. 664ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru U16_BACK_1(s16, 0, overlap); // Length of the string minus the last code point. 665ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 666ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(overlap>spanLength) { 667ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru overlap=spanLength; 668ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 669ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t inc=length16-overlap; // Keep overlap+inc==length16. 670ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(;;) { 671ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(inc>rest) { 672ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 673ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 674ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try to match if the increment is not listed already. 675ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) { 676ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(inc==rest) { 677ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return length; // Reached the end of the string. 678ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 679ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru offsets.addOffset(inc); 680ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 681ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(overlap==0) { 682ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 683ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 684ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru --overlap; 685ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ++inc; 686ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 687ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 688ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else /* USET_SPAN_SIMPLE */ { 689ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t maxInc=0, maxOverlap=0; 690ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(i=0; i<stringsLength; ++i) { 691ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t overlap=spanLengths[i]; 692ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // For longest match, we do need to try to match even an all-contained string 693ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // to find the match from the earliest start. 694ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 695ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 696ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const UChar *s16=string.getBuffer(); 697ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t length16=string.length(); 698ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 699ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try to match this string at pos-overlap..pos. 700ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(overlap>=LONG_SPAN) { 701ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru overlap=length16; 702ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Longest match: Need to match fully inside the code point span 703ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // to find the match from the earliest start. 704ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 705ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(overlap>spanLength) { 706ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru overlap=spanLength; 707ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 708ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t inc=length16-overlap; // Keep overlap+inc==length16. 709ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(;;) { 710ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(inc>rest || overlap<maxOverlap) { 711ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 712ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 713ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try to match if the string is longer or starts earlier. 714ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) && 715ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru matches16CPB(s, pos-overlap, length, s16, length16) 716ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ) { 717ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru maxInc=inc; // Longest match from earliest start. 718ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru maxOverlap=overlap; 719ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 720ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 721ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru --overlap; 722ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ++inc; 723ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 724ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 725ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 726ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(maxInc!=0 || maxOverlap!=0) { 727ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Longest-match algorithm, and there was a string match. 728ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Simply continue after it. 729ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pos+=maxInc; 730ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru rest-=maxInc; 731ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(rest==0) { 732ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return length; // Reached the end of the string. 733ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 734ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength=0; // Match strings from after a string match. 735ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru continue; 736ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 737ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 738ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Finished trying to match all strings at pos. 739ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 740ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanLength!=0 || pos==0) { 741ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // The position is after an unlimited code point span (spanLength!=0), 742ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // not after a string match. 743ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // The only position where spanLength==0 after a span is pos==0. 744ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Otherwise, an unlimited code point span is only tried again when no 745ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // strings match, and if such a non-initial span fails we stop. 746ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(offsets.isEmpty()) { 747ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return pos; // No strings matched after a span. 748ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 749ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Match strings from after the next string match. 750ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 751ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // The position is after a string match (or a single code point). 752ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(offsets.isEmpty()) { 753ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // No more strings matched after a previous string match. 754ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try another code point span from after the last string match. 755ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED); 756ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if( spanLength==rest || // Reached the end of the string, or 757ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength==0 // neither strings nor span progressed. 758ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ) { 759ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return pos+spanLength; 760ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 761ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pos+=spanLength; 762ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru rest-=spanLength; 763ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru continue; // spanLength>0: Match strings from after a span. 764ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 765ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try to match only one code point from after a string match if some 766ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // string matched beyond it, so that we try all possible positions 767ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // and don't overshoot. 768ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength=spanOne(spanSet, s+pos, rest); 769ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanLength>0) { 770ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanLength==rest) { 771ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return length; // Reached the end of the string. 772ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 773ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Match strings after this code point. 774ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // There cannot be any increments below it because UnicodeSet strings 775ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // contain multiple code points. 776ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pos+=spanLength; 777ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru rest-=spanLength; 778ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru offsets.shift(spanLength); 779ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength=0; 780ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru continue; // Match strings from after a single code point. 781ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 782ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Match strings from after the next string match. 783ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 784ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 785ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t minOffset=offsets.popMinimum(); 786ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pos+=minOffset; 787ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru rest-=minOffset; 788ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength=0; // Match strings from after a string match. 789ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 790ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 791ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 792ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const { 793ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanCondition==USET_SPAN_NOT_CONTAINED) { 794ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return spanNotBack(s, length); 795ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 796ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED); 797ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(pos==0) { 798ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; 799ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 800ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t spanLength=length-pos; 801ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 802ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Consider strings; they may overlap with the span. 803ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru OffsetList offsets; 804ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanCondition==USET_SPAN_CONTAINED) { 805ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Use offset list to try all possibilities. 806ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru offsets.setMaxLength(maxLength16); 807ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 808ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t i, stringsLength=strings.size(); 809ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint8_t *spanBackLengths=spanLengths; 810ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(all) { 811ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanBackLengths+=stringsLength; 812ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 813ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(;;) { 814ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanCondition==USET_SPAN_CONTAINED) { 815ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(i=0; i<stringsLength; ++i) { 816ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t overlap=spanBackLengths[i]; 817ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(overlap==ALL_CP_CONTAINED) { 818ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru continue; // Irrelevant string. 819ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 820ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 821ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const UChar *s16=string.getBuffer(); 822ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t length16=string.length(); 823ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 824ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try to match this string at pos-(length16-overlap)..pos-length16. 825ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(overlap>=LONG_SPAN) { 826ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru overlap=length16; 827ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // While contained: No point matching fully inside the code point span. 828ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t len1=0; 829ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru U16_FWD_1(s16, len1, overlap); 830ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru overlap-=len1; // Length of the string minus the first code point. 831ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 832ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(overlap>spanLength) { 833ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru overlap=spanLength; 834ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 835ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t dec=length16-overlap; // Keep dec+overlap==length16. 836ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(;;) { 837ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(dec>pos) { 838ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 839ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 840ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try to match if the decrement is not listed already. 841ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) { 842ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(dec==pos) { 843ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; // Reached the start of the string. 844ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 845ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru offsets.addOffset(dec); 846ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 847ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(overlap==0) { 848ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 849ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 850ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru --overlap; 851ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ++dec; 852ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 853ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 854ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else /* USET_SPAN_SIMPLE */ { 855ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t maxDec=0, maxOverlap=0; 856ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(i=0; i<stringsLength; ++i) { 857ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t overlap=spanBackLengths[i]; 858ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // For longest match, we do need to try to match even an all-contained string 859ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // to find the match from the latest end. 860ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 861ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 862ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const UChar *s16=string.getBuffer(); 863ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t length16=string.length(); 864ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 865ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try to match this string at pos-(length16-overlap)..pos-length16. 866ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(overlap>=LONG_SPAN) { 867ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru overlap=length16; 868ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Longest match: Need to match fully inside the code point span 869ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // to find the match from the latest end. 870ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 871ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(overlap>spanLength) { 872ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru overlap=spanLength; 873ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 874ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t dec=length16-overlap; // Keep dec+overlap==length16. 875ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(;;) { 876ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(dec>pos || overlap<maxOverlap) { 877ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 878ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 879ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try to match if the string is longer or ends later. 880ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) && 881ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru matches16CPB(s, pos-dec, length, s16, length16) 882ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ) { 883ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru maxDec=dec; // Longest match from latest end. 884ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru maxOverlap=overlap; 885ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 886ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 887ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru --overlap; 888ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ++dec; 889ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 890ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 891ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 892ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(maxDec!=0 || maxOverlap!=0) { 893ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Longest-match algorithm, and there was a string match. 894ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Simply continue before it. 895ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pos-=maxDec; 896ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(pos==0) { 897ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; // Reached the start of the string. 898ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 899ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength=0; // Match strings from before a string match. 900ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru continue; 901ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 902ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 903ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Finished trying to match all strings at pos. 904ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 905ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanLength!=0 || pos==length) { 906ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // The position is before an unlimited code point span (spanLength!=0), 907ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // not before a string match. 908ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // The only position where spanLength==0 before a span is pos==length. 909ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Otherwise, an unlimited code point span is only tried again when no 910ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // strings match, and if such a non-initial span fails we stop. 911ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(offsets.isEmpty()) { 912ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return pos; // No strings matched before a span. 913ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 914ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Match strings from before the next string match. 915ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 916ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // The position is before a string match (or a single code point). 917ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(offsets.isEmpty()) { 918ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // No more strings matched before a previous string match. 919ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try another code point span from before the last string match. 920ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t oldPos=pos; 921ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED); 922ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength=oldPos-pos; 923ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if( pos==0 || // Reached the start of the string, or 924ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength==0 // neither strings nor span progressed. 925ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ) { 926ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return pos; 927ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 928ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru continue; // spanLength>0: Match strings from before a span. 929ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 930ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try to match only one code point from before a string match if some 931ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // string matched beyond it, so that we try all possible positions 932ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // and don't overshoot. 933ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength=spanOneBack(spanSet, s, pos); 934ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanLength>0) { 935ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanLength==pos) { 936ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; // Reached the start of the string. 937ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 938ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Match strings before this code point. 939ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // There cannot be any decrements below it because UnicodeSet strings 940ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // contain multiple code points. 941ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pos-=spanLength; 942ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru offsets.shift(spanLength); 943ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength=0; 944ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru continue; // Match strings from before a single code point. 945ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 946ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Match strings from before the next string match. 947ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 948ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 949ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pos-=offsets.popMinimum(); 950ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength=0; // Match strings from before a string match. 951ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 952ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 953ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 954ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { 955ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanCondition==USET_SPAN_NOT_CONTAINED) { 956ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return spanNotUTF8(s, length); 957ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 958ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED); 959ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanLength==length) { 960ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return length; 961ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 962ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 963ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Consider strings; they may overlap with the span. 964ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru OffsetList offsets; 965ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanCondition==USET_SPAN_CONTAINED) { 966ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Use offset list to try all possibilities. 967ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru offsets.setMaxLength(maxLength8); 968ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 969ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t pos=spanLength, rest=length-pos; 970ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t i, stringsLength=strings.size(); 971ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint8_t *spanUTF8Lengths=spanLengths; 972ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(all) { 973ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanUTF8Lengths+=2*stringsLength; 974ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 975ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(;;) { 976ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const uint8_t *s8=utf8; 977ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t length8; 978ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanCondition==USET_SPAN_CONTAINED) { 979ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(i=0; i<stringsLength; ++i) { 980ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru length8=utf8Lengths[i]; 981ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(length8==0) { 982ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru continue; // String not representable in UTF-8. 983ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 984ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t overlap=spanUTF8Lengths[i]; 985ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(overlap==ALL_CP_CONTAINED) { 986ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru s8+=length8; 987ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru continue; // Irrelevant string. 988ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 989ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 990ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try to match this string at pos-overlap..pos. 991ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(overlap>=LONG_SPAN) { 992ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru overlap=length8; 993ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // While contained: No point matching fully inside the code point span. 994ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru U8_BACK_1(s8, 0, overlap); // Length of the string minus the last code point. 995ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 996ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(overlap>spanLength) { 997ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru overlap=spanLength; 998ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 999ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t inc=length8-overlap; // Keep overlap+inc==length8. 1000ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(;;) { 1001ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(inc>rest) { 1002ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 1003ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1004ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try to match if the increment is not listed already. 1005ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Match at code point boundaries. (The UTF-8 strings were converted 1006ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // from UTF-16 and are guaranteed to be well-formed.) 1007ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if( !U8_IS_TRAIL(s[pos-overlap]) && 1008ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru !offsets.containsOffset(inc) && 1009ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru matches8(s+pos-overlap, s8, length8) 1010ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1011ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ) { 1012ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(inc==rest) { 1013ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return length; // Reached the end of the string. 1014ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1015ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru offsets.addOffset(inc); 1016ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1017ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(overlap==0) { 1018ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 1019ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1020ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru --overlap; 1021ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ++inc; 1022ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1023ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru s8+=length8; 1024ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1025ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else /* USET_SPAN_SIMPLE */ { 1026ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t maxInc=0, maxOverlap=0; 1027ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(i=0; i<stringsLength; ++i) { 1028ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru length8=utf8Lengths[i]; 1029ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(length8==0) { 1030ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru continue; // String not representable in UTF-8. 1031ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1032ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t overlap=spanUTF8Lengths[i]; 1033ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // For longest match, we do need to try to match even an all-contained string 1034ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // to find the match from the earliest start. 1035ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1036ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try to match this string at pos-overlap..pos. 1037ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(overlap>=LONG_SPAN) { 1038ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru overlap=length8; 1039ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Longest match: Need to match fully inside the code point span 1040ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // to find the match from the earliest start. 1041ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1042ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(overlap>spanLength) { 1043ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru overlap=spanLength; 1044ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1045ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t inc=length8-overlap; // Keep overlap+inc==length8. 1046ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(;;) { 1047ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(inc>rest || overlap<maxOverlap) { 1048ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 1049ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1050ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try to match if the string is longer or starts earlier. 1051ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Match at code point boundaries. (The UTF-8 strings were converted 1052ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // from UTF-16 and are guaranteed to be well-formed.) 1053ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if( !U8_IS_TRAIL(s[pos-overlap]) && 1054ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) && 1055ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru matches8(s+pos-overlap, s8, length8) 1056ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1057ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ) { 1058ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru maxInc=inc; // Longest match from earliest start. 1059ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru maxOverlap=overlap; 1060ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 1061ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1062ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru --overlap; 1063ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ++inc; 1064ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1065ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru s8+=length8; 1066ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1067ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1068ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(maxInc!=0 || maxOverlap!=0) { 1069ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Longest-match algorithm, and there was a string match. 1070ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Simply continue after it. 1071ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pos+=maxInc; 1072ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru rest-=maxInc; 1073ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(rest==0) { 1074ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return length; // Reached the end of the string. 1075ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1076ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength=0; // Match strings from after a string match. 1077ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru continue; 1078ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1079ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1080ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Finished trying to match all strings at pos. 1081ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1082ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanLength!=0 || pos==0) { 1083ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // The position is after an unlimited code point span (spanLength!=0), 1084ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // not after a string match. 1085ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // The only position where spanLength==0 after a span is pos==0. 1086ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Otherwise, an unlimited code point span is only tried again when no 1087ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // strings match, and if such a non-initial span fails we stop. 1088ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(offsets.isEmpty()) { 1089ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return pos; // No strings matched after a span. 1090ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1091ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Match strings from after the next string match. 1092ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 1093ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // The position is after a string match (or a single code point). 1094ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(offsets.isEmpty()) { 1095ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // No more strings matched after a previous string match. 1096ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try another code point span from after the last string match. 1097ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED); 1098ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if( spanLength==rest || // Reached the end of the string, or 1099ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength==0 // neither strings nor span progressed. 1100ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ) { 1101ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return pos+spanLength; 1102ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1103ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pos+=spanLength; 1104ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru rest-=spanLength; 1105ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru continue; // spanLength>0: Match strings from after a span. 1106ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 1107ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try to match only one code point from after a string match if some 1108ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // string matched beyond it, so that we try all possible positions 1109ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // and don't overshoot. 1110ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength=spanOneUTF8(spanSet, s+pos, rest); 1111ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanLength>0) { 1112ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanLength==rest) { 1113ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return length; // Reached the end of the string. 1114ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1115ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Match strings after this code point. 1116ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // There cannot be any increments below it because UnicodeSet strings 1117ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // contain multiple code points. 1118ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pos+=spanLength; 1119ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru rest-=spanLength; 1120ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru offsets.shift(spanLength); 1121ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength=0; 1122ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru continue; // Match strings from after a single code point. 1123ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1124ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Match strings from after the next string match. 1125ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1126ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1127ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t minOffset=offsets.popMinimum(); 1128ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pos+=minOffset; 1129ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru rest-=minOffset; 1130ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength=0; // Match strings from after a string match. 1131ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1132ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 1133ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1134ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { 1135ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanCondition==USET_SPAN_NOT_CONTAINED) { 1136ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return spanNotBackUTF8(s, length); 1137ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1138ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED); 1139ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(pos==0) { 1140ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; 1141ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1142ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t spanLength=length-pos; 1143ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1144ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Consider strings; they may overlap with the span. 1145ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru OffsetList offsets; 1146ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanCondition==USET_SPAN_CONTAINED) { 1147ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Use offset list to try all possibilities. 1148ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru offsets.setMaxLength(maxLength8); 1149ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1150ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t i, stringsLength=strings.size(); 1151ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint8_t *spanBackUTF8Lengths=spanLengths; 1152ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(all) { 1153ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanBackUTF8Lengths+=3*stringsLength; 1154ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1155ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(;;) { 1156ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const uint8_t *s8=utf8; 1157ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t length8; 1158ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanCondition==USET_SPAN_CONTAINED) { 1159ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(i=0; i<stringsLength; ++i) { 1160ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru length8=utf8Lengths[i]; 1161ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(length8==0) { 1162ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru continue; // String not representable in UTF-8. 1163ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1164ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t overlap=spanBackUTF8Lengths[i]; 1165ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(overlap==ALL_CP_CONTAINED) { 1166ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru s8+=length8; 1167ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru continue; // Irrelevant string. 1168ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1169ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1170ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try to match this string at pos-(length8-overlap)..pos-length8. 1171ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(overlap>=LONG_SPAN) { 1172ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru overlap=length8; 1173ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // While contained: No point matching fully inside the code point span. 1174ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t len1=0; 1175ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru U8_FWD_1(s8, len1, overlap); 1176ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru overlap-=len1; // Length of the string minus the first code point. 1177ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1178ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(overlap>spanLength) { 1179ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru overlap=spanLength; 1180ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1181ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t dec=length8-overlap; // Keep dec+overlap==length8. 1182ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(;;) { 1183ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(dec>pos) { 1184ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 1185ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1186ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try to match if the decrement is not listed already. 1187ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Match at code point boundaries. (The UTF-8 strings were converted 1188ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // from UTF-16 and are guaranteed to be well-formed.) 1189ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if( !U8_IS_TRAIL(s[pos-dec]) && 1190ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru !offsets.containsOffset(dec) && 1191ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru matches8(s+pos-dec, s8, length8) 1192ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ) { 1193ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(dec==pos) { 1194ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; // Reached the start of the string. 1195ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1196ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru offsets.addOffset(dec); 1197ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1198ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(overlap==0) { 1199ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 1200ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1201ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru --overlap; 1202ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ++dec; 1203ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1204ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru s8+=length8; 1205ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1206ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else /* USET_SPAN_SIMPLE */ { 1207ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t maxDec=0, maxOverlap=0; 1208ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(i=0; i<stringsLength; ++i) { 1209ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru length8=utf8Lengths[i]; 1210ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(length8==0) { 1211ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru continue; // String not representable in UTF-8. 1212ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1213ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t overlap=spanBackUTF8Lengths[i]; 1214ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // For longest match, we do need to try to match even an all-contained string 1215ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // to find the match from the latest end. 1216ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1217ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try to match this string at pos-(length8-overlap)..pos-length8. 1218ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(overlap>=LONG_SPAN) { 1219ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru overlap=length8; 1220ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Longest match: Need to match fully inside the code point span 1221ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // to find the match from the latest end. 1222ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1223ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(overlap>spanLength) { 1224ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru overlap=spanLength; 1225ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1226ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t dec=length8-overlap; // Keep dec+overlap==length8. 1227ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(;;) { 1228ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(dec>pos || overlap<maxOverlap) { 1229ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 1230ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1231ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try to match if the string is longer or ends later. 1232ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Match at code point boundaries. (The UTF-8 strings were converted 1233ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // from UTF-16 and are guaranteed to be well-formed.) 1234ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if( !U8_IS_TRAIL(s[pos-dec]) && 1235ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) && 1236ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru matches8(s+pos-dec, s8, length8) 1237ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ) { 1238ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru maxDec=dec; // Longest match from latest end. 1239ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru maxOverlap=overlap; 1240ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 1241ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1242ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru --overlap; 1243ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ++dec; 1244ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1245ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru s8+=length8; 1246ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1247ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1248ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(maxDec!=0 || maxOverlap!=0) { 1249ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Longest-match algorithm, and there was a string match. 1250ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Simply continue before it. 1251ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pos-=maxDec; 1252ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(pos==0) { 1253ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; // Reached the start of the string. 1254ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1255ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength=0; // Match strings from before a string match. 1256ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru continue; 1257ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1258ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1259ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Finished trying to match all strings at pos. 1260ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1261ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanLength!=0 || pos==length) { 1262ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // The position is before an unlimited code point span (spanLength!=0), 1263ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // not before a string match. 1264ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // The only position where spanLength==0 before a span is pos==length. 1265ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Otherwise, an unlimited code point span is only tried again when no 1266ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // strings match, and if such a non-initial span fails we stop. 1267ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(offsets.isEmpty()) { 1268ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return pos; // No strings matched before a span. 1269ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1270ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Match strings from before the next string match. 1271ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 1272ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // The position is before a string match (or a single code point). 1273ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(offsets.isEmpty()) { 1274ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // No more strings matched before a previous string match. 1275ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try another code point span from before the last string match. 1276ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t oldPos=pos; 1277ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED); 1278ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength=oldPos-pos; 1279ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if( pos==0 || // Reached the start of the string, or 1280ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength==0 // neither strings nor span progressed. 1281ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ) { 1282ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return pos; 1283ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1284ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru continue; // spanLength>0: Match strings from before a span. 1285ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 1286ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try to match only one code point from before a string match if some 1287ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // string matched beyond it, so that we try all possible positions 1288ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // and don't overshoot. 1289ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength=spanOneBackUTF8(spanSet, s, pos); 1290ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanLength>0) { 1291ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanLength==pos) { 1292ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; // Reached the start of the string. 1293ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1294ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Match strings before this code point. 1295ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // There cannot be any decrements below it because UnicodeSet strings 1296ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // contain multiple code points. 1297ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pos-=spanLength; 1298ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru offsets.shift(spanLength); 1299ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength=0; 1300ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru continue; // Match strings from before a single code point. 1301ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1302ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Match strings from before the next string match. 1303ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1304ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1305ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pos-=offsets.popMinimum(); 1306ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanLength=0; // Match strings from before a string match. 1307ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1308ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 1309ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1310ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 1311ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED) 1312ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 1313ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Theoretical algorithm: 1314ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * - Iterate through the string, and at each code point boundary: 1315ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * + If the code point there is in the set, then return with the current position. 1316ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * + If a set string matches at the current position, then return with the current position. 1317ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 1318ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Optimized implementation: 1319ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 1320ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * (Same assumption as for span() above.) 1321ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 1322ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Create and cache a spanNotSet which contains all of the single code points 1323ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * of the original set but none of its strings. 1324ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * For each set string add its initial code point to the spanNotSet. 1325ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * (Also add its final code point for spanNotBack().) 1326ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 1327ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * - Loop: 1328ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED). 1329ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * + If the current code point is in the original set, then 1330ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * return the current position. 1331ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * + If any set string matches at the current position, then 1332ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * return the current position. 1333ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * + If there is no match at the current position, neither for the code point there 1334ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * nor for any set string, then skip this code point and continue the loop. 1335ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * This happens for set-string-initial code points that were added to spanNotSet 1336ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * when there is not actually a match for such a set string. 1337ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 1338ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1339ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const { 1340ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t pos=0, rest=length; 1341ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t i, stringsLength=strings.size(); 1342ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru do { 1343ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Span until we find a code point from the set, 1344ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // or a code point that starts or ends some string. 1345ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED); 1346ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(i==rest) { 1347ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return length; // Reached the end of the string. 1348ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1349ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pos+=i; 1350ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru rest-=i; 1351ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1352ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Check whether the current code point is in the original set, 1353ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // without the string starts and ends. 1354ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t cpLength=spanOne(spanSet, s+pos, rest); 1355ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(cpLength>0) { 1356ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return pos; // There is a set element at pos. 1357ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1358ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1359ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try to match the strings at pos. 1360ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(i=0; i<stringsLength; ++i) { 1361ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanLengths[i]==ALL_CP_CONTAINED) { 1362ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru continue; // Irrelevant string. 1363ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1364ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 1365ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const UChar *s16=string.getBuffer(); 1366ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t length16=string.length(); 1367ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) { 1368ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return pos; // There is a set element at pos. 1369ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1370ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1371ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1372ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // The span(while not contained) ended on a string start/end which is 1373ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // not in the original set. Skip this code point and continue. 1374ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // cpLength<0 1375ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pos-=cpLength; 1376ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru rest+=cpLength; 1377ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } while(rest!=0); 1378ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return length; // Reached the end of the string. 1379ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 1380ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1381ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t UnicodeSetStringSpan::spanNotBack(const UChar *s, int32_t length) const { 1382ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t pos=length; 1383ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t i, stringsLength=strings.size(); 1384ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru do { 1385ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Span until we find a code point from the set, 1386ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // or a code point that starts or ends some string. 1387ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED); 1388ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(pos==0) { 1389ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; // Reached the start of the string. 1390ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1391ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1392ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Check whether the current code point is in the original set, 1393ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // without the string starts and ends. 1394ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t cpLength=spanOneBack(spanSet, s, pos); 1395ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(cpLength>0) { 1396ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return pos; // There is a set element at pos. 1397ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1398ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1399ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try to match the strings at pos. 1400ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(i=0; i<stringsLength; ++i) { 1401ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Use spanLengths rather than a spanBackLengths pointer because 1402ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // it is easier and we only need to know whether the string is irrelevant 1403ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // which is the same in either array. 1404ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanLengths[i]==ALL_CP_CONTAINED) { 1405ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru continue; // Irrelevant string. 1406ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1407ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 1408ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const UChar *s16=string.getBuffer(); 1409ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t length16=string.length(); 1410ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) { 1411ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return pos; // There is a set element at pos. 1412ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1413ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1414ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1415ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // The span(while not contained) ended on a string start/end which is 1416ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // not in the original set. Skip this code point and continue. 1417ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // cpLength<0 1418ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pos+=cpLength; 1419ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } while(pos!=0); 1420ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; // Reached the start of the string. 1421ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 1422ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1423ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const { 1424ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t pos=0, rest=length; 1425ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t i, stringsLength=strings.size(); 1426ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint8_t *spanUTF8Lengths=spanLengths; 1427ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(all) { 1428ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanUTF8Lengths+=2*stringsLength; 1429ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1430ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru do { 1431ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Span until we find a code point from the set, 1432ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // or a code point that starts or ends some string. 1433ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED); 1434ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(i==rest) { 1435ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return length; // Reached the end of the string. 1436ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1437ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pos+=i; 1438ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru rest-=i; 1439ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1440ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Check whether the current code point is in the original set, 1441ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // without the string starts and ends. 1442ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest); 1443ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(cpLength>0) { 1444ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return pos; // There is a set element at pos. 1445ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1446ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1447ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try to match the strings at pos. 1448ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const uint8_t *s8=utf8; 1449ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t length8; 1450ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(i=0; i<stringsLength; ++i) { 1451ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru length8=utf8Lengths[i]; 1452ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // ALL_CP_CONTAINED: Irrelevant string. 1453ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) { 1454ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return pos; // There is a set element at pos. 1455ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1456ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru s8+=length8; 1457ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1458ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1459ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // The span(while not contained) ended on a string start/end which is 1460ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // not in the original set. Skip this code point and continue. 1461ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // cpLength<0 1462ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pos-=cpLength; 1463ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru rest+=cpLength; 1464ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } while(rest!=0); 1465ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return length; // Reached the end of the string. 1466ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 1467ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1468ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const { 1469ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t pos=length; 1470ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t i, stringsLength=strings.size(); 1471ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint8_t *spanBackUTF8Lengths=spanLengths; 1472ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(all) { 1473ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanBackUTF8Lengths+=3*stringsLength; 1474ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1475ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru do { 1476ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Span until we find a code point from the set, 1477ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // or a code point that starts or ends some string. 1478ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED); 1479ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(pos==0) { 1480ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; // Reached the start of the string. 1481ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1482ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1483ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Check whether the current code point is in the original set, 1484ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // without the string starts and ends. 1485ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t cpLength=spanOneBackUTF8(spanSet, s, pos); 1486ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(cpLength>0) { 1487ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return pos; // There is a set element at pos. 1488ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1489ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1490ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Try to match the strings at pos. 1491ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const uint8_t *s8=utf8; 1492ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t length8; 1493ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(i=0; i<stringsLength; ++i) { 1494ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru length8=utf8Lengths[i]; 1495ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // ALL_CP_CONTAINED: Irrelevant string. 1496ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) { 1497ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return pos; // There is a set element at pos. 1498ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1499ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru s8+=length8; 1500ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1501ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1502ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // The span(while not contained) ended on a string start/end which is 1503ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // not in the original set. Skip this code point and continue. 1504ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // cpLength<0 1505ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pos+=cpLength; 1506ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } while(pos!=0); 1507ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; // Reached the start of the string. 1508ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 1509ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1510ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_END 1511