1f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/* 2f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)****************************************************************************** 3f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* 4f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* Copyright (C) 2007, International Business Machines 5f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* Corporation and others. All Rights Reserved. 6f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* 7f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)****************************************************************************** 8f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* file name: unisetspan.cpp 9f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* encoding: US-ASCII 10f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* tab size: 8 (not used) 11f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* indentation:4 12f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* 13f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* created on: 2007mar01 14f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* created by: Markus W. Scherer 15f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)*/ 16f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 17f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "unicode/utypes.h" 18f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "unicode/uniset.h" 19f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "unicode/ustring.h" 20f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "cmemory.h" 21f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "uvector.h" 22f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "unisetspan.h" 23f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 24f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)U_NAMESPACE_BEGIN 25f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 26f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/* 27f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * List of offsets from the current position from where to try matching 28f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * a code point or a string. 29f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Store offsets rather than indexes to simplify the code and use the same list 30f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * for both increments (in span()) and decrements (in spanBack()). 31f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * 32f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Assumption: The maximum offset is limited, and the offsets that are stored 33f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * at any one time are relatively dense, that is, there are normally no gaps of 34f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * hundreds or thousands of offset values. 35f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * 36f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * The implementation uses a circular buffer of byte flags, 37f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * each indicating whether the corresponding offset is in the list. 38f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * This avoids inserting into a sorted list of offsets (or absolute indexes) and 39f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * physically moving part of the list. 40f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * 41f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Note: In principle, the caller should setMaxLength() to the maximum of the 42f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * max string length and U16_LENGTH/U8_LENGTH to account for 43f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * "long" single code points. 44f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * However, this implementation uses at least a staticList with more than 45f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * U8_LENGTH entries anyway. 46f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * 47f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Note: If maxLength were guaranteed to be no more than 32 or 64, 48f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * the list could be stored as bit flags in a single integer. 49f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Rather than handling a circular buffer with a start list index, 50f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * the integer would simply be shifted when lower offsets are removed. 51f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * UnicodeSet does not have a limit on the lengths of strings. 52f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 53f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)class OffsetList { // Only ever stack-allocated, does not need to inherit UMemory. 54f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)public: 55f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) OffsetList() : list(staticList), capacity(0), length(0), start(0) {} 56f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 57f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ~OffsetList() { 58f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(list!=staticList) { 59f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) uprv_free(list); 60f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 61f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 62f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 63f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Call exactly once if the list is to be used. 64f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) void setMaxLength(int32_t maxLength) { 65f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(maxLength<=(int32_t)sizeof(staticList)) { 66f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) capacity=(int32_t)sizeof(staticList); 67f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else { 68f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UBool *l=(UBool *)uprv_malloc(maxLength); 69f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(l!=NULL) { 70f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) list=l; 71f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) capacity=maxLength; 72f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 73f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 74f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) uprv_memset(list, 0, capacity); 75f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 76f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 77f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) void clear() { 78f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) uprv_memset(list, 0, capacity); 79f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) start=length=0; 80f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 81f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 82f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UBool isEmpty() const { 83f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return (UBool)(length==0); 84f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 85f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 86f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Reduce all stored offsets by delta, used when the current position 87f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // moves by delta. 88f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // There must not be any offsets lower than delta. 89f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // If there is an offset equal to delta, it is removed. 90f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // delta=[1..maxLength] 91f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) void shift(int32_t delta) { 92f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t i=start+delta; 93f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(i>=capacity) { 94f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) i-=capacity; 95f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 96f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(list[i]) { 97f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) list[i]=FALSE; 98f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) --length; 99f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 100f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) start=i; 101f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 102f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 103f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Add an offset. The list must not contain it yet. 104f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // offset=[1..maxLength] 105f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) void addOffset(int32_t offset) { 106f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t i=start+offset; 107f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(i>=capacity) { 108f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) i-=capacity; 109f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 110f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) list[i]=TRUE; 111f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ++length; 112f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 113f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 114f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // offset=[1..maxLength] 115f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UBool containsOffset(int32_t offset) const { 116f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t i=start+offset; 117f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(i>=capacity) { 118f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) i-=capacity; 119f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 120f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return list[i]; 121f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 122f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 123f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Find the lowest stored offset from a non-empty list, remove it, 124f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // and reduce all other offsets by this minimum. 125f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Returns [1..maxLength]. 126f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t popMinimum() { 127f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Look for the next offset in list[start+1..capacity-1]. 128f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t i=start, result; 129f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) while(++i<capacity) { 130f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(list[i]) { 131f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) list[i]=FALSE; 132f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) --length; 133f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) result=i-start; 134f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) start=i; 135f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return result; 136f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 137f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 138f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // i==capacity 139f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 140f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Wrap around and look for the next offset in list[0..start]. 141f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Since the list is not empty, there will be one. 142f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) result=capacity-start; 143f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) i=0; 144f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) while(!list[i]) { 145f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ++i; 146f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 147f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) list[i]=FALSE; 148f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) --length; 149f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) start=i; 150f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return result+=i; 151f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 152f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 153f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)private: 154f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UBool *list; 155f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t capacity; 156f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t length; 157f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t start; 158f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 159f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UBool staticList[16]; 160f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)}; 161f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 162f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)// Get the number of UTF-8 bytes for a UTF-16 (sub)string. 163f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)static int32_t 164f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)getUTF8Length(const UChar *s, int32_t length) { 165f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UErrorCode errorCode=U_ZERO_ERROR; 166f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t length8=0; 167f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) u_strToUTF8(NULL, 0, &length8, s, length, &errorCode); 168f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) { 169f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return length8; 170f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else { 171f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // The string contains an unpaired surrogate. 172f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Ignore this string. 173f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return 0; 174f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 175f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 176f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 177f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)// Append the UTF-8 version of the string to t and return the appended UTF-8 length. 178f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)static int32_t 179f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)appendUTF8(const UChar *s, int32_t length, uint8_t *t, int32_t capacity) { 180f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UErrorCode errorCode=U_ZERO_ERROR; 181f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t length8=0; 182f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) u_strToUTF8((char *)t, capacity, &length8, s, length, &errorCode); 183f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(U_SUCCESS(errorCode)) { 184f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return length8; 185f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else { 186f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // The string contains an unpaired surrogate. 187f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Ignore this string. 188f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return 0; 189f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 190f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 191f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 192f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)static inline uint8_t 193f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)makeSpanLengthByte(int32_t spanLength) { 194f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // 0xfe==UnicodeSetStringSpan::LONG_SPAN 195f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe; 196f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 197f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 198f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)// Construct for all variants of span(), or only for any one variant. 199f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)// Initialize as little as possible, for single use. 200f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set, 201f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const UVector &setStrings, 202f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) uint32_t which) 203f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) : spanSet(0, 0x10ffff), pSpanNotSet(NULL), strings(setStrings), 204f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) utf8Lengths(NULL), spanLengths(NULL), utf8(NULL), 205f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) utf8Length(0), 206f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) maxLength16(0), maxLength8(0), 207f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) all((UBool)(which==ALL)) { 208f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanSet.retainAll(set); 209f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(which&NOT_CONTAINED) { 210f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Default to the same sets. 211f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // addToSpanNotSet() will create a separate set if necessary. 212f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) pSpanNotSet=&spanSet; 213f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 214f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 215f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Determine if the strings even need to be taken into account at all for span() etc. 216f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // If any string is relevant, then all strings need to be used for 217f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // span(longest match) but only the relevant ones for span(while contained). 218f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH 219f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // and do not store UTF-8 strings if !thisRelevant and CONTAINED. 220f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.) 221f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Also count the lengths of the UTF-8 versions of the strings for memory allocation. 222f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t stringsLength=strings.size(); 223f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 224f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t i, spanLength; 225f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UBool someRelevant=FALSE; 226f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) for(i=0; i<stringsLength; ++i) { 227f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 228f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const UChar *s16=string.getBuffer(); 229f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t length16=string.length(); 230f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UBool thisRelevant; 231f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED); 232f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(spanLength<length16) { // Relevant string. 233f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) someRelevant=thisRelevant=TRUE; 234f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else { 235f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) thisRelevant=FALSE; 236f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 237f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if((which&UTF16) && length16>maxLength16) { 238f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) maxLength16=length16; 239f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 240f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if((which&UTF8) && (thisRelevant || (which&CONTAINED))) { 241f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t length8=getUTF8Length(s16, length16); 242f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) utf8Length+=length8; 243f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(length8>maxLength8) { 244f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) maxLength8=length8; 245f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 246f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 247f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 248f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(!someRelevant) { 249f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) maxLength16=maxLength8=0; 250f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return; 251f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 252f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 253f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Freeze after checking for the need to use strings at all because freezing 254f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // a set takes some time and memory which are wasted if there are no relevant strings. 255f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(all) { 256f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanSet.freeze(); 257f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 258f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 259f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) uint8_t *spanBackLengths; 260f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) uint8_t *spanUTF8Lengths; 261f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) uint8_t *spanBackUTF8Lengths; 262f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 263f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Allocate a block of meta data. 264f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t allocSize; 265f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(all) { 266f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings. 267f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) allocSize=stringsLength*(4+1+1+1+1)+utf8Length; 268f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else { 269f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) allocSize=stringsLength; // One set of span lengths. 270f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(which&UTF8) { 271f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // UTF-8 lengths and UTF-8 strings. 272f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) allocSize+=stringsLength*4+utf8Length; 273f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 274f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 275f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(allocSize<=(int32_t)sizeof(staticLengths)) { 276f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) utf8Lengths=staticLengths; 277f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else { 278f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) utf8Lengths=(int32_t *)uprv_malloc(allocSize); 279f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(utf8Lengths==NULL) { 280f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE. 281f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return; // Out of memory. 282f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 283f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 284f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 285f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(all) { 286f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Store span lengths for all span() variants. 287f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanLengths=(uint8_t *)(utf8Lengths+stringsLength); 288f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanBackLengths=spanLengths+stringsLength; 289f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanUTF8Lengths=spanBackLengths+stringsLength; 290f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanBackUTF8Lengths=spanUTF8Lengths+stringsLength; 291f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) utf8=spanBackUTF8Lengths+stringsLength; 292f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else { 293f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Store span lengths for only one span() variant. 294f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(which&UTF8) { 295f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanLengths=(uint8_t *)(utf8Lengths+stringsLength); 296f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) utf8=spanLengths+stringsLength; 297f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else { 298f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanLengths=(uint8_t *)utf8Lengths; 299f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 300f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths; 301f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 302f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 303f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Set the meta data and pSpanNotSet and write the UTF-8 strings. 304f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t utf8Count=0; // Count UTF-8 bytes written so far. 305f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 306f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) for(i=0; i<stringsLength; ++i) { 307f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 308f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const UChar *s16=string.getBuffer(); 309f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t length16=string.length(); 310f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED); 311f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(spanLength<length16) { // Relevant string. 312f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(which&UTF16) { 313f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(which&CONTAINED) { 314f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(which&FWD) { 315f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanLengths[i]=makeSpanLengthByte(spanLength); 316f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 317f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(which&BACK) { 318f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED); 319f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanBackLengths[i]=makeSpanLengthByte(spanLength); 320f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 321f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else /* not CONTAINED, not all, but NOT_CONTAINED */ { 322f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanLengths[i]=spanBackLengths[i]=0; // Only store a relevant/irrelevant flag. 323f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 324f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 325f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(which&UTF8) { 326f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) uint8_t *s8=utf8+utf8Count; 327f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count); 328f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) utf8Count+=utf8Lengths[i]=length8; 329f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(length8==0) { // Irrelevant for UTF-8 because not representable in UTF-8. 330f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CONTAINED; 331f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else { // Relevant for UTF-8. 332f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(which&CONTAINED) { 333f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(which&FWD) { 334f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanLength=spanSet.spanUTF8((const char *)s8, length8, USET_SPAN_CONTAINED); 335f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanUTF8Lengths[i]=makeSpanLengthByte(spanLength); 336f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 337f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(which&BACK) { 338f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED); 339f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength); 340f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 341f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else /* not CONTAINED, not all, but NOT_CONTAINED */ { 342f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0; // Only store a relevant/irrelevant flag. 343f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 344f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 345f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 346f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(which&NOT_CONTAINED) { 347f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Add string start and end code points to the spanNotSet so that 348f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // a span(while not contained) stops before any string. 349f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UChar32 c; 350f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(which&FWD) { 351f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t len=0; 352f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) U16_NEXT(s16, len, length16, c); 353f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) addToSpanNotSet(c); 354f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 355f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(which&BACK) { 356f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t len=length16; 357f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) U16_PREV(s16, 0, len, c); 358f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) addToSpanNotSet(c); 359f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 360f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 361f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else { // Irrelevant string. 362f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(which&UTF8) { 363f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(which&CONTAINED) { // Only necessary for LONGEST_MATCH. 364f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) uint8_t *s8=utf8+utf8Count; 365f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count); 366f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) utf8Count+=utf8Lengths[i]=length8; 367f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else { 368f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) utf8Lengths[i]=0; 369f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 370f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 371f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(all) { 372f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanLengths[i]=spanBackLengths[i]= 373f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanUTF8Lengths[i]=spanBackUTF8Lengths[i]= 374f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) (uint8_t)ALL_CP_CONTAINED; 375f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else { 376f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // All spanXYZLengths pointers contain the same address. 377f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanLengths[i]=(uint8_t)ALL_CP_CONTAINED; 378f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 379f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 380f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 381f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 382f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Finish. 383f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(all) { 384f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) pSpanNotSet->freeze(); 385f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 386f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 387f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 388f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)// Copy constructor. Assumes which==ALL for a frozen set. 389f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan, 390f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const UVector &newParentSetStrings) 391f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) : spanSet(otherStringSpan.spanSet), pSpanNotSet(NULL), strings(newParentSetStrings), 392f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) utf8Lengths(NULL), spanLengths(NULL), utf8(NULL), 393f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) utf8Length(otherStringSpan.utf8Length), 394f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8), 395f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) all(TRUE) { 396f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) { 397f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) pSpanNotSet=&spanSet; 398f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else { 399f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) pSpanNotSet=(UnicodeSet *)otherStringSpan.pSpanNotSet->clone(); 400f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 401f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 402f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Allocate a block of meta data. 403f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings. 404f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t stringsLength=strings.size(); 405f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length; 406f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(allocSize<=(int32_t)sizeof(staticLengths)) { 407f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) utf8Lengths=staticLengths; 408f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else { 409f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) utf8Lengths=(int32_t *)uprv_malloc(allocSize); 410f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(utf8Lengths==NULL) { 411f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE. 412f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return; // Out of memory. 413f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 414f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 415f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 416f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanLengths=(uint8_t *)(utf8Lengths+stringsLength); 417f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) utf8=spanLengths+stringsLength*4; 418f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize); 419f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 420f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 421f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)UnicodeSetStringSpan::~UnicodeSetStringSpan() { 422f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(pSpanNotSet!=NULL && pSpanNotSet!=&spanSet) { 423f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) delete pSpanNotSet; 424f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 425f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(utf8Lengths!=NULL && utf8Lengths!=staticLengths) { 426f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) uprv_free(utf8Lengths); 427f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 428f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 429f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 430f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) { 431f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(pSpanNotSet==NULL || pSpanNotSet==&spanSet) { 432f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(spanSet.contains(c)) { 433f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return; // Nothing to do. 434f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 435f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UnicodeSet *newSet=(UnicodeSet *)spanSet.cloneAsThawed(); 436f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(newSet==NULL) { 437f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return; // Out of memory. 438f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else { 439f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) pSpanNotSet=newSet; 440f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 441f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 442f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) pSpanNotSet->add(c); 443f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 444f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 445f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)// Compare strings without any argument checks. Requires length>0. 446f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)static inline UBool 447f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)matches16(const UChar *s, const UChar *t, int32_t length) { 448f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) do { 449f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(*s++!=*t++) { 450f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return FALSE; 451f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 452f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } while(--length>0); 453f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return TRUE; 454f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 455f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 456f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)static inline UBool 457f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)matches8(const uint8_t *s, const uint8_t *t, int32_t length) { 458f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) do { 459f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(*s++!=*t++) { 460f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return FALSE; 461f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 462f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } while(--length>0); 463f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return TRUE; 464f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 465f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 466f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)// Compare 16-bit Unicode strings (which may be malformed UTF-16) 467f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)// at code point boundaries. 468f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)// That is, each edge of a match must not be in the middle of a surrogate pair. 469f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)static inline UBool 470f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)matches16CPB(const UChar *s, int32_t start, int32_t limit, const UChar *t, int32_t length) { 471f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) s+=start; 472f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) limit-=start; 473f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return matches16(s, t, length) && 474f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) && 475f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length])); 476f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 477f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 478f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)// Does the set contain the next code point? 479f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)// If so, return its length; otherwise return its negative length. 480f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)static inline int32_t 481f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)spanOne(const UnicodeSet &set, const UChar *s, int32_t length) { 482f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UChar c=*s, c2; 483f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) { 484f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2; 485f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 486f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return set.contains(c) ? 1 : -1; 487f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 488f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 489f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)static inline int32_t 490f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)spanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) { 491f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UChar c=s[length-1], c2; 492f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) { 493f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2; 494f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 495f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return set.contains(c) ? 1 : -1; 496f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 497f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 498f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)static inline int32_t 499f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) { 500f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UChar32 c=*s; 501f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if((int8_t)c>=0) { 502f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return set.contains(c) ? 1 : -1; 503f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 504f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Take advantage of non-ASCII fastpaths in U8_NEXT(). 505f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t i=0; 506f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) U8_NEXT(s, i, length, c); 507f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return set.contains(c) ? i : -i; 508f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 509f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 510f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)static inline int32_t 511f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) { 512f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UChar32 c=s[length-1]; 513f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if((int8_t)c>=0) { 514f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return set.contains(c) ? 1 : -1; 515f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 516f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t i=length-1; 517f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) c=utf8_prevCharSafeBody(s, 0, &i, c, -1); 518f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) length-=i; 519f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return set.contains(c) ? length : -length; 520f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 521f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 522f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/* 523f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Note: In span() when spanLength==0 (after a string match, or at the beginning 524f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * after an empty code point span) and in spanNot() and spanNotUTF8(), 525f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * string matching could use a binary search 526f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * because all string matches are done from the same start index. 527f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * 528f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * For UTF-8, this would require a comparison function that returns UTF-16 order. 529f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * 530f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * This optimization should not be necessary for normal UnicodeSets because 531f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * most sets have no strings, and most sets with strings have 532f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * very few very short strings. 533f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * For cases with many strings, it might be better to use a different API 534f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * and implementation with a DFA (state machine). 535f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 536f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 537f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/* 538f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Algorithm for span(USET_SPAN_CONTAINED) 539f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * 540f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Theoretical algorithm: 541f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * - Iterate through the string, and at each code point boundary: 542f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * + If the code point there is in the set, then remember to continue after it. 543f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * + If a set string matches at the current position, then remember to continue after it. 544f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * + Either recursively span for each code point or string match, 545f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * or recursively span for all but the shortest one and 546f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * iteratively continue the span with the shortest local match. 547f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * + Remember the longest recursive span (the farthest end point). 548f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * + If there is no match at the current position, neither for the code point there 549f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * nor for any set string, then stop and return the longest recursive span length. 550f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * 551f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Optimized implementation: 552f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * 553f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * (We assume that most sets will have very few very short strings. 554f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * A span using a string-less set is extremely fast.) 555f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * 556f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Create and cache a spanSet which contains all of the single code points 557f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * of the original set but none of its strings. 558f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * 559f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED). 560f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * - Loop: 561f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * + Try to match each set string at the end of the spanLength. 562f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * ~ Set strings that start with set-contained code points must be matched 563f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * with a partial overlap because the recursive algorithm would have tried 564f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * to match them at every position. 565f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * ~ Set strings that entirely consist of set-contained code points 566f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * are irrelevant for span(USET_SPAN_CONTAINED) because the 567f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * recursive algorithm would continue after them anyway 568f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * and find the longest recursive match from their end. 569f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * ~ Rather than recursing, note each end point of a set string match. 570f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * + If no set string matched after spanSet.span(), then return 571f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * with where the spanSet.span() ended. 572f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * + If at least one set string matched after spanSet.span(), then 573f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * pop the shortest string match end point and continue 574f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * the loop, trying to match all set strings from there. 575f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * + If at least one more set string matched after a previous string match, 576f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * then test if the code point after the previous string match is also 577f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * contained in the set. 578f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Continue the loop with the shortest end point of either this code point 579f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * or a matching set string. 580f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * + If no more set string matched after a previous string match, 581f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED). 582f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Stop if spanLength==0, otherwise continue the loop. 583f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * 584f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * By noting each end point of a set string match, 585f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * the function visits each string position at most once and finishes 586f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * in linear time. 587f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * 588f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * The recursive algorithm may visit the same string position many times 589f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * if multiple paths lead to it and finishes in exponential time. 590f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 591f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 592f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/* 593f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Algorithm for span(USET_SPAN_SIMPLE) 594f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * 595f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Theoretical algorithm: 596f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * - Iterate through the string, and at each code point boundary: 597f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * + If the code point there is in the set, then remember to continue after it. 598f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * + If a set string matches at the current position, then remember to continue after it. 599f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * + Continue from the farthest match position and ignore all others. 600f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * + If there is no match at the current position, 601f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * then stop and return the current position. 602f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * 603f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Optimized implementation: 604f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * 605f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * (Same assumption and spanSet as above.) 606f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * 607f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED). 608f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * - Loop: 609f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * + Try to match each set string at the end of the spanLength. 610f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * ~ Set strings that start with set-contained code points must be matched 611f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * with a partial overlap because the standard algorithm would have tried 612f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * to match them earlier. 613f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * ~ Set strings that entirely consist of set-contained code points 614f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * must be matched with a full overlap because the longest-match algorithm 615f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * would hide set string matches that end earlier. 616f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Such set strings need not be matched earlier inside the code point span 617f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * because the standard algorithm would then have continued after 618f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * the set string match anyway. 619f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * ~ Remember the longest set string match (farthest end point) from the earliest 620f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * starting point. 621f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * + If no set string matched after spanSet.span(), then return 622f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * with where the spanSet.span() ended. 623f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * + If at least one set string matched, then continue the loop after the 624f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * longest match from the earliest position. 625f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * + If no more set string matched after a previous string match, 626f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED). 627f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Stop if spanLength==0, otherwise continue the loop. 628f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 629f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 630f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)int32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const { 631f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(spanCondition==USET_SPAN_NOT_CONTAINED) { 632f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return spanNot(s, length); 633f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 634f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED); 635f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(spanLength==length) { 636f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return length; 637f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 638f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 639f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Consider strings; they may overlap with the span. 640f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) OffsetList offsets; 641f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(spanCondition==USET_SPAN_CONTAINED) { 642f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Use offset list to try all possibilities. 643f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) offsets.setMaxLength(maxLength16); 644f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 645f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t pos=spanLength, rest=length-pos; 646f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t i, stringsLength=strings.size(); 647f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) for(;;) { 648f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(spanCondition==USET_SPAN_CONTAINED) { 649f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) for(i=0; i<stringsLength; ++i) { 650f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t overlap=spanLengths[i]; 651f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(overlap==ALL_CP_CONTAINED) { 652f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) continue; // Irrelevant string. 653f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 654f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 655f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const UChar *s16=string.getBuffer(); 656f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t length16=string.length(); 657f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 658f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Try to match this string at pos-overlap..pos. 659f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(overlap>=LONG_SPAN) { 660f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) overlap=length16; 661f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // While contained: No point matching fully inside the code point span. 662f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) U16_BACK_1(s16, 0, overlap); // Length of the string minus the last code point. 663f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 664f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(overlap>spanLength) { 665f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) overlap=spanLength; 666f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 667f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t inc=length16-overlap; // Keep overlap+inc==length16. 668f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) for(;;) { 669f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(inc>rest) { 670f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) break; 671f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 672f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Try to match if the increment is not listed already. 673f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) { 674f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(inc==rest) { 675f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return length; // Reached the end of the string. 676f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 677f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) offsets.addOffset(inc); 678f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 679f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(overlap==0) { 680f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) break; 681f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 682f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) --overlap; 683f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ++inc; 684f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 685f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 686f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else /* USET_SPAN_SIMPLE */ { 687f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t maxInc=0, maxOverlap=0; 688f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) for(i=0; i<stringsLength; ++i) { 689f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t overlap=spanLengths[i]; 690f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // For longest match, we do need to try to match even an all-contained string 691f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // to find the match from the earliest start. 692f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 693f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 694f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const UChar *s16=string.getBuffer(); 695f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t length16=string.length(); 696f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 697f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Try to match this string at pos-overlap..pos. 698f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(overlap>=LONG_SPAN) { 699f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) overlap=length16; 700f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Longest match: Need to match fully inside the code point span 701f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // to find the match from the earliest start. 702f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 703f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(overlap>spanLength) { 704f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) overlap=spanLength; 705f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 706f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t inc=length16-overlap; // Keep overlap+inc==length16. 707f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) for(;;) { 708f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(inc>rest || overlap<maxOverlap) { 709f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) break; 710f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 711f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Try to match if the string is longer or starts earlier. 712f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) && 713f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) matches16CPB(s, pos-overlap, length, s16, length16) 714f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ) { 715f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) maxInc=inc; // Longest match from earliest start. 716f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) maxOverlap=overlap; 717f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) break; 718f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 719f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) --overlap; 720f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ++inc; 721f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 722f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 723f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 724f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(maxInc!=0 || maxOverlap!=0) { 725f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Longest-match algorithm, and there was a string match. 726f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Simply continue after it. 727f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) pos+=maxInc; 728f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) rest-=maxInc; 729f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(rest==0) { 730f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return length; // Reached the end of the string. 731f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 732f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanLength=0; // Match strings from after a string match. 733f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) continue; 734f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 735f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 736f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Finished trying to match all strings at pos. 737f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 738f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(spanLength!=0 || pos==0) { 739f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // The position is after an unlimited code point span (spanLength!=0), 740f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // not after a string match. 741f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // The only position where spanLength==0 after a span is pos==0. 742f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Otherwise, an unlimited code point span is only tried again when no 743f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // strings match, and if such a non-initial span fails we stop. 744f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(offsets.isEmpty()) { 745f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return pos; // No strings matched after a span. 746f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 747f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Match strings from after the next string match. 748f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else { 749f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // The position is after a string match (or a single code point). 750f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(offsets.isEmpty()) { 751f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // No more strings matched after a previous string match. 752f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Try another code point span from after the last string match. 753f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED); 754f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if( spanLength==rest || // Reached the end of the string, or 755f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanLength==0 // neither strings nor span progressed. 756f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ) { 757f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return pos+spanLength; 758f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 759f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) pos+=spanLength; 760f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) rest-=spanLength; 761f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) continue; // spanLength>0: Match strings from after a span. 762f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else { 763f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Try to match only one code point from after a string match if some 764f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // string matched beyond it, so that we try all possible positions 765f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // and don't overshoot. 766f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanLength=spanOne(spanSet, s+pos, rest); 767f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(spanLength>0) { 768f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(spanLength==rest) { 769f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return length; // Reached the end of the string. 770f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 771f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Match strings after this code point. 772f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // There cannot be any increments below it because UnicodeSet strings 773f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // contain multiple code points. 774f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) pos+=spanLength; 775f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) rest-=spanLength; 776f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) offsets.shift(spanLength); 777f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanLength=0; 778f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) continue; // Match strings from after a single code point. 779f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 780f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Match strings from after the next string match. 781f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 782f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 783f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t minOffset=offsets.popMinimum(); 784f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) pos+=minOffset; 785f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) rest-=minOffset; 786f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanLength=0; // Match strings from after a string match. 787f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 788f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 789f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 790f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)int32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const { 791f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(spanCondition==USET_SPAN_NOT_CONTAINED) { 792f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return spanNotBack(s, length); 793f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 794f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED); 795f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(pos==0) { 796f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return 0; 797f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 798f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t spanLength=length-pos; 799f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 800f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Consider strings; they may overlap with the span. 801f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) OffsetList offsets; 802f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(spanCondition==USET_SPAN_CONTAINED) { 803f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Use offset list to try all possibilities. 804f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) offsets.setMaxLength(maxLength16); 805f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 806f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t i, stringsLength=strings.size(); 807f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) uint8_t *spanBackLengths=spanLengths; 808f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(all) { 809f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanBackLengths+=stringsLength; 810f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 811f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) for(;;) { 812f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(spanCondition==USET_SPAN_CONTAINED) { 813f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) for(i=0; i<stringsLength; ++i) { 814f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t overlap=spanBackLengths[i]; 815f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(overlap==ALL_CP_CONTAINED) { 816f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) continue; // Irrelevant string. 817f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 818f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 819f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const UChar *s16=string.getBuffer(); 820f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t length16=string.length(); 821f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 822f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Try to match this string at pos-(length16-overlap)..pos-length16. 823f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(overlap>=LONG_SPAN) { 824f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) overlap=length16; 825f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // While contained: No point matching fully inside the code point span. 826f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t len1=0; 827f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) U16_FWD_1(s16, len1, overlap); 828f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) overlap-=len1; // Length of the string minus the first code point. 829f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 830f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(overlap>spanLength) { 831f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) overlap=spanLength; 832f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 833f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t dec=length16-overlap; // Keep dec+overlap==length16. 834f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) for(;;) { 835f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(dec>pos) { 836f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) break; 837f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 838f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Try to match if the decrement is not listed already. 839f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) { 840f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(dec==pos) { 841f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return 0; // Reached the start of the string. 842f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 843f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) offsets.addOffset(dec); 844f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 845f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(overlap==0) { 846f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) break; 847f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 848f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) --overlap; 849f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ++dec; 850f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 851f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 852f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else /* USET_SPAN_SIMPLE */ { 853f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t maxDec=0, maxOverlap=0; 854f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) for(i=0; i<stringsLength; ++i) { 855f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t overlap=spanBackLengths[i]; 856f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // For longest match, we do need to try to match even an all-contained string 857f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // to find the match from the latest end. 858f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 859f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 860f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const UChar *s16=string.getBuffer(); 861f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t length16=string.length(); 862f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 863f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Try to match this string at pos-(length16-overlap)..pos-length16. 864f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(overlap>=LONG_SPAN) { 865f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) overlap=length16; 866f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Longest match: Need to match fully inside the code point span 867f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // to find the match from the latest end. 868f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 869f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(overlap>spanLength) { 870f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) overlap=spanLength; 871f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 872f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t dec=length16-overlap; // Keep dec+overlap==length16. 873f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) for(;;) { 874f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(dec>pos || overlap<maxOverlap) { 875f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) break; 876f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 877f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Try to match if the string is longer or ends later. 878f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) && 879f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) matches16CPB(s, pos-dec, length, s16, length16) 880f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ) { 881f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) maxDec=dec; // Longest match from latest end. 882f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) maxOverlap=overlap; 883f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) break; 884f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 885f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) --overlap; 886f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ++dec; 887f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 888f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 889f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 890f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(maxDec!=0 || maxOverlap!=0) { 891f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Longest-match algorithm, and there was a string match. 892f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Simply continue before it. 893f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) pos-=maxDec; 894f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(pos==0) { 895f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return 0; // Reached the start of the string. 896f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 897f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanLength=0; // Match strings from before a string match. 898f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) continue; 899f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 900f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 901f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Finished trying to match all strings at pos. 902f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 903f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(spanLength!=0 || pos==length) { 904f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // The position is before an unlimited code point span (spanLength!=0), 905f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // not before a string match. 906f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // The only position where spanLength==0 before a span is pos==length. 907f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Otherwise, an unlimited code point span is only tried again when no 908f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // strings match, and if such a non-initial span fails we stop. 909f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(offsets.isEmpty()) { 910f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return pos; // No strings matched before a span. 911f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 912f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Match strings from before the next string match. 913f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else { 914f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // The position is before a string match (or a single code point). 915f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(offsets.isEmpty()) { 916f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // No more strings matched before a previous string match. 917f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Try another code point span from before the last string match. 918f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t oldPos=pos; 919f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED); 920f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanLength=oldPos-pos; 921f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if( pos==0 || // Reached the start of the string, or 922f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanLength==0 // neither strings nor span progressed. 923f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ) { 924f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return pos; 925f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 926f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) continue; // spanLength>0: Match strings from before a span. 927f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else { 928f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Try to match only one code point from before a string match if some 929f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // string matched beyond it, so that we try all possible positions 930f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // and don't overshoot. 931f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanLength=spanOneBack(spanSet, s, pos); 932f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(spanLength>0) { 933f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(spanLength==pos) { 934f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return 0; // Reached the start of the string. 935f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 936f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Match strings before this code point. 937f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // There cannot be any decrements below it because UnicodeSet strings 938f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // contain multiple code points. 939f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) pos-=spanLength; 940f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) offsets.shift(spanLength); 941f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanLength=0; 942f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) continue; // Match strings from before a single code point. 943f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 944f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Match strings from before the next string match. 945f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 946f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 947f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) pos-=offsets.popMinimum(); 948f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanLength=0; // Match strings from before a string match. 949f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 950f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 951f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 952f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { 953f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(spanCondition==USET_SPAN_NOT_CONTAINED) { 954f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return spanNotUTF8(s, length); 955f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 956f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED); 957f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(spanLength==length) { 958f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return length; 959f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 960f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 961f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Consider strings; they may overlap with the span. 962f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) OffsetList offsets; 963f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(spanCondition==USET_SPAN_CONTAINED) { 964f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Use offset list to try all possibilities. 965f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) offsets.setMaxLength(maxLength8); 966f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 967f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t pos=spanLength, rest=length-pos; 968f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t i, stringsLength=strings.size(); 969f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) uint8_t *spanUTF8Lengths=spanLengths; 970f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(all) { 971f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanUTF8Lengths+=2*stringsLength; 972f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 973f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) for(;;) { 974f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const uint8_t *s8=utf8; 975f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t length8; 976f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(spanCondition==USET_SPAN_CONTAINED) { 977f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) for(i=0; i<stringsLength; ++i) { 978f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) length8=utf8Lengths[i]; 979f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(length8==0) { 980f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) continue; // String not representable in UTF-8. 981f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 982f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t overlap=spanUTF8Lengths[i]; 983f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(overlap==ALL_CP_CONTAINED) { 984f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) s8+=length8; 985f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) continue; // Irrelevant string. 986f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 987f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 988f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Try to match this string at pos-overlap..pos. 989f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(overlap>=LONG_SPAN) { 990f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) overlap=length8; 991f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // While contained: No point matching fully inside the code point span. 992f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) U8_BACK_1(s8, 0, overlap); // Length of the string minus the last code point. 993f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 994f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(overlap>spanLength) { 995f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) overlap=spanLength; 996f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 997f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t inc=length8-overlap; // Keep overlap+inc==length8. 998f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) for(;;) { 999f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(inc>rest) { 1000f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) break; 1001f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1002f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Try to match if the increment is not listed already. 1003f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Match at code point boundaries. (The UTF-8 strings were converted 1004f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // from UTF-16 and are guaranteed to be well-formed.) 1005f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if( !U8_IS_TRAIL(s[pos-overlap]) && 1006f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) !offsets.containsOffset(inc) && 1007f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) matches8(s+pos-overlap, s8, length8) 1008f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1009f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ) { 1010f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(inc==rest) { 1011f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return length; // Reached the end of the string. 1012f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1013f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) offsets.addOffset(inc); 1014f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1015f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(overlap==0) { 1016f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) break; 1017f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1018f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) --overlap; 1019f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ++inc; 1020f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1021f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) s8+=length8; 1022f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1023f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else /* USET_SPAN_SIMPLE */ { 1024f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t maxInc=0, maxOverlap=0; 1025f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) for(i=0; i<stringsLength; ++i) { 1026f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) length8=utf8Lengths[i]; 1027f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(length8==0) { 1028f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) continue; // String not representable in UTF-8. 1029f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1030f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t overlap=spanUTF8Lengths[i]; 1031f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // For longest match, we do need to try to match even an all-contained string 1032f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // to find the match from the earliest start. 1033f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1034f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Try to match this string at pos-overlap..pos. 1035f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(overlap>=LONG_SPAN) { 1036f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) overlap=length8; 1037f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Longest match: Need to match fully inside the code point span 1038f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // to find the match from the earliest start. 1039f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1040f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(overlap>spanLength) { 1041f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) overlap=spanLength; 1042f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1043f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t inc=length8-overlap; // Keep overlap+inc==length8. 1044f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) for(;;) { 1045f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(inc>rest || overlap<maxOverlap) { 1046f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) break; 1047f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1048f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Try to match if the string is longer or starts earlier. 1049f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Match at code point boundaries. (The UTF-8 strings were converted 1050f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // from UTF-16 and are guaranteed to be well-formed.) 1051f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if( !U8_IS_TRAIL(s[pos-overlap]) && 1052f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) && 1053f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) matches8(s+pos-overlap, s8, length8) 1054f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1055f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ) { 1056f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) maxInc=inc; // Longest match from earliest start. 1057f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) maxOverlap=overlap; 1058f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) break; 1059f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1060f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) --overlap; 1061f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ++inc; 1062f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1063f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) s8+=length8; 1064f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1065f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1066f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(maxInc!=0 || maxOverlap!=0) { 1067f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Longest-match algorithm, and there was a string match. 1068f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Simply continue after it. 1069f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) pos+=maxInc; 1070f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) rest-=maxInc; 1071f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(rest==0) { 1072f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return length; // Reached the end of the string. 1073f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1074f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanLength=0; // Match strings from after a string match. 1075f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) continue; 1076f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1077f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1078f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Finished trying to match all strings at pos. 1079f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1080f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(spanLength!=0 || pos==0) { 1081f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // The position is after an unlimited code point span (spanLength!=0), 1082f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // not after a string match. 1083f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // The only position where spanLength==0 after a span is pos==0. 1084f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Otherwise, an unlimited code point span is only tried again when no 1085f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // strings match, and if such a non-initial span fails we stop. 1086f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(offsets.isEmpty()) { 1087f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return pos; // No strings matched after a span. 1088f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1089f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Match strings from after the next string match. 1090f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else { 1091f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // The position is after a string match (or a single code point). 1092f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(offsets.isEmpty()) { 1093f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // No more strings matched after a previous string match. 1094f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Try another code point span from after the last string match. 1095f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED); 1096f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if( spanLength==rest || // Reached the end of the string, or 1097f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanLength==0 // neither strings nor span progressed. 1098f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ) { 1099f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return pos+spanLength; 1100f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1101f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) pos+=spanLength; 1102f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) rest-=spanLength; 1103f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) continue; // spanLength>0: Match strings from after a span. 1104f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else { 1105f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Try to match only one code point from after a string match if some 1106f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // string matched beyond it, so that we try all possible positions 1107f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // and don't overshoot. 1108f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanLength=spanOneUTF8(spanSet, s+pos, rest); 1109f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(spanLength>0) { 1110f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(spanLength==rest) { 1111f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return length; // Reached the end of the string. 1112f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1113f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Match strings after this code point. 1114f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // There cannot be any increments below it because UnicodeSet strings 1115f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // contain multiple code points. 1116f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) pos+=spanLength; 1117f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) rest-=spanLength; 1118f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) offsets.shift(spanLength); 1119f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanLength=0; 1120f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) continue; // Match strings from after a single code point. 1121f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1122f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Match strings from after the next string match. 1123f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1124f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1125f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t minOffset=offsets.popMinimum(); 1126f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) pos+=minOffset; 1127f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) rest-=minOffset; 1128f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanLength=0; // Match strings from after a string match. 1129f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1130f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 1131f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1132f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { 1133f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(spanCondition==USET_SPAN_NOT_CONTAINED) { 1134f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return spanNotBackUTF8(s, length); 1135f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1136f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED); 1137f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(pos==0) { 1138f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return 0; 1139f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1140f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t spanLength=length-pos; 1141f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1142f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Consider strings; they may overlap with the span. 1143f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) OffsetList offsets; 1144f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(spanCondition==USET_SPAN_CONTAINED) { 1145f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Use offset list to try all possibilities. 1146f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) offsets.setMaxLength(maxLength8); 1147f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1148f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t i, stringsLength=strings.size(); 1149f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) uint8_t *spanBackUTF8Lengths=spanLengths; 1150f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(all) { 1151f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanBackUTF8Lengths+=3*stringsLength; 1152f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1153f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) for(;;) { 1154f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const uint8_t *s8=utf8; 1155f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t length8; 1156f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(spanCondition==USET_SPAN_CONTAINED) { 1157f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) for(i=0; i<stringsLength; ++i) { 1158f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) length8=utf8Lengths[i]; 1159f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(length8==0) { 1160f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) continue; // String not representable in UTF-8. 1161f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1162f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t overlap=spanBackUTF8Lengths[i]; 1163f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(overlap==ALL_CP_CONTAINED) { 1164f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) s8+=length8; 1165f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) continue; // Irrelevant string. 1166f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1167f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1168f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Try to match this string at pos-(length8-overlap)..pos-length8. 1169f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(overlap>=LONG_SPAN) { 1170f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) overlap=length8; 1171f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // While contained: No point matching fully inside the code point span. 1172f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t len1=0; 1173f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) U8_FWD_1(s8, len1, overlap); 1174f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) overlap-=len1; // Length of the string minus the first code point. 1175f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1176f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(overlap>spanLength) { 1177f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) overlap=spanLength; 1178f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1179f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t dec=length8-overlap; // Keep dec+overlap==length8. 1180f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) for(;;) { 1181f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(dec>pos) { 1182f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) break; 1183f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1184f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Try to match if the decrement is not listed already. 1185f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Match at code point boundaries. (The UTF-8 strings were converted 1186f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // from UTF-16 and are guaranteed to be well-formed.) 1187f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if( !U8_IS_TRAIL(s[pos-dec]) && 1188f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) !offsets.containsOffset(dec) && 1189f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) matches8(s+pos-dec, s8, length8) 1190f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ) { 1191f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(dec==pos) { 1192f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return 0; // Reached the start of the string. 1193f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1194f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) offsets.addOffset(dec); 1195f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1196f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(overlap==0) { 1197f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) break; 1198f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1199f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) --overlap; 1200f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ++dec; 1201f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1202f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) s8+=length8; 1203f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1204f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else /* USET_SPAN_SIMPLE */ { 1205f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t maxDec=0, maxOverlap=0; 1206f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) for(i=0; i<stringsLength; ++i) { 1207f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) length8=utf8Lengths[i]; 1208f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(length8==0) { 1209f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) continue; // String not representable in UTF-8. 1210f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1211f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t overlap=spanBackUTF8Lengths[i]; 1212f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // For longest match, we do need to try to match even an all-contained string 1213f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // to find the match from the latest end. 1214f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1215f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Try to match this string at pos-(length8-overlap)..pos-length8. 1216f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(overlap>=LONG_SPAN) { 1217f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) overlap=length8; 1218f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Longest match: Need to match fully inside the code point span 1219f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // to find the match from the latest end. 1220f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1221f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(overlap>spanLength) { 1222f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) overlap=spanLength; 1223f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1224f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t dec=length8-overlap; // Keep dec+overlap==length8. 1225f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) for(;;) { 1226f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(dec>pos || overlap<maxOverlap) { 1227f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) break; 1228f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1229f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Try to match if the string is longer or ends later. 1230f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Match at code point boundaries. (The UTF-8 strings were converted 1231f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // from UTF-16 and are guaranteed to be well-formed.) 1232f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if( !U8_IS_TRAIL(s[pos-dec]) && 1233f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) && 1234f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) matches8(s+pos-dec, s8, length8) 1235f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ) { 1236f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) maxDec=dec; // Longest match from latest end. 1237f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) maxOverlap=overlap; 1238f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) break; 1239f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1240f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) --overlap; 1241f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ++dec; 1242f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1243f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) s8+=length8; 1244f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1245f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1246f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(maxDec!=0 || maxOverlap!=0) { 1247f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Longest-match algorithm, and there was a string match. 1248f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Simply continue before it. 1249f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) pos-=maxDec; 1250f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(pos==0) { 1251f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return 0; // Reached the start of the string. 1252f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1253f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanLength=0; // Match strings from before a string match. 1254f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) continue; 1255f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1256f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1257f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Finished trying to match all strings at pos. 1258f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1259f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(spanLength!=0 || pos==length) { 1260f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // The position is before an unlimited code point span (spanLength!=0), 1261f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // not before a string match. 1262f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // The only position where spanLength==0 before a span is pos==length. 1263f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Otherwise, an unlimited code point span is only tried again when no 1264f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // strings match, and if such a non-initial span fails we stop. 1265f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(offsets.isEmpty()) { 1266f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return pos; // No strings matched before a span. 1267f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1268f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Match strings from before the next string match. 1269f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else { 1270f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // The position is before a string match (or a single code point). 1271f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(offsets.isEmpty()) { 1272f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // No more strings matched before a previous string match. 1273f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Try another code point span from before the last string match. 1274f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t oldPos=pos; 1275f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED); 1276f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanLength=oldPos-pos; 1277f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if( pos==0 || // Reached the start of the string, or 1278f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanLength==0 // neither strings nor span progressed. 1279f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ) { 1280f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return pos; 1281f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1282f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) continue; // spanLength>0: Match strings from before a span. 1283f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else { 1284f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Try to match only one code point from before a string match if some 1285f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // string matched beyond it, so that we try all possible positions 1286f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // and don't overshoot. 1287f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanLength=spanOneBackUTF8(spanSet, s, pos); 1288f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(spanLength>0) { 1289f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(spanLength==pos) { 1290f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return 0; // Reached the start of the string. 1291f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1292f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Match strings before this code point. 1293f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // There cannot be any decrements below it because UnicodeSet strings 1294f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // contain multiple code points. 1295f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) pos-=spanLength; 1296f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) offsets.shift(spanLength); 1297f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanLength=0; 1298f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) continue; // Match strings from before a single code point. 1299f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1300f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Match strings from before the next string match. 1301f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1302f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1303f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) pos-=offsets.popMinimum(); 1304f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanLength=0; // Match strings from before a string match. 1305f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1306f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 1307f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1308f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/* 1309f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED) 1310f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * 1311f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Theoretical algorithm: 1312f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * - Iterate through the string, and at each code point boundary: 1313f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * + If the code point there is in the set, then return with the current position. 1314f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * + If a set string matches at the current position, then return with the current position. 1315f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * 1316f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Optimized implementation: 1317f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * 1318f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * (Same assumption as for span() above.) 1319f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * 1320f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Create and cache a spanNotSet which contains all of the single code points 1321f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * of the original set but none of its strings. 1322f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * For each set string add its initial code point to the spanNotSet. 1323f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * (Also add its final code point for spanNotBack().) 1324f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * 1325f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * - Loop: 1326f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED). 1327f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * + If the current code point is in the original set, then 1328f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * return the current position. 1329f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * + If any set string matches at the current position, then 1330f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * return the current position. 1331f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * + If there is no match at the current position, neither for the code point there 1332f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * nor for any set string, then skip this code point and continue the loop. 1333f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * This happens for set-string-initial code points that were added to spanNotSet 1334f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * when there is not actually a match for such a set string. 1335f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 1336f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1337f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)int32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const { 1338f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t pos=0, rest=length; 1339f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t i, stringsLength=strings.size(); 1340f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) do { 1341f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Span until we find a code point from the set, 1342f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // or a code point that starts or ends some string. 1343f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED); 1344f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(i==rest) { 1345f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return length; // Reached the end of the string. 1346f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1347f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) pos+=i; 1348f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) rest-=i; 1349f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1350f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Check whether the current code point is in the original set, 1351f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // without the string starts and ends. 1352f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t cpLength=spanOne(spanSet, s+pos, rest); 1353f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(cpLength>0) { 1354f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return pos; // There is a set element at pos. 1355f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1356f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1357f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Try to match the strings at pos. 1358f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) for(i=0; i<stringsLength; ++i) { 1359f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(spanLengths[i]==ALL_CP_CONTAINED) { 1360f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) continue; // Irrelevant string. 1361f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1362f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 1363f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const UChar *s16=string.getBuffer(); 1364f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t length16=string.length(); 1365f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) { 1366f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return pos; // There is a set element at pos. 1367f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1368f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1369f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1370f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // The span(while not contained) ended on a string start/end which is 1371f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // not in the original set. Skip this code point and continue. 1372f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // cpLength<0 1373f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) pos-=cpLength; 1374f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) rest+=cpLength; 1375f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } while(rest!=0); 1376f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return length; // Reached the end of the string. 1377f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 1378f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1379f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)int32_t UnicodeSetStringSpan::spanNotBack(const UChar *s, int32_t length) const { 1380f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t pos=length; 1381f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t i, stringsLength=strings.size(); 1382f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) do { 1383f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Span until we find a code point from the set, 1384f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // or a code point that starts or ends some string. 1385f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED); 1386f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(pos==0) { 1387f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return 0; // Reached the start of the string. 1388f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1389f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1390f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Check whether the current code point is in the original set, 1391f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // without the string starts and ends. 1392f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t cpLength=spanOneBack(spanSet, s, pos); 1393f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(cpLength>0) { 1394f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return pos; // There is a set element at pos. 1395f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1396f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1397f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Try to match the strings at pos. 1398f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) for(i=0; i<stringsLength; ++i) { 1399f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Use spanLengths rather than a spanBackLengths pointer because 1400f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // it is easier and we only need to know whether the string is irrelevant 1401f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // which is the same in either array. 1402f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(spanLengths[i]==ALL_CP_CONTAINED) { 1403f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) continue; // Irrelevant string. 1404f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1405f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 1406f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const UChar *s16=string.getBuffer(); 1407f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t length16=string.length(); 1408f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) { 1409f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return pos; // There is a set element at pos. 1410f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1411f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1412f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1413f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // The span(while not contained) ended on a string start/end which is 1414f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // not in the original set. Skip this code point and continue. 1415f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // cpLength<0 1416f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) pos+=cpLength; 1417f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } while(pos!=0); 1418f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return 0; // Reached the start of the string. 1419f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 1420f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1421f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const { 1422f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t pos=0, rest=length; 1423f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t i, stringsLength=strings.size(); 1424f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) uint8_t *spanUTF8Lengths=spanLengths; 1425f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(all) { 1426f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanUTF8Lengths+=2*stringsLength; 1427f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1428f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) do { 1429f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Span until we find a code point from the set, 1430f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // or a code point that starts or ends some string. 1431f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED); 1432f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(i==rest) { 1433f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return length; // Reached the end of the string. 1434f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1435f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) pos+=i; 1436f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) rest-=i; 1437f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1438f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Check whether the current code point is in the original set, 1439f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // without the string starts and ends. 1440f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest); 1441f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(cpLength>0) { 1442f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return pos; // There is a set element at pos. 1443f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1444f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1445f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Try to match the strings at pos. 1446f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const uint8_t *s8=utf8; 1447f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t length8; 1448f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) for(i=0; i<stringsLength; ++i) { 1449f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) length8=utf8Lengths[i]; 1450f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // ALL_CP_CONTAINED: Irrelevant string. 1451f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) { 1452f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return pos; // There is a set element at pos. 1453f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1454f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) s8+=length8; 1455f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1456f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1457f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // The span(while not contained) ended on a string start/end which is 1458f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // not in the original set. Skip this code point and continue. 1459f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // cpLength<0 1460f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) pos-=cpLength; 1461f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) rest+=cpLength; 1462f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } while(rest!=0); 1463f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return length; // Reached the end of the string. 1464f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 1465f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1466f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const { 1467f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t pos=length; 1468f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t i, stringsLength=strings.size(); 1469f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) uint8_t *spanBackUTF8Lengths=spanLengths; 1470f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(all) { 1471f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) spanBackUTF8Lengths+=3*stringsLength; 1472f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1473f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) do { 1474f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Span until we find a code point from the set, 1475f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // or a code point that starts or ends some string. 1476f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED); 1477f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(pos==0) { 1478f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return 0; // Reached the start of the string. 1479f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1480f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1481f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Check whether the current code point is in the original set, 1482f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // without the string starts and ends. 1483f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t cpLength=spanOneBackUTF8(spanSet, s, pos); 1484f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(cpLength>0) { 1485f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return pos; // There is a set element at pos. 1486f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1487f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1488f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Try to match the strings at pos. 1489f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const uint8_t *s8=utf8; 1490f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t length8; 1491f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) for(i=0; i<stringsLength; ++i) { 1492f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) length8=utf8Lengths[i]; 1493f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // ALL_CP_CONTAINED: Irrelevant string. 1494f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) { 1495f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return pos; // There is a set element at pos. 1496f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1497f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) s8+=length8; 1498f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1499f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1500f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // The span(while not contained) ended on a string start/end which is 1501f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // not in the original set. Skip this code point and continue. 1502f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // cpLength<0 1503f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) pos+=cpLength; 1504f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } while(pos!=0); 1505f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return 0; // Reached the start of the string. 1506f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 1507f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1508f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)U_NAMESPACE_END 1509