16f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/* 26f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org****************************************************************************** 36f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* 46f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* Copyright (C) 2007-2012, International Business Machines 56f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* Corporation and others. All Rights Reserved. 66f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* 76f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org****************************************************************************** 86f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* file name: unisetspan.cpp 96f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* encoding: US-ASCII 106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* tab size: 8 (not used) 116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* indentation:4 126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* 136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* created on: 2007mar01 146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* created by: Markus W. Scherer 156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org*/ 166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/utypes.h" 186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/uniset.h" 196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/ustring.h" 206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/utf8.h" 216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/utf16.h" 226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "cmemory.h" 236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "uvector.h" 246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unisetspan.h" 256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgU_NAMESPACE_BEGIN 276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/* 296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * List of offsets from the current position from where to try matching 306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * a code point or a string. 316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Store offsets rather than indexes to simplify the code and use the same list 326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * for both increments (in span()) and decrements (in spanBack()). 336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Assumption: The maximum offset is limited, and the offsets that are stored 356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * at any one time are relatively dense, that is, there are normally no gaps of 366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * hundreds or thousands of offset values. 376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * The implementation uses a circular buffer of byte flags, 396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * each indicating whether the corresponding offset is in the list. 406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * This avoids inserting into a sorted list of offsets (or absolute indexes) and 416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * physically moving part of the list. 426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Note: In principle, the caller should setMaxLength() to the maximum of the 446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * max string length and U16_LENGTH/U8_LENGTH to account for 456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * "long" single code points. 466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * However, this implementation uses at least a staticList with more than 476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * U8_LENGTH entries anyway. 486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Note: If maxLength were guaranteed to be no more than 32 or 64, 506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * the list could be stored as bit flags in a single integer. 516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Rather than handling a circular buffer with a start list index, 526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * the integer would simply be shifted when lower offsets are removed. 536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * UnicodeSet does not have a limit on the lengths of strings. 546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgclass OffsetList { // Only ever stack-allocated, does not need to inherit UMemory. 566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgpublic: 576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org OffsetList() : list(staticList), capacity(0), length(0), start(0) {} 586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ~OffsetList() { 606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(list!=staticList) { 616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uprv_free(list); 626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Call exactly once if the list is to be used. 666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org void setMaxLength(int32_t maxLength) { 676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(maxLength<=(int32_t)sizeof(staticList)) { 686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org capacity=(int32_t)sizeof(staticList); 696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UBool *l=(UBool *)uprv_malloc(maxLength); 716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(l!=NULL) { 726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org list=l; 736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org capacity=maxLength; 746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uprv_memset(list, 0, capacity); 776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org void clear() { 806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uprv_memset(list, 0, capacity); 816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org start=length=0; 826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UBool isEmpty() const { 856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return (UBool)(length==0); 866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Reduce all stored offsets by delta, used when the current position 896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // moves by delta. 906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // There must not be any offsets lower than delta. 916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // If there is an offset equal to delta, it is removed. 926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // delta=[1..maxLength] 936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org void shift(int32_t delta) { 946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t i=start+delta; 956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(i>=capacity) { 966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org i-=capacity; 976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(list[i]) { 996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org list[i]=FALSE; 1006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org --length; 1016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org start=i; 1036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Add an offset. The list must not contain it yet. 1066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // offset=[1..maxLength] 1076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org void addOffset(int32_t offset) { 1086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t i=start+offset; 1096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(i>=capacity) { 1106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org i-=capacity; 1116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org list[i]=TRUE; 1136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ++length; 1146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // offset=[1..maxLength] 1176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UBool containsOffset(int32_t offset) const { 1186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t i=start+offset; 1196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(i>=capacity) { 1206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org i-=capacity; 1216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return list[i]; 1236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Find the lowest stored offset from a non-empty list, remove it, 1266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // and reduce all other offsets by this minimum. 1276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Returns [1..maxLength]. 1286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t popMinimum() { 1296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Look for the next offset in list[start+1..capacity-1]. 1306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t i=start, result; 1316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org while(++i<capacity) { 1326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(list[i]) { 1336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org list[i]=FALSE; 1346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org --length; 1356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org result=i-start; 1366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org start=i; 1376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return result; 1386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // i==capacity 1416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Wrap around and look for the next offset in list[0..start]. 1436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Since the list is not empty, there will be one. 1446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org result=capacity-start; 1456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org i=0; 1466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org while(!list[i]) { 1476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ++i; 1486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org list[i]=FALSE; 1506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org --length; 1516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org start=i; 1526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return result+=i; 1536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgprivate: 1566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UBool *list; 1576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t capacity; 1586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t length; 1596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t start; 1606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UBool staticList[16]; 1626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}; 1636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// Get the number of UTF-8 bytes for a UTF-16 (sub)string. 1656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgstatic int32_t 1666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orggetUTF8Length(const UChar *s, int32_t length) { 1676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UErrorCode errorCode=U_ZERO_ERROR; 1686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t length8=0; 1696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org u_strToUTF8(NULL, 0, &length8, s, length, &errorCode); 1706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) { 1716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return length8; 1726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 1736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // The string contains an unpaired surrogate. 1746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Ignore this string. 1756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return 0; 1766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 1786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// Append the UTF-8 version of the string to t and return the appended UTF-8 length. 1806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgstatic int32_t 1816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgappendUTF8(const UChar *s, int32_t length, uint8_t *t, int32_t capacity) { 1826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UErrorCode errorCode=U_ZERO_ERROR; 1836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t length8=0; 1846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org u_strToUTF8((char *)t, capacity, &length8, s, length, &errorCode); 1856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(U_SUCCESS(errorCode)) { 1866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return length8; 1876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 1886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // The string contains an unpaired surrogate. 1896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Ignore this string. 1906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return 0; 1916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 1936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgstatic inline uint8_t 1956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgmakeSpanLengthByte(int32_t spanLength) { 1966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // 0xfe==UnicodeSetStringSpan::LONG_SPAN 1976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe; 1986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 1996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// Construct for all variants of span(), or only for any one variant. 2016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// Initialize as little as possible, for single use. 2026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgUnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set, 2036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const UVector &setStrings, 2046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uint32_t which) 2056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org : spanSet(0, 0x10ffff), pSpanNotSet(NULL), strings(setStrings), 2066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utf8Lengths(NULL), spanLengths(NULL), utf8(NULL), 2076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utf8Length(0), 2086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org maxLength16(0), maxLength8(0), 2096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org all((UBool)(which==ALL)) { 2106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanSet.retainAll(set); 2116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(which&NOT_CONTAINED) { 2126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Default to the same sets. 2136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // addToSpanNotSet() will create a separate set if necessary. 2146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pSpanNotSet=&spanSet; 2156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Determine if the strings even need to be taken into account at all for span() etc. 2186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // If any string is relevant, then all strings need to be used for 2196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // span(longest match) but only the relevant ones for span(while contained). 2206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH 2216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // and do not store UTF-8 strings if !thisRelevant and CONTAINED. 2226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.) 2236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Also count the lengths of the UTF-8 versions of the strings for memory allocation. 2246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t stringsLength=strings.size(); 2256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t i, spanLength; 2276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UBool someRelevant=FALSE; 2286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(i=0; i<stringsLength; ++i) { 2296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 2306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const UChar *s16=string.getBuffer(); 2316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t length16=string.length(); 2326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UBool thisRelevant; 2336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED); 2346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(spanLength<length16) { // Relevant string. 2356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org someRelevant=thisRelevant=TRUE; 2366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 2376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org thisRelevant=FALSE; 2386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if((which&UTF16) && length16>maxLength16) { 2406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org maxLength16=length16; 2416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if((which&UTF8) && (thisRelevant || (which&CONTAINED))) { 2436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t length8=getUTF8Length(s16, length16); 2446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utf8Length+=length8; 2456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(length8>maxLength8) { 2466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org maxLength8=length8; 2476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(!someRelevant) { 2516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org maxLength16=maxLength8=0; 2526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return; 2536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Freeze after checking for the need to use strings at all because freezing 2566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // a set takes some time and memory which are wasted if there are no relevant strings. 2576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(all) { 2586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanSet.freeze(); 2596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uint8_t *spanBackLengths; 2626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uint8_t *spanUTF8Lengths; 2636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uint8_t *spanBackUTF8Lengths; 2646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Allocate a block of meta data. 2666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t allocSize; 2676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(all) { 2686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings. 2696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org allocSize=stringsLength*(4+1+1+1+1)+utf8Length; 2706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 2716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org allocSize=stringsLength; // One set of span lengths. 2726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(which&UTF8) { 2736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // UTF-8 lengths and UTF-8 strings. 2746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org allocSize+=stringsLength*4+utf8Length; 2756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(allocSize<=(int32_t)sizeof(staticLengths)) { 2786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utf8Lengths=staticLengths; 2796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 2806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utf8Lengths=(int32_t *)uprv_malloc(allocSize); 2816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(utf8Lengths==NULL) { 2826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE. 2836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return; // Out of memory. 2846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(all) { 2886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Store span lengths for all span() variants. 2896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanLengths=(uint8_t *)(utf8Lengths+stringsLength); 2906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanBackLengths=spanLengths+stringsLength; 2916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanUTF8Lengths=spanBackLengths+stringsLength; 2926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanBackUTF8Lengths=spanUTF8Lengths+stringsLength; 2936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utf8=spanBackUTF8Lengths+stringsLength; 2946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 2956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Store span lengths for only one span() variant. 2966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(which&UTF8) { 2976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanLengths=(uint8_t *)(utf8Lengths+stringsLength); 2986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utf8=spanLengths+stringsLength; 2996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 3006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanLengths=(uint8_t *)utf8Lengths; 3016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths; 3036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Set the meta data and pSpanNotSet and write the UTF-8 strings. 3066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t utf8Count=0; // Count UTF-8 bytes written so far. 3076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(i=0; i<stringsLength; ++i) { 3096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 3106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const UChar *s16=string.getBuffer(); 3116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t length16=string.length(); 3126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED); 3136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(spanLength<length16) { // Relevant string. 3146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(which&UTF16) { 3156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(which&CONTAINED) { 3166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(which&FWD) { 3176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanLengths[i]=makeSpanLengthByte(spanLength); 3186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(which&BACK) { 3206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED); 3216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanBackLengths[i]=makeSpanLengthByte(spanLength); 3226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else /* not CONTAINED, not all, but NOT_CONTAINED */ { 3246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanLengths[i]=spanBackLengths[i]=0; // Only store a relevant/irrelevant flag. 3256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(which&UTF8) { 3286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uint8_t *s8=utf8+utf8Count; 3296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count); 3306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utf8Count+=utf8Lengths[i]=length8; 3316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(length8==0) { // Irrelevant for UTF-8 because not representable in UTF-8. 3326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CONTAINED; 3336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { // Relevant for UTF-8. 3346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(which&CONTAINED) { 3356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(which&FWD) { 3366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanLength=spanSet.spanUTF8((const char *)s8, length8, USET_SPAN_CONTAINED); 3376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanUTF8Lengths[i]=makeSpanLengthByte(spanLength); 3386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(which&BACK) { 3406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED); 3416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength); 3426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else /* not CONTAINED, not all, but NOT_CONTAINED */ { 3446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0; // Only store a relevant/irrelevant flag. 3456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(which&NOT_CONTAINED) { 3496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Add string start and end code points to the spanNotSet so that 3506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // a span(while not contained) stops before any string. 3516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UChar32 c; 3526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(which&FWD) { 3536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t len=0; 3546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org U16_NEXT(s16, len, length16, c); 3556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org addToSpanNotSet(c); 3566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(which&BACK) { 3586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t len=length16; 3596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org U16_PREV(s16, 0, len, c); 3606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org addToSpanNotSet(c); 3616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { // Irrelevant string. 3646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(which&UTF8) { 3656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(which&CONTAINED) { // Only necessary for LONGEST_MATCH. 3666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uint8_t *s8=utf8+utf8Count; 3676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count); 3686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utf8Count+=utf8Lengths[i]=length8; 3696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 3706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utf8Lengths[i]=0; 3716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(all) { 3746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanLengths[i]=spanBackLengths[i]= 3756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanUTF8Lengths[i]=spanBackUTF8Lengths[i]= 3766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org (uint8_t)ALL_CP_CONTAINED; 3776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 3786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // All spanXYZLengths pointers contain the same address. 3796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanLengths[i]=(uint8_t)ALL_CP_CONTAINED; 3806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Finish. 3856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(all) { 3866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pSpanNotSet->freeze(); 3876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 3896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// Copy constructor. Assumes which==ALL for a frozen set. 3916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgUnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan, 3926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const UVector &newParentSetStrings) 3936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org : spanSet(otherStringSpan.spanSet), pSpanNotSet(NULL), strings(newParentSetStrings), 3946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utf8Lengths(NULL), spanLengths(NULL), utf8(NULL), 3956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utf8Length(otherStringSpan.utf8Length), 3966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8), 3976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org all(TRUE) { 3986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) { 3996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pSpanNotSet=&spanSet; 4006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 4016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pSpanNotSet=(UnicodeSet *)otherStringSpan.pSpanNotSet->clone(); 4026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Allocate a block of meta data. 4056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings. 4066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t stringsLength=strings.size(); 4076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length; 4086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(allocSize<=(int32_t)sizeof(staticLengths)) { 4096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utf8Lengths=staticLengths; 4106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 4116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utf8Lengths=(int32_t *)uprv_malloc(allocSize); 4126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(utf8Lengths==NULL) { 4136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE. 4146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return; // Out of memory. 4156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanLengths=(uint8_t *)(utf8Lengths+stringsLength); 4196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utf8=spanLengths+stringsLength*4; 4206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize); 4216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 4226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgUnicodeSetStringSpan::~UnicodeSetStringSpan() { 4246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(pSpanNotSet!=NULL && pSpanNotSet!=&spanSet) { 4256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org delete pSpanNotSet; 4266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(utf8Lengths!=NULL && utf8Lengths!=staticLengths) { 4286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uprv_free(utf8Lengths); 4296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 4316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgvoid UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) { 4336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(pSpanNotSet==NULL || pSpanNotSet==&spanSet) { 4346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(spanSet.contains(c)) { 4356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return; // Nothing to do. 4366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeSet *newSet=(UnicodeSet *)spanSet.cloneAsThawed(); 4386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(newSet==NULL) { 4396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return; // Out of memory. 4406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 4416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pSpanNotSet=newSet; 4426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pSpanNotSet->add(c); 4456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 4466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// Compare strings without any argument checks. Requires length>0. 4486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgstatic inline UBool 4496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgmatches16(const UChar *s, const UChar *t, int32_t length) { 4506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org do { 4516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(*s++!=*t++) { 4526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return FALSE; 4536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } while(--length>0); 4556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return TRUE; 4566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 4576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgstatic inline UBool 4596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgmatches8(const uint8_t *s, const uint8_t *t, int32_t length) { 4606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org do { 4616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(*s++!=*t++) { 4626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return FALSE; 4636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } while(--length>0); 4656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return TRUE; 4666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 4676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// Compare 16-bit Unicode strings (which may be malformed UTF-16) 4696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// at code point boundaries. 4706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// That is, each edge of a match must not be in the middle of a surrogate pair. 4716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgstatic inline UBool 4726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgmatches16CPB(const UChar *s, int32_t start, int32_t limit, const UChar *t, int32_t length) { 4736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org s+=start; 4746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org limit-=start; 4756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return matches16(s, t, length) && 4766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) && 4776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length])); 4786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 4796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// Does the set contain the next code point? 4816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// If so, return its length; otherwise return its negative length. 4826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgstatic inline int32_t 4836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgspanOne(const UnicodeSet &set, const UChar *s, int32_t length) { 4846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UChar c=*s, c2; 4856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) { 4866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2; 4876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return set.contains(c) ? 1 : -1; 4896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 4906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgstatic inline int32_t 4926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgspanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) { 4936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UChar c=s[length-1], c2; 4946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) { 4956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2; 4966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return set.contains(c) ? 1 : -1; 4986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 4996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgstatic inline int32_t 5016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgspanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) { 5026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UChar32 c=*s; 5036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if((int8_t)c>=0) { 5046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return set.contains(c) ? 1 : -1; 5056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD(). 5076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t i=0; 5086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org U8_NEXT_OR_FFFD(s, i, length, c); 5096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return set.contains(c) ? i : -i; 5106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 5116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgstatic inline int32_t 5136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgspanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) { 5146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UChar32 c=s[length-1]; 5156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if((int8_t)c>=0) { 5166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return set.contains(c) ? 1 : -1; 5176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t i=length-1; 5196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org c=utf8_prevCharSafeBody(s, 0, &i, c, -3); 5206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org length-=i; 5216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return set.contains(c) ? length : -length; 5226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 5236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/* 5256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Note: In span() when spanLength==0 (after a string match, or at the beginning 5266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * after an empty code point span) and in spanNot() and spanNotUTF8(), 5276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * string matching could use a binary search 5286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * because all string matches are done from the same start index. 5296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 5306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * For UTF-8, this would require a comparison function that returns UTF-16 order. 5316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 5326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * This optimization should not be necessary for normal UnicodeSets because 5336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * most sets have no strings, and most sets with strings have 5346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * very few very short strings. 5356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * For cases with many strings, it might be better to use a different API 5366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * and implementation with a DFA (state machine). 5376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 5386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/* 5406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Algorithm for span(USET_SPAN_CONTAINED) 5416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 5426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Theoretical algorithm: 5436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * - Iterate through the string, and at each code point boundary: 5446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * + If the code point there is in the set, then remember to continue after it. 5456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * + If a set string matches at the current position, then remember to continue after it. 5466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * + Either recursively span for each code point or string match, 5476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * or recursively span for all but the shortest one and 5486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * iteratively continue the span with the shortest local match. 5496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * + Remember the longest recursive span (the farthest end point). 5506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * + If there is no match at the current position, neither for the code point there 5516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * nor for any set string, then stop and return the longest recursive span length. 5526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 5536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Optimized implementation: 5546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 5556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * (We assume that most sets will have very few very short strings. 5566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * A span using a string-less set is extremely fast.) 5576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 5586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Create and cache a spanSet which contains all of the single code points 5596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * of the original set but none of its strings. 5606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 5616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED). 5626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * - Loop: 5636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * + Try to match each set string at the end of the spanLength. 5646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * ~ Set strings that start with set-contained code points must be matched 5656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * with a partial overlap because the recursive algorithm would have tried 5666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * to match them at every position. 5676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * ~ Set strings that entirely consist of set-contained code points 5686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * are irrelevant for span(USET_SPAN_CONTAINED) because the 5696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * recursive algorithm would continue after them anyway 5706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * and find the longest recursive match from their end. 5716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * ~ Rather than recursing, note each end point of a set string match. 5726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * + If no set string matched after spanSet.span(), then return 5736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * with where the spanSet.span() ended. 5746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * + If at least one set string matched after spanSet.span(), then 5756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * pop the shortest string match end point and continue 5766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * the loop, trying to match all set strings from there. 5776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * + If at least one more set string matched after a previous string match, 5786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * then test if the code point after the previous string match is also 5796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * contained in the set. 5806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Continue the loop with the shortest end point of either this code point 5816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * or a matching set string. 5826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * + If no more set string matched after a previous string match, 5836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED). 5846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Stop if spanLength==0, otherwise continue the loop. 5856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 5866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * By noting each end point of a set string match, 5876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * the function visits each string position at most once and finishes 5886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * in linear time. 5896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 5906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * The recursive algorithm may visit the same string position many times 5916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * if multiple paths lead to it and finishes in exponential time. 5926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 5936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/* 5956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Algorithm for span(USET_SPAN_SIMPLE) 5966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 5976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Theoretical algorithm: 5986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * - Iterate through the string, and at each code point boundary: 5996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * + If the code point there is in the set, then remember to continue after it. 6006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * + If a set string matches at the current position, then remember to continue after it. 6016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * + Continue from the farthest match position and ignore all others. 6026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * + If there is no match at the current position, 6036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * then stop and return the current position. 6046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 6056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Optimized implementation: 6066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 6076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * (Same assumption and spanSet as above.) 6086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 6096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED). 6106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * - Loop: 6116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * + Try to match each set string at the end of the spanLength. 6126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * ~ Set strings that start with set-contained code points must be matched 6136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * with a partial overlap because the standard algorithm would have tried 6146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * to match them earlier. 6156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * ~ Set strings that entirely consist of set-contained code points 6166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * must be matched with a full overlap because the longest-match algorithm 6176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * would hide set string matches that end earlier. 6186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Such set strings need not be matched earlier inside the code point span 6196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * because the standard algorithm would then have continued after 6206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * the set string match anyway. 6216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * ~ Remember the longest set string match (farthest end point) from the earliest 6226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * starting point. 6236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * + If no set string matched after spanSet.span(), then return 6246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * with where the spanSet.span() ended. 6256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * + If at least one set string matched, then continue the loop after the 6266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * longest match from the earliest position. 6276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * + If no more set string matched after a previous string match, 6286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED). 6296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Stop if spanLength==0, otherwise continue the loop. 6306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 6316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 6326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgint32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const { 6336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(spanCondition==USET_SPAN_NOT_CONTAINED) { 6346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return spanNot(s, length); 6356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED); 6376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(spanLength==length) { 6386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return length; 6396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 6416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Consider strings; they may overlap with the span. 6426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org OffsetList offsets; 6436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(spanCondition==USET_SPAN_CONTAINED) { 6446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Use offset list to try all possibilities. 6456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org offsets.setMaxLength(maxLength16); 6466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t pos=spanLength, rest=length-pos; 6486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t i, stringsLength=strings.size(); 6496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(;;) { 6506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(spanCondition==USET_SPAN_CONTAINED) { 6516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(i=0; i<stringsLength; ++i) { 6526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t overlap=spanLengths[i]; 6536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(overlap==ALL_CP_CONTAINED) { 6546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org continue; // Irrelevant string. 6556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 6576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const UChar *s16=string.getBuffer(); 6586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t length16=string.length(); 6596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 6606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Try to match this string at pos-overlap..pos. 6616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(overlap>=LONG_SPAN) { 6626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org overlap=length16; 6636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // While contained: No point matching fully inside the code point span. 6646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org U16_BACK_1(s16, 0, overlap); // Length of the string minus the last code point. 6656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(overlap>spanLength) { 6676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org overlap=spanLength; 6686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t inc=length16-overlap; // Keep overlap+inc==length16. 6706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(;;) { 6716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(inc>rest) { 6726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 6736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Try to match if the increment is not listed already. 6756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) { 6766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(inc==rest) { 6776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return length; // Reached the end of the string. 6786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org offsets.addOffset(inc); 6806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(overlap==0) { 6826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 6836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org --overlap; 6856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ++inc; 6866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else /* USET_SPAN_SIMPLE */ { 6896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t maxInc=0, maxOverlap=0; 6906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(i=0; i<stringsLength; ++i) { 6916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t overlap=spanLengths[i]; 6926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // For longest match, we do need to try to match even an all-contained string 6936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // to find the match from the earliest start. 6946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 6956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 6966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const UChar *s16=string.getBuffer(); 6976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t length16=string.length(); 6986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 6996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Try to match this string at pos-overlap..pos. 7006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(overlap>=LONG_SPAN) { 7016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org overlap=length16; 7026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Longest match: Need to match fully inside the code point span 7036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // to find the match from the earliest start. 7046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 7056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(overlap>spanLength) { 7066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org overlap=spanLength; 7076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 7086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t inc=length16-overlap; // Keep overlap+inc==length16. 7096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(;;) { 7106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(inc>rest || overlap<maxOverlap) { 7116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 7126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 7136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Try to match if the string is longer or starts earlier. 7146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) && 7156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org matches16CPB(s, pos-overlap, length, s16, length16) 7166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ) { 7176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org maxInc=inc; // Longest match from earliest start. 7186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org maxOverlap=overlap; 7196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 7206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 7216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org --overlap; 7226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ++inc; 7236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 7246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 7256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 7266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(maxInc!=0 || maxOverlap!=0) { 7276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Longest-match algorithm, and there was a string match. 7286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Simply continue after it. 7296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pos+=maxInc; 7306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org rest-=maxInc; 7316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(rest==0) { 7326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return length; // Reached the end of the string. 7336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 7346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanLength=0; // Match strings from after a string match. 7356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org continue; 7366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 7376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 7386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Finished trying to match all strings at pos. 7396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 7406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(spanLength!=0 || pos==0) { 7416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // The position is after an unlimited code point span (spanLength!=0), 7426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // not after a string match. 7436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // The only position where spanLength==0 after a span is pos==0. 7446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Otherwise, an unlimited code point span is only tried again when no 7456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // strings match, and if such a non-initial span fails we stop. 7466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(offsets.isEmpty()) { 7476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return pos; // No strings matched after a span. 7486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 7496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Match strings from after the next string match. 7506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 7516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // The position is after a string match (or a single code point). 7526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(offsets.isEmpty()) { 7536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // No more strings matched after a previous string match. 7546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Try another code point span from after the last string match. 7556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED); 7566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if( spanLength==rest || // Reached the end of the string, or 7576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanLength==0 // neither strings nor span progressed. 7586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ) { 7596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return pos+spanLength; 7606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 7616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pos+=spanLength; 7626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org rest-=spanLength; 7636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org continue; // spanLength>0: Match strings from after a span. 7646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 7656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Try to match only one code point from after a string match if some 7666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // string matched beyond it, so that we try all possible positions 7676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // and don't overshoot. 7686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanLength=spanOne(spanSet, s+pos, rest); 7696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(spanLength>0) { 7706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(spanLength==rest) { 7716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return length; // Reached the end of the string. 7726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 7736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Match strings after this code point. 7746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // There cannot be any increments below it because UnicodeSet strings 7756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // contain multiple code points. 7766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pos+=spanLength; 7776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org rest-=spanLength; 7786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org offsets.shift(spanLength); 7796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanLength=0; 7806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org continue; // Match strings from after a single code point. 7816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 7826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Match strings from after the next string match. 7836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 7846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 7856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t minOffset=offsets.popMinimum(); 7866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pos+=minOffset; 7876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org rest-=minOffset; 7886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanLength=0; // Match strings from after a string match. 7896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 7906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 7916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 7926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgint32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const { 7936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(spanCondition==USET_SPAN_NOT_CONTAINED) { 7946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return spanNotBack(s, length); 7956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 7966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED); 7976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(pos==0) { 7986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return 0; 7996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 8006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t spanLength=length-pos; 8016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 8026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Consider strings; they may overlap with the span. 8036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org OffsetList offsets; 8046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(spanCondition==USET_SPAN_CONTAINED) { 8056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Use offset list to try all possibilities. 8066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org offsets.setMaxLength(maxLength16); 8076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 8086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t i, stringsLength=strings.size(); 8096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uint8_t *spanBackLengths=spanLengths; 8106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(all) { 8116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanBackLengths+=stringsLength; 8126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 8136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(;;) { 8146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(spanCondition==USET_SPAN_CONTAINED) { 8156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(i=0; i<stringsLength; ++i) { 8166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t overlap=spanBackLengths[i]; 8176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(overlap==ALL_CP_CONTAINED) { 8186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org continue; // Irrelevant string. 8196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 8206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 8216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const UChar *s16=string.getBuffer(); 8226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t length16=string.length(); 8236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 8246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Try to match this string at pos-(length16-overlap)..pos-length16. 8256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(overlap>=LONG_SPAN) { 8266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org overlap=length16; 8276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // While contained: No point matching fully inside the code point span. 8286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t len1=0; 8296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org U16_FWD_1(s16, len1, overlap); 8306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org overlap-=len1; // Length of the string minus the first code point. 8316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 8326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(overlap>spanLength) { 8336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org overlap=spanLength; 8346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 8356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t dec=length16-overlap; // Keep dec+overlap==length16. 8366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(;;) { 8376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(dec>pos) { 8386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 8396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 8406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Try to match if the decrement is not listed already. 8416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) { 8426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(dec==pos) { 8436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return 0; // Reached the start of the string. 8446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 8456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org offsets.addOffset(dec); 8466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 8476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(overlap==0) { 8486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 8496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 8506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org --overlap; 8516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ++dec; 8526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 8536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 8546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else /* USET_SPAN_SIMPLE */ { 8556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t maxDec=0, maxOverlap=0; 8566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(i=0; i<stringsLength; ++i) { 8576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t overlap=spanBackLengths[i]; 8586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // For longest match, we do need to try to match even an all-contained string 8596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // to find the match from the latest end. 8606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 8616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 8626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const UChar *s16=string.getBuffer(); 8636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t length16=string.length(); 8646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 8656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Try to match this string at pos-(length16-overlap)..pos-length16. 8666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(overlap>=LONG_SPAN) { 8676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org overlap=length16; 8686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Longest match: Need to match fully inside the code point span 8696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // to find the match from the latest end. 8706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 8716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(overlap>spanLength) { 8726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org overlap=spanLength; 8736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 8746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t dec=length16-overlap; // Keep dec+overlap==length16. 8756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(;;) { 8766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(dec>pos || overlap<maxOverlap) { 8776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 8786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 8796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Try to match if the string is longer or ends later. 8806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) && 8816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org matches16CPB(s, pos-dec, length, s16, length16) 8826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ) { 8836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org maxDec=dec; // Longest match from latest end. 8846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org maxOverlap=overlap; 8856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 8866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 8876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org --overlap; 8886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ++dec; 8896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 8906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 8916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 8926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(maxDec!=0 || maxOverlap!=0) { 8936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Longest-match algorithm, and there was a string match. 8946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Simply continue before it. 8956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pos-=maxDec; 8966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(pos==0) { 8976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return 0; // Reached the start of the string. 8986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 8996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanLength=0; // Match strings from before a string match. 9006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org continue; 9016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 9026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 9036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Finished trying to match all strings at pos. 9046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 9056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(spanLength!=0 || pos==length) { 9066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // The position is before an unlimited code point span (spanLength!=0), 9076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // not before a string match. 9086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // The only position where spanLength==0 before a span is pos==length. 9096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Otherwise, an unlimited code point span is only tried again when no 9106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // strings match, and if such a non-initial span fails we stop. 9116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(offsets.isEmpty()) { 9126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return pos; // No strings matched before a span. 9136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 9146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Match strings from before the next string match. 9156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 9166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // The position is before a string match (or a single code point). 9176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(offsets.isEmpty()) { 9186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // No more strings matched before a previous string match. 9196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Try another code point span from before the last string match. 9206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t oldPos=pos; 9216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED); 9226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanLength=oldPos-pos; 9236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if( pos==0 || // Reached the start of the string, or 9246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanLength==0 // neither strings nor span progressed. 9256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ) { 9266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return pos; 9276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 9286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org continue; // spanLength>0: Match strings from before a span. 9296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 9306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Try to match only one code point from before a string match if some 9316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // string matched beyond it, so that we try all possible positions 9326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // and don't overshoot. 9336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanLength=spanOneBack(spanSet, s, pos); 9346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(spanLength>0) { 9356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(spanLength==pos) { 9366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return 0; // Reached the start of the string. 9376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 9386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Match strings before this code point. 9396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // There cannot be any decrements below it because UnicodeSet strings 9406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // contain multiple code points. 9416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pos-=spanLength; 9426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org offsets.shift(spanLength); 9436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanLength=0; 9446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org continue; // Match strings from before a single code point. 9456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 9466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Match strings from before the next string match. 9476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 9486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 9496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pos-=offsets.popMinimum(); 9506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanLength=0; // Match strings from before a string match. 9516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 9526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 9536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 9546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgint32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { 9556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(spanCondition==USET_SPAN_NOT_CONTAINED) { 9566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return spanNotUTF8(s, length); 9576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 9586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED); 9596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(spanLength==length) { 9606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return length; 9616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 9626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 9636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Consider strings; they may overlap with the span. 9646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org OffsetList offsets; 9656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(spanCondition==USET_SPAN_CONTAINED) { 9666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Use offset list to try all possibilities. 9676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org offsets.setMaxLength(maxLength8); 9686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 9696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t pos=spanLength, rest=length-pos; 9706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t i, stringsLength=strings.size(); 9716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uint8_t *spanUTF8Lengths=spanLengths; 9726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(all) { 9736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanUTF8Lengths+=2*stringsLength; 9746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 9756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(;;) { 9766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const uint8_t *s8=utf8; 9776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t length8; 9786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(spanCondition==USET_SPAN_CONTAINED) { 9796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(i=0; i<stringsLength; ++i) { 9806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org length8=utf8Lengths[i]; 9816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(length8==0) { 9826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org continue; // String not representable in UTF-8. 9836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 9846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t overlap=spanUTF8Lengths[i]; 9856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(overlap==ALL_CP_CONTAINED) { 9866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org s8+=length8; 9876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org continue; // Irrelevant string. 9886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 9896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 9906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Try to match this string at pos-overlap..pos. 9916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(overlap>=LONG_SPAN) { 9926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org overlap=length8; 9936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // While contained: No point matching fully inside the code point span. 9946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org U8_BACK_1(s8, 0, overlap); // Length of the string minus the last code point. 9956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 9966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(overlap>spanLength) { 9976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org overlap=spanLength; 9986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 9996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t inc=length8-overlap; // Keep overlap+inc==length8. 10006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(;;) { 10016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(inc>rest) { 10026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 10036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 10046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Try to match if the increment is not listed already. 10056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Match at code point boundaries. (The UTF-8 strings were converted 10066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // from UTF-16 and are guaranteed to be well-formed.) 10076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if( !U8_IS_TRAIL(s[pos-overlap]) && 10086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org !offsets.containsOffset(inc) && 10096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org matches8(s+pos-overlap, s8, length8) 10106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 10116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ) { 10126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(inc==rest) { 10136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return length; // Reached the end of the string. 10146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 10156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org offsets.addOffset(inc); 10166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 10176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(overlap==0) { 10186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 10196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 10206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org --overlap; 10216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ++inc; 10226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 10236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org s8+=length8; 10246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 10256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else /* USET_SPAN_SIMPLE */ { 10266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t maxInc=0, maxOverlap=0; 10276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(i=0; i<stringsLength; ++i) { 10286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org length8=utf8Lengths[i]; 10296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(length8==0) { 10306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org continue; // String not representable in UTF-8. 10316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 10326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t overlap=spanUTF8Lengths[i]; 10336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // For longest match, we do need to try to match even an all-contained string 10346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // to find the match from the earliest start. 10356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 10366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Try to match this string at pos-overlap..pos. 10376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(overlap>=LONG_SPAN) { 10386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org overlap=length8; 10396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Longest match: Need to match fully inside the code point span 10406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // to find the match from the earliest start. 10416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 10426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(overlap>spanLength) { 10436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org overlap=spanLength; 10446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 10456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t inc=length8-overlap; // Keep overlap+inc==length8. 10466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(;;) { 10476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(inc>rest || overlap<maxOverlap) { 10486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 10496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 10506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Try to match if the string is longer or starts earlier. 10516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Match at code point boundaries. (The UTF-8 strings were converted 10526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // from UTF-16 and are guaranteed to be well-formed.) 10536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if( !U8_IS_TRAIL(s[pos-overlap]) && 10546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) && 10556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org matches8(s+pos-overlap, s8, length8) 10566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 10576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ) { 10586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org maxInc=inc; // Longest match from earliest start. 10596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org maxOverlap=overlap; 10606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 10616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 10626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org --overlap; 10636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ++inc; 10646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 10656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org s8+=length8; 10666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 10676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 10686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(maxInc!=0 || maxOverlap!=0) { 10696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Longest-match algorithm, and there was a string match. 10706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Simply continue after it. 10716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pos+=maxInc; 10726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org rest-=maxInc; 10736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(rest==0) { 10746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return length; // Reached the end of the string. 10756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 10766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanLength=0; // Match strings from after a string match. 10776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org continue; 10786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 10796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 10806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Finished trying to match all strings at pos. 10816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 10826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(spanLength!=0 || pos==0) { 10836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // The position is after an unlimited code point span (spanLength!=0), 10846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // not after a string match. 10856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // The only position where spanLength==0 after a span is pos==0. 10866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Otherwise, an unlimited code point span is only tried again when no 10876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // strings match, and if such a non-initial span fails we stop. 10886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(offsets.isEmpty()) { 10896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return pos; // No strings matched after a span. 10906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 10916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Match strings from after the next string match. 10926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 10936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // The position is after a string match (or a single code point). 10946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(offsets.isEmpty()) { 10956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // No more strings matched after a previous string match. 10966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Try another code point span from after the last string match. 10976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED); 10986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if( spanLength==rest || // Reached the end of the string, or 10996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanLength==0 // neither strings nor span progressed. 11006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ) { 11016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return pos+spanLength; 11026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 11036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pos+=spanLength; 11046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org rest-=spanLength; 11056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org continue; // spanLength>0: Match strings from after a span. 11066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 11076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Try to match only one code point from after a string match if some 11086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // string matched beyond it, so that we try all possible positions 11096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // and don't overshoot. 11106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanLength=spanOneUTF8(spanSet, s+pos, rest); 11116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(spanLength>0) { 11126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(spanLength==rest) { 11136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return length; // Reached the end of the string. 11146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 11156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Match strings after this code point. 11166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // There cannot be any increments below it because UnicodeSet strings 11176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // contain multiple code points. 11186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pos+=spanLength; 11196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org rest-=spanLength; 11206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org offsets.shift(spanLength); 11216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanLength=0; 11226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org continue; // Match strings from after a single code point. 11236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 11246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Match strings from after the next string match. 11256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 11266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 11276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t minOffset=offsets.popMinimum(); 11286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pos+=minOffset; 11296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org rest-=minOffset; 11306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanLength=0; // Match strings from after a string match. 11316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 11326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 11336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 11346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgint32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { 11356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(spanCondition==USET_SPAN_NOT_CONTAINED) { 11366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return spanNotBackUTF8(s, length); 11376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 11386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED); 11396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(pos==0) { 11406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return 0; 11416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 11426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t spanLength=length-pos; 11436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 11446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Consider strings; they may overlap with the span. 11456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org OffsetList offsets; 11466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(spanCondition==USET_SPAN_CONTAINED) { 11476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Use offset list to try all possibilities. 11486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org offsets.setMaxLength(maxLength8); 11496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 11506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t i, stringsLength=strings.size(); 11516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uint8_t *spanBackUTF8Lengths=spanLengths; 11526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(all) { 11536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanBackUTF8Lengths+=3*stringsLength; 11546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 11556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(;;) { 11566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const uint8_t *s8=utf8; 11576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t length8; 11586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(spanCondition==USET_SPAN_CONTAINED) { 11596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(i=0; i<stringsLength; ++i) { 11606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org length8=utf8Lengths[i]; 11616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(length8==0) { 11626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org continue; // String not representable in UTF-8. 11636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 11646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t overlap=spanBackUTF8Lengths[i]; 11656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(overlap==ALL_CP_CONTAINED) { 11666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org s8+=length8; 11676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org continue; // Irrelevant string. 11686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 11696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 11706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Try to match this string at pos-(length8-overlap)..pos-length8. 11716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(overlap>=LONG_SPAN) { 11726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org overlap=length8; 11736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // While contained: No point matching fully inside the code point span. 11746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t len1=0; 11756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org U8_FWD_1(s8, len1, overlap); 11766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org overlap-=len1; // Length of the string minus the first code point. 11776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 11786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(overlap>spanLength) { 11796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org overlap=spanLength; 11806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 11816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t dec=length8-overlap; // Keep dec+overlap==length8. 11826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(;;) { 11836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(dec>pos) { 11846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 11856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 11866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Try to match if the decrement is not listed already. 11876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Match at code point boundaries. (The UTF-8 strings were converted 11886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // from UTF-16 and are guaranteed to be well-formed.) 11896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if( !U8_IS_TRAIL(s[pos-dec]) && 11906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org !offsets.containsOffset(dec) && 11916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org matches8(s+pos-dec, s8, length8) 11926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ) { 11936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(dec==pos) { 11946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return 0; // Reached the start of the string. 11956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 11966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org offsets.addOffset(dec); 11976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 11986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(overlap==0) { 11996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 12006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 12016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org --overlap; 12026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ++dec; 12036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 12046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org s8+=length8; 12056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 12066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else /* USET_SPAN_SIMPLE */ { 12076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t maxDec=0, maxOverlap=0; 12086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(i=0; i<stringsLength; ++i) { 12096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org length8=utf8Lengths[i]; 12106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(length8==0) { 12116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org continue; // String not representable in UTF-8. 12126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 12136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t overlap=spanBackUTF8Lengths[i]; 12146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // For longest match, we do need to try to match even an all-contained string 12156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // to find the match from the latest end. 12166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 12176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Try to match this string at pos-(length8-overlap)..pos-length8. 12186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(overlap>=LONG_SPAN) { 12196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org overlap=length8; 12206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Longest match: Need to match fully inside the code point span 12216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // to find the match from the latest end. 12226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 12236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(overlap>spanLength) { 12246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org overlap=spanLength; 12256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 12266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t dec=length8-overlap; // Keep dec+overlap==length8. 12276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(;;) { 12286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(dec>pos || overlap<maxOverlap) { 12296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 12306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 12316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Try to match if the string is longer or ends later. 12326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Match at code point boundaries. (The UTF-8 strings were converted 12336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // from UTF-16 and are guaranteed to be well-formed.) 12346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if( !U8_IS_TRAIL(s[pos-dec]) && 12356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) && 12366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org matches8(s+pos-dec, s8, length8) 12376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ) { 12386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org maxDec=dec; // Longest match from latest end. 12396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org maxOverlap=overlap; 12406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 12416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 12426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org --overlap; 12436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ++dec; 12446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 12456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org s8+=length8; 12466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 12476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 12486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(maxDec!=0 || maxOverlap!=0) { 12496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Longest-match algorithm, and there was a string match. 12506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Simply continue before it. 12516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pos-=maxDec; 12526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(pos==0) { 12536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return 0; // Reached the start of the string. 12546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 12556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanLength=0; // Match strings from before a string match. 12566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org continue; 12576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 12586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 12596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Finished trying to match all strings at pos. 12606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 12616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(spanLength!=0 || pos==length) { 12626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // The position is before an unlimited code point span (spanLength!=0), 12636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // not before a string match. 12646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // The only position where spanLength==0 before a span is pos==length. 12656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Otherwise, an unlimited code point span is only tried again when no 12666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // strings match, and if such a non-initial span fails we stop. 12676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(offsets.isEmpty()) { 12686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return pos; // No strings matched before a span. 12696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 12706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Match strings from before the next string match. 12716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 12726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // The position is before a string match (or a single code point). 12736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(offsets.isEmpty()) { 12746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // No more strings matched before a previous string match. 12756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Try another code point span from before the last string match. 12766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t oldPos=pos; 12776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED); 12786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanLength=oldPos-pos; 12796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if( pos==0 || // Reached the start of the string, or 12806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanLength==0 // neither strings nor span progressed. 12816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ) { 12826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return pos; 12836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 12846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org continue; // spanLength>0: Match strings from before a span. 12856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 12866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Try to match only one code point from before a string match if some 12876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // string matched beyond it, so that we try all possible positions 12886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // and don't overshoot. 12896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanLength=spanOneBackUTF8(spanSet, s, pos); 12906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(spanLength>0) { 12916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(spanLength==pos) { 12926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return 0; // Reached the start of the string. 12936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 12946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Match strings before this code point. 12956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // There cannot be any decrements below it because UnicodeSet strings 12966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // contain multiple code points. 12976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pos-=spanLength; 12986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org offsets.shift(spanLength); 12996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanLength=0; 13006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org continue; // Match strings from before a single code point. 13016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 13026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Match strings from before the next string match. 13036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 13046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 13056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pos-=offsets.popMinimum(); 13066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanLength=0; // Match strings from before a string match. 13076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 13086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 13096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 13106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/* 13116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED) 13126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 13136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Theoretical algorithm: 13146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * - Iterate through the string, and at each code point boundary: 13156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * + If the code point there is in the set, then return with the current position. 13166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * + If a set string matches at the current position, then return with the current position. 13176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 13186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Optimized implementation: 13196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 13206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * (Same assumption as for span() above.) 13216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 13226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Create and cache a spanNotSet which contains all of the single code points 13236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * of the original set but none of its strings. 13246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * For each set string add its initial code point to the spanNotSet. 13256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * (Also add its final code point for spanNotBack().) 13266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 13276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * - Loop: 13286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED). 13296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * + If the current code point is in the original set, then 13306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * return the current position. 13316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * + If any set string matches at the current position, then 13326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * return the current position. 13336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * + If there is no match at the current position, neither for the code point there 13346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * nor for any set string, then skip this code point and continue the loop. 13356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * This happens for set-string-initial code points that were added to spanNotSet 13366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * when there is not actually a match for such a set string. 13376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 13386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 13396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgint32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const { 13406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t pos=0, rest=length; 13416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t i, stringsLength=strings.size(); 13426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org do { 13436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Span until we find a code point from the set, 13446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // or a code point that starts or ends some string. 13456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED); 13466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(i==rest) { 13476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return length; // Reached the end of the string. 13486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 13496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pos+=i; 13506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org rest-=i; 13516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 13526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Check whether the current code point is in the original set, 13536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // without the string starts and ends. 13546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t cpLength=spanOne(spanSet, s+pos, rest); 13556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(cpLength>0) { 13566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return pos; // There is a set element at pos. 13576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 13586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 13596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Try to match the strings at pos. 13606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(i=0; i<stringsLength; ++i) { 13616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(spanLengths[i]==ALL_CP_CONTAINED) { 13626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org continue; // Irrelevant string. 13636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 13646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 13656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const UChar *s16=string.getBuffer(); 13666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t length16=string.length(); 13676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) { 13686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return pos; // There is a set element at pos. 13696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 13706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 13716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 13726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // The span(while not contained) ended on a string start/end which is 13736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // not in the original set. Skip this code point and continue. 13746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // cpLength<0 13756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pos-=cpLength; 13766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org rest+=cpLength; 13776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } while(rest!=0); 13786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return length; // Reached the end of the string. 13796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 13806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 13816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgint32_t UnicodeSetStringSpan::spanNotBack(const UChar *s, int32_t length) const { 13826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t pos=length; 13836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t i, stringsLength=strings.size(); 13846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org do { 13856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Span until we find a code point from the set, 13866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // or a code point that starts or ends some string. 13876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED); 13886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(pos==0) { 13896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return 0; // Reached the start of the string. 13906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 13916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 13926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Check whether the current code point is in the original set, 13936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // without the string starts and ends. 13946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t cpLength=spanOneBack(spanSet, s, pos); 13956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(cpLength>0) { 13966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return pos; // There is a set element at pos. 13976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 13986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 13996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Try to match the strings at pos. 14006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(i=0; i<stringsLength; ++i) { 14016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Use spanLengths rather than a spanBackLengths pointer because 14026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // it is easier and we only need to know whether the string is irrelevant 14036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // which is the same in either array. 14046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(spanLengths[i]==ALL_CP_CONTAINED) { 14056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org continue; // Irrelevant string. 14066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 14076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 14086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const UChar *s16=string.getBuffer(); 14096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t length16=string.length(); 14106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) { 14116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return pos; // There is a set element at pos. 14126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 14136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 14146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 14156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // The span(while not contained) ended on a string start/end which is 14166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // not in the original set. Skip this code point and continue. 14176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // cpLength<0 14186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pos+=cpLength; 14196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } while(pos!=0); 14206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return 0; // Reached the start of the string. 14216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 14226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 14236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgint32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const { 14246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t pos=0, rest=length; 14256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t i, stringsLength=strings.size(); 14266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uint8_t *spanUTF8Lengths=spanLengths; 14276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(all) { 14286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanUTF8Lengths+=2*stringsLength; 14296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 14306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org do { 14316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Span until we find a code point from the set, 14326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // or a code point that starts or ends some string. 14336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED); 14346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(i==rest) { 14356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return length; // Reached the end of the string. 14366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 14376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pos+=i; 14386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org rest-=i; 14396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 14406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Check whether the current code point is in the original set, 14416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // without the string starts and ends. 14426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest); 14436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(cpLength>0) { 14446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return pos; // There is a set element at pos. 14456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 14466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 14476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Try to match the strings at pos. 14486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const uint8_t *s8=utf8; 14496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t length8; 14506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(i=0; i<stringsLength; ++i) { 14516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org length8=utf8Lengths[i]; 14526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // ALL_CP_CONTAINED: Irrelevant string. 14536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) { 14546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return pos; // There is a set element at pos. 14556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 14566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org s8+=length8; 14576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 14586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 14596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // The span(while not contained) ended on a string start/end which is 14606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // not in the original set. Skip this code point and continue. 14616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // cpLength<0 14626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pos-=cpLength; 14636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org rest+=cpLength; 14646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } while(rest!=0); 14656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return length; // Reached the end of the string. 14666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 14676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 14686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgint32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const { 14696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t pos=length; 14706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t i, stringsLength=strings.size(); 14716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uint8_t *spanBackUTF8Lengths=spanLengths; 14726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(all) { 14736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanBackUTF8Lengths+=3*stringsLength; 14746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 14756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org do { 14766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Span until we find a code point from the set, 14776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // or a code point that starts or ends some string. 14786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED); 14796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(pos==0) { 14806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return 0; // Reached the start of the string. 14816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 14826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 14836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Check whether the current code point is in the original set, 14846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // without the string starts and ends. 14856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t cpLength=spanOneBackUTF8(spanSet, s, pos); 14866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(cpLength>0) { 14876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return pos; // There is a set element at pos. 14886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 14896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 14906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Try to match the strings at pos. 14916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const uint8_t *s8=utf8; 14926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t length8; 14936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(i=0; i<stringsLength; ++i) { 14946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org length8=utf8Lengths[i]; 14956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // ALL_CP_CONTAINED: Irrelevant string. 14966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) { 14976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return pos; // There is a set element at pos. 14986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 14996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org s8+=length8; 15006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 15016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 15026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // The span(while not contained) ended on a string start/end which is 15036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // not in the original set. Skip this code point and continue. 15046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // cpLength<0 15056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pos+=cpLength; 15066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } while(pos!=0); 15076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return 0; // Reached the start of the string. 15086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 15096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 15106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgU_NAMESPACE_END 1511