1// Copyright (C) 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5*   Copyright (C) 2011, International Business Machines
6*   Corporation and others.  All Rights Reserved.
7*******************************************************************************
8*   file name:  patternprops.cpp
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2011mar13
14*   created by: Markus W. Scherer
15*/
16
17#include "unicode/utypes.h"
18#include "patternprops.h"
19
20U_NAMESPACE_BEGIN
21
/*
 * Property flags for U+0000..U+00FF: one byte per Latin-1 character,
 * indexed directly by the code point.
 * Bit 0 (value 1) is set if either Pattern property is true,
 * bit 1 (value 2) if Pattern_Syntax is true,
 * bit 2 (value 4) if Pattern_White_Space is true.
 * That is, Pattern_Syntax is encoded as 3 and Pattern_White_Space as 5;
 * a character with neither property is encoded as 0.
 *
 * isSyntaxOrWhiteSpace() tests bit 0, isSyntax() tests bit 1 and
 * isWhiteSpace() tests bit 2 of the entry.
 *
 * Each initializer row holds 16 entries. A comment before a row lists
 * (in hex) the code points with non-zero entries in the row(s) that follow.
 */
static const uint8_t latin1[256]={
    // WS: 9..D
    0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // WS: 20  Syntax: 21..2F
    5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    // Syntax: 3A..40 (continues into the first entry of the next row)
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // Syntax: 5B..5E
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    // Syntax: 60
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // Syntax: 7B..7E
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    // WS: 85
    0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // Syntax: A1..A7, A9, AB, AC, AE
    0, 3, 3, 3, 3, 3, 3, 3, 0, 3, 0, 3, 3, 0, 3, 0,
    // Syntax: B0, B1, B6, BB, BF
    3, 3, 0, 0, 0, 0, 3, 0, 0, 0, 0, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // Syntax: D7
    0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // Syntax: F7
    0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0
};
58
/*
 * One byte per 32 characters from U+2000..U+303F indexing into
 * a small table of 32-bit data words (syntax2000 or syntaxOrWhiteSpace2000).
 * The entry for code point c is index2000[(c-0x2000)>>5]; the 130 entries
 * cover 130*32=0x1040 code points.
 * The first two data words are all-zeros and all-ones, so an index of 0
 * means that none of the 32 characters has the property and an index of 1
 * means that all of them do.
 *
 * Each initializer row holds 8 entries, that is, 0x100 code points;
 * the trailing comment names the range covered by the row.
 */
static const uint8_t index2000[130]={
    2, 3, 4, 0, 0, 0, 0, 0,  // 20xx
    0, 0, 0, 0, 5, 1, 1, 1,  // 21xx
    1, 1, 1, 1, 1, 1, 1, 1,  // 22xx
    1, 1, 1, 1, 1, 1, 1, 1,  // 23xx
    1, 1, 1, 0, 0, 0, 0, 0,  // 24xx
    1, 1, 1, 1, 1, 1, 1, 1,  // 25xx
    1, 1, 1, 1, 1, 1, 1, 1,  // 26xx
    1, 1, 1, 6, 7, 1, 1, 1,  // 27xx
    1, 1, 1, 1, 1, 1, 1, 1,  // 28xx
    1, 1, 1, 1, 1, 1, 1, 1,  // 29xx
    1, 1, 1, 1, 1, 1, 1, 1,  // 2Axx
    1, 1, 1, 1, 1, 1, 1, 1,  // 2Bxx
    0, 0, 0, 0, 0, 0, 0, 0,  // 2Cxx
    0, 0, 0, 0, 0, 0, 0, 0,  // 2Dxx
    1, 1, 1, 1, 0, 0, 0, 0,  // 2Exx
    0, 0, 0, 0, 0, 0, 0, 0,  // 2Fxx
    8, 9  // 3000..303F
};
83
/*
 * Pattern_Syntax bit sets for U+2000..U+303F.
 * Each 32-bit word describes one block of 32 characters; bit n of a word
 * stands for the character whose low five code point bits equal n.
 * Blocks are mapped to words via index2000. Words 0 and 1 are shared by
 * all blocks that are entirely false or entirely true; every other word
 * is the bit pattern for the ranges named in its comment.
 */
static const uint32_t syntax2000[10]={
    0x00000000u,  // 0: no character in the block
    0xFFFFFFFFu,  // 1: every character in the block
    0xFFFF0000u,  // 2: 2010..201F
    0x7FFF00FFu,  // 3: 2020..2027, 2030..203E
    0x7FEFFFFEu,  // 4: 2041..2053, 2055..205E
    0xFFFF0000u,  // 5: 2190..219F
    0x003FFFFFu,  // 6: 2760..2775
    0xFFF00000u,  // 7: 2794..279F
    0xFFFFFF0Eu,  // 8: 3001..3003, 3008..301F
    0x00010001u   // 9: 3020, 3030
};
100
/*
 * Bit sets for "Pattern_Syntax or Pattern_White_Space" in U+2000..U+303F.
 * Same layout and same values as syntax2000, except that the bits for the
 * Pattern_White_Space characters 200E 200F 2028 2029 are set as well
 * (words 2 and 3).
 */
static const uint32_t syntaxOrWhiteSpace2000[10]={
    0x00000000u,  // 0: no character in the block
    0xFFFFFFFFu,  // 1: every character in the block
    0xFFFFC000u,  // 2: 200E..201F
    0x7FFF03FFu,  // 3: 2020..2029, 2030..203E
    0x7FEFFFFEu,  // 4: 2041..2053, 2055..205E
    0xFFFF0000u,  // 5: 2190..219F
    0x003FFFFFu,  // 6: 2760..2775
    0xFFF00000u,  // 7: 2794..279F
    0xFFFFFF0Eu,  // 8: 3001..3003, 3008..301F
    0x00010001u   // 9: 3020, 3030
};
117
118UBool
119PatternProps::isSyntax(UChar32 c) {
120    if(c<0) {
121        return FALSE;
122    } else if(c<=0xff) {
123        return (UBool)(latin1[c]>>1)&1;
124    } else if(c<0x2010) {
125        return FALSE;
126    } else if(c<=0x3030) {
127        uint32_t bits=syntax2000[index2000[(c-0x2000)>>5]];
128        return (UBool)((bits>>(c&0x1f))&1);
129    } else if(0xfd3e<=c && c<=0xfe46) {
130        return c<=0xfd3f || 0xfe45<=c;
131    } else {
132        return FALSE;
133    }
134}
135
136UBool
137PatternProps::isSyntaxOrWhiteSpace(UChar32 c) {
138    if(c<0) {
139        return FALSE;
140    } else if(c<=0xff) {
141        return (UBool)(latin1[c]&1);
142    } else if(c<0x200e) {
143        return FALSE;
144    } else if(c<=0x3030) {
145        uint32_t bits=syntaxOrWhiteSpace2000[index2000[(c-0x2000)>>5]];
146        return (UBool)((bits>>(c&0x1f))&1);
147    } else if(0xfd3e<=c && c<=0xfe46) {
148        return c<=0xfd3f || 0xfe45<=c;
149    } else {
150        return FALSE;
151    }
152}
153
154UBool
155PatternProps::isWhiteSpace(UChar32 c) {
156    if(c<0) {
157        return FALSE;
158    } else if(c<=0xff) {
159        return (UBool)(latin1[c]>>2)&1;
160    } else if(0x200e<=c && c<=0x2029) {
161        return c<=0x200f || 0x2028<=c;
162    } else {
163        return FALSE;
164    }
165}
166
167const UChar *
168PatternProps::skipWhiteSpace(const UChar *s, int32_t length) {
169    while(length>0 && isWhiteSpace(*s)) {
170        ++s;
171        --length;
172    }
173    return s;
174}
175
176const UChar *
177PatternProps::trimWhiteSpace(const UChar *s, int32_t &length) {
178    if(length<=0 || (!isWhiteSpace(s[0]) && !isWhiteSpace(s[length-1]))) {
179        return s;
180    }
181    int32_t start=0;
182    int32_t limit=length;
183    while(start<limit && isWhiteSpace(s[start])) {
184        ++start;
185    }
186    if(start<limit) {
187        // There is non-white space at start; we will not move limit below that,
188        // so we need not test start<limit in the loop.
189        while(isWhiteSpace(s[limit-1])) {
190            --limit;
191        }
192    }
193    length=limit-start;
194    return s+start;
195}
196
197UBool
198PatternProps::isIdentifier(const UChar *s, int32_t length) {
199    if(length<=0) {
200        return FALSE;
201    }
202    const UChar *limit=s+length;
203    do {
204        if(isSyntaxOrWhiteSpace(*s++)) {
205            return FALSE;
206        }
207    } while(s<limit);
208    return TRUE;
209}
210
211const UChar *
212PatternProps::skipIdentifier(const UChar *s, int32_t length) {
213    while(length>0 && !isSyntaxOrWhiteSpace(*s)) {
214        ++s;
215        --length;
216    }
217    return s;
218}
219
220U_NAMESPACE_END
221