1/*
2*******************************************************************************
3*   Copyright (C) 2011, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5*******************************************************************************
6*   file name:  patternprops.cpp
7*   encoding:   US-ASCII
8*   tab size:   8 (not used)
9*   indentation:4
10*
11*   created on: 2011mar13
12*   created by: Markus W. Scherer
13*/
14
15#include "unicode/utypes.h"
16#include "patternprops.h"
17
18U_NAMESPACE_BEGIN
19
/*
 * Property flags for U+0000..U+00FF, one byte per code point.
 *   bit 0 (value 1): Pattern_Syntax or Pattern_White_Space is true
 *   bit 1 (value 2): Pattern_Syntax is true
 *   bit 2 (value 4): Pattern_White_Space is true
 * An entry is therefore 0 (neither property), 3 (Pattern_Syntax)
 * or 5 (Pattern_White_Space).
 */
static const uint8_t latin1[256]={
    0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 0, 0,  // 00..0F  WS: 09..0D
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 10..1F
    5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // 20..2F  WS: 20  Syntax: 21..2F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3,  // 30..3F  Syntax: 3A..3F
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 40..4F  Syntax: 40
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,  // 50..5F  Syntax: 5B..5E
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 60..6F  Syntax: 60
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,  // 70..7F  Syntax: 7B..7E
    0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F  WS: 85
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
    0, 3, 3, 3, 3, 3, 3, 3, 0, 3, 0, 3, 3, 0, 3, 0,  // A0..AF  Syntax: A1..A7, A9, AB, AC, AE
    3, 3, 0, 0, 0, 0, 3, 0, 0, 0, 0, 3, 0, 0, 0, 3,  // B0..BF  Syntax: B0, B1, B6, BB, BF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // C0..CF
    0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0,  // D0..DF  Syntax: D7
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // E0..EF
    0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0   // F0..FF  Syntax: F7
};
56
/*
 * Stage-1 lookup for U+2000..U+303F: one entry per group of 32 code points.
 * Each entry selects a 32-bit word in a small data table.
 * Data word 0 is all-zeros and data word 1 is all-ones, so entries 0 and 1
 * stand for groups where the property is uniformly false or uniformly true.
 */
static const uint8_t index2000[130]={
    2, 3, 4, 0, 0, 0, 0, 0,  // U+2000..U+20FF
    0, 0, 0, 0, 5, 1, 1, 1,  // U+2100..U+21FF
    1, 1, 1, 1, 1, 1, 1, 1,  // U+2200..U+22FF
    1, 1, 1, 1, 1, 1, 1, 1,  // U+2300..U+23FF
    1, 1, 1, 0, 0, 0, 0, 0,  // U+2400..U+24FF
    1, 1, 1, 1, 1, 1, 1, 1,  // U+2500..U+25FF
    1, 1, 1, 1, 1, 1, 1, 1,  // U+2600..U+26FF
    1, 1, 1, 6, 7, 1, 1, 1,  // U+2700..U+27FF
    1, 1, 1, 1, 1, 1, 1, 1,  // U+2800..U+28FF
    1, 1, 1, 1, 1, 1, 1, 1,  // U+2900..U+29FF
    1, 1, 1, 1, 1, 1, 1, 1,  // U+2A00..U+2AFF
    1, 1, 1, 1, 1, 1, 1, 1,  // U+2B00..U+2BFF
    0, 0, 0, 0, 0, 0, 0, 0,  // U+2C00..U+2CFF
    0, 0, 0, 0, 0, 0, 0, 0,  // U+2D00..U+2DFF
    1, 1, 1, 1, 0, 0, 0, 0,  // U+2E00..U+2EFF
    0, 0, 0, 0, 0, 0, 0, 0,  // U+2F00..U+2FFF
    8, 9                     // U+3000..U+303F
};
81
/*
 * Pattern_Syntax bit sets, one 32-bit word per group of 32 code points.
 * Bit n of a word corresponds to the code point whose low five bits are n.
 * Words 0 and 1 are shared by all groups that are entirely false or
 * entirely true; the remaining words hold the mixed groups listed below.
 */
static const uint32_t syntax2000[]={
    0x00000000u,  // 0: no code point in the group
    0xFFFFFFFFu,  // 1: every code point in the group
    0xFFFF0000u,  // 2: 2010..201F
    0x7FFF00FFu,  // 3: 2020..2027, 2030..203E
    0x7FEFFFFEu,  // 4: 2041..2053, 2055..205E
    0xFFFF0000u,  // 5: 2190..219F
    0x003FFFFFu,  // 6: 2760..2775
    0xFFF00000u,  // 7: 2794..279F
    0xFFFFFF0Eu,  // 8: 3001..3003, 3008..301F
    0x00010001u   // 9: 3020, 3030
};
98
/*
 * Bit sets for "Pattern_Syntax or Pattern_White_Space", one 32-bit word per
 * group of 32 code points. Identical to the Pattern_Syntax words except that
 * words 2 and 3 also have the bits for the Pattern_White_Space characters
 * 200E, 200F, 2028 and 2029 set.
 */
static const uint32_t syntaxOrWhiteSpace2000[]={
    0x00000000u,  // 0: no code point in the group
    0xFFFFFFFFu,  // 1: every code point in the group
    0xFFFFC000u,  // 2: 200E..201F
    0x7FFF03FFu,  // 3: 2020..2029, 2030..203E
    0x7FEFFFFEu,  // 4: 2041..2053, 2055..205E
    0xFFFF0000u,  // 5: 2190..219F
    0x003FFFFFu,  // 6: 2760..2775
    0xFFF00000u,  // 7: 2794..279F
    0xFFFFFF0Eu,  // 8: 3001..3003, 3008..301F
    0x00010001u   // 9: 3020, 3030
};
115
116UBool
117PatternProps::isSyntax(UChar32 c) {
118    if(c<0) {
119        return FALSE;
120    } else if(c<=0xff) {
121        return (UBool)(latin1[c]>>1)&1;
122    } else if(c<0x2010) {
123        return FALSE;
124    } else if(c<=0x3030) {
125        uint32_t bits=syntax2000[index2000[(c-0x2000)>>5]];
126        return (UBool)((bits>>(c&0x1f))&1);
127    } else if(0xfd3e<=c && c<=0xfe46) {
128        return c<=0xfd3f || 0xfe45<=c;
129    } else {
130        return FALSE;
131    }
132}
133
134UBool
135PatternProps::isSyntaxOrWhiteSpace(UChar32 c) {
136    if(c<0) {
137        return FALSE;
138    } else if(c<=0xff) {
139        return (UBool)(latin1[c]&1);
140    } else if(c<0x200e) {
141        return FALSE;
142    } else if(c<=0x3030) {
143        uint32_t bits=syntaxOrWhiteSpace2000[index2000[(c-0x2000)>>5]];
144        return (UBool)((bits>>(c&0x1f))&1);
145    } else if(0xfd3e<=c && c<=0xfe46) {
146        return c<=0xfd3f || 0xfe45<=c;
147    } else {
148        return FALSE;
149    }
150}
151
152UBool
153PatternProps::isWhiteSpace(UChar32 c) {
154    if(c<0) {
155        return FALSE;
156    } else if(c<=0xff) {
157        return (UBool)(latin1[c]>>2)&1;
158    } else if(0x200e<=c && c<=0x2029) {
159        return c<=0x200f || 0x2028<=c;
160    } else {
161        return FALSE;
162    }
163}
164
165const UChar *
166PatternProps::skipWhiteSpace(const UChar *s, int32_t length) {
167    while(length>0 && isWhiteSpace(*s)) {
168        ++s;
169        --length;
170    }
171    return s;
172}
173
174const UChar *
175PatternProps::trimWhiteSpace(const UChar *s, int32_t &length) {
176    if(length<=0 || (!isWhiteSpace(s[0]) && !isWhiteSpace(s[length-1]))) {
177        return s;
178    }
179    int32_t start=0;
180    int32_t limit=length;
181    while(start<limit && isWhiteSpace(s[start])) {
182        ++start;
183    }
184    if(start<limit) {
185        // There is non-white space at start; we will not move limit below that,
186        // so we need not test start<limit in the loop.
187        while(isWhiteSpace(s[limit-1])) {
188            --limit;
189        }
190    }
191    length=limit-start;
192    return s+start;
193}
194
195UBool
196PatternProps::isIdentifier(const UChar *s, int32_t length) {
197    if(length<=0) {
198        return FALSE;
199    }
200    const UChar *limit=s+length;
201    do {
202        if(isSyntaxOrWhiteSpace(*s++)) {
203            return FALSE;
204        }
205    } while(s<limit);
206    return TRUE;
207}
208
209const UChar *
210PatternProps::skipIdentifier(const UChar *s, int32_t length) {
211    while(length>0 && !isSyntaxOrWhiteSpace(*s)) {
212        ++s;
213        --length;
214    }
215    return s;
216}
217
218U_NAMESPACE_END
219