1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html#License
3/*
4*******************************************************************************
5*   Copyright (C) 2011, International Business Machines
6*   Corporation and others.  All Rights Reserved.
7*******************************************************************************
8*   created on: 2011feb25
9*   created by: Markus W. Scherer
10*/
11
12package com.ibm.icu.impl;
13
14/**
15 * Implements the immutable Unicode properties Pattern_Syntax and Pattern_White_Space.
16 * Hardcodes these properties, does not load data, does not depend on other ICU classes.
17 * <p>
18 * Note: Both properties include ASCII as well as non-ASCII, non-Latin-1 code points,
19 * and both properties only include BMP code points (no supplementary ones).
20 * Pattern_Syntax includes some unassigned code points.
21 * <p>
22 * [:Pattern_White_Space:] =
23 *   [\u0009-\u000D\ \u0085\u200E\u200F\u2028\u2029]
24 * <p>
25 * [:Pattern_Syntax:] =
26 *   [!-/\:-@\[-\^`\{-~\u00A1-\u00A7\u00A9\u00AB\u00AC\u00AE
27 *    \u00B0\u00B1\u00B6\u00BB\u00BF\u00D7\u00F7
28 *    \u2010-\u2027\u2030-\u203E\u2041-\u2053\u2055-\u205E
29 *    \u2190-\u245F\u2500-\u2775\u2794-\u2BFF\u2E00-\u2E7F
30 *    \u3001-\u3003\u3008-\u3020\u3030\uFD3E\uFD3F\uFE45\uFE46]
31 * @author mscherer
32 */
33public final class PatternProps {
34    /**
35     * @return true if c is a Pattern_Syntax code point.
36     */
37    public static boolean isSyntax(int c) {
38        if(c<0) {
39            return false;
40        } else if(c<=0xff) {
41            return latin1[c]==3;
42        } else if(c<0x2010) {
43            return false;
44        } else if(c<=0x3030) {
45            int bits=syntax2000[index2000[(c-0x2000)>>5]];
46            return ((bits>>(c&0x1f))&1)!=0;
47        } else if(0xfd3e<=c && c<=0xfe46) {
48            return c<=0xfd3f || 0xfe45<=c;
49        } else {
50            return false;
51        }
52    }
53
54    /**
55     * @return true if c is a Pattern_Syntax or Pattern_White_Space code point.
56     */
57    public static boolean isSyntaxOrWhiteSpace(int c) {
58        if(c<0) {
59            return false;
60        } else if(c<=0xff) {
61            return latin1[c]!=0;
62        } else if(c<0x200e) {
63            return false;
64        } else if(c<=0x3030) {
65            int bits=syntaxOrWhiteSpace2000[index2000[(c-0x2000)>>5]];
66            return ((bits>>(c&0x1f))&1)!=0;
67        } else if(0xfd3e<=c && c<=0xfe46) {
68            return c<=0xfd3f || 0xfe45<=c;
69        } else {
70            return false;
71        }
72    }
73
74    /**
75     * @return true if c is a Pattern_White_Space character.
76     */
77    public static boolean isWhiteSpace(int c) {
78        if(c<0) {
79            return false;
80        } else if(c<=0xff) {
81            return latin1[c]==5;
82        } else if(0x200e<=c && c<=0x2029) {
83            return c<=0x200f || 0x2028<=c;
84        } else {
85            return false;
86        }
87    }
88
89    /**
90     * Skips over Pattern_White_Space starting at index i of the CharSequence.
91     * @return The smallest index at or after i with a non-white space character.
92     */
93    public static int skipWhiteSpace(CharSequence s, int i) {
94        while(i<s.length() && isWhiteSpace(s.charAt(i))) {
95            ++i;
96        }
97        return i;
98    }
99
100    /**
101     * @return s except with leading and trailing Pattern_White_Space removed.
102     */
103    public static String trimWhiteSpace(String s) {
104        if(s.length()==0 || (!isWhiteSpace(s.charAt(0)) && !isWhiteSpace(s.charAt(s.length()-1)))) {
105            return s;
106        }
107        int start=0;
108        int limit=s.length();
109        while(start<limit && isWhiteSpace(s.charAt(start))) {
110            ++start;
111        }
112        if(start<limit) {
113            // There is non-white space at start; we will not move limit below that,
114            // so we need not test start<limit in the loop.
115            while(isWhiteSpace(s.charAt(limit-1))) {
116                --limit;
117            }
118        }
119        return s.substring(start, limit);
120    }
121
122    /**
123     * Tests whether the CharSequence contains a "pattern identifier", that is,
124     * whether it contains only non-Pattern_White_Space, non-Pattern_Syntax characters.
125     * @return true if there are no Pattern_White_Space or Pattern_Syntax characters in s.
126     */
127    public static boolean isIdentifier(CharSequence s) {
128        int limit=s.length();
129        if(limit==0) {
130            return false;
131        }
132        int start=0;
133        do {
134            if(isSyntaxOrWhiteSpace(s.charAt(start++))) {
135                return false;
136            }
137        } while(start<limit);
138        return true;
139    }
140
141    /**
142     * Tests whether the CharSequence contains a "pattern identifier", that is,
143     * whether it contains only non-Pattern_White_Space, non-Pattern_Syntax characters.
144     * @return true if there are no Pattern_White_Space or Pattern_Syntax characters
145     *         in s between start and (exclusive) limit.
146     */
147    public static boolean isIdentifier(CharSequence s, int start, int limit) {
148        if(start>=limit) {
149            return false;
150        }
151        do {
152            if(isSyntaxOrWhiteSpace(s.charAt(start++))) {
153                return false;
154            }
155        } while(start<limit);
156        return true;
157    }
158
159    /**
160     * Skips over a "pattern identifier" starting at index i of the CharSequence.
161     * @return The smallest index at or after i with
162     *         a Pattern_White_Space or Pattern_Syntax character.
163     */
164    public static int skipIdentifier(CharSequence s, int i) {
165        while(i<s.length() && !isSyntaxOrWhiteSpace(s.charAt(i))) {
166            ++i;
167        }
168        return i;
169    }
170
171    /*
172     * One byte per Latin-1 character.
173     * Bit 0 is set if either Pattern property is true,
174     * bit 1 if Pattern_Syntax is true,
175     * bit 2 if Pattern_White_Space is true.
176     * That is, Pattern_Syntax is encoded as 3 and Pattern_White_Space as 5.
177     */
178    private static final byte latin1[]=new byte[] {  // 256
179        // WS: 9..D
180        0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 0, 0,
181        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
182        // WS: 20  Syntax: 21..2F
183        5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
184        // Syntax: 3A..40
185        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3,
186        3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
187        // Syntax: 5B..5E
188        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
189        // Syntax: 60
190        3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
191        // Syntax: 7B..7E
192        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
193        // WS: 85
194        0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
195        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
196        // Syntax: A1..A7, A9, AB, AC, AE
197        0, 3, 3, 3, 3, 3, 3, 3, 0, 3, 0, 3, 3, 0, 3, 0,
198        // Syntax: B0, B1, B6, BB, BF
199        3, 3, 0, 0, 0, 0, 3, 0, 0, 0, 0, 3, 0, 0, 0, 3,
200        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
201        // Syntax: D7
202        0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0,
203        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
204        // Syntax: F7
205        0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0
206    };
207
208    /*
209     * One byte per 32 characters from U+2000..U+303F indexing into
210     * a small table of 32-bit data words.
211     * The first two data words are all-zeros and all-ones.
212     */
213    private static final byte index2000[]=new byte[] {  // 130
214        2, 3, 4, 0, 0, 0, 0, 0,  // 20xx
215        0, 0, 0, 0, 5, 1, 1, 1,  // 21xx
216        1, 1, 1, 1, 1, 1, 1, 1,  // 22xx
217        1, 1, 1, 1, 1, 1, 1, 1,  // 23xx
218        1, 1, 1, 0, 0, 0, 0, 0,  // 24xx
219        1, 1, 1, 1, 1, 1, 1, 1,  // 25xx
220        1, 1, 1, 1, 1, 1, 1, 1,  // 26xx
221        1, 1, 1, 6, 7, 1, 1, 1,  // 27xx
222        1, 1, 1, 1, 1, 1, 1, 1,  // 28xx
223        1, 1, 1, 1, 1, 1, 1, 1,  // 29xx
224        1, 1, 1, 1, 1, 1, 1, 1,  // 2Axx
225        1, 1, 1, 1, 1, 1, 1, 1,  // 2Bxx
226        0, 0, 0, 0, 0, 0, 0, 0,  // 2Cxx
227        0, 0, 0, 0, 0, 0, 0, 0,  // 2Dxx
228        1, 1, 1, 1, 0, 0, 0, 0,  // 2Exx
229        0, 0, 0, 0, 0, 0, 0, 0,  // 2Fxx
230        8, 9  // 3000..303F
231    };
232
233    /*
234     * One 32-bit integer per 32 characters. Ranges of all-false and all-true
235     * are mapped to the first two values, other ranges map to appropriate bit patterns.
236     */
237    private static final int syntax2000[]=new int[] {
238        0,
239        -1,
240        0xffff0000,  // 2: 2010..201F
241        0x7fff00ff,  // 3: 2020..2027, 2030..203E
242        0x7feffffe,  // 4: 2041..2053, 2055..205E
243        0xffff0000,  // 5: 2190..219F
244        0x003fffff,  // 6: 2760..2775
245        0xfff00000,  // 7: 2794..279F
246        0xffffff0e,  // 8: 3001..3003, 3008..301F
247        0x00010001   // 9: 3020, 3030
248    };
249
250    /*
251     * Same as syntax2000, but with additional bits set for the
252     * Pattern_White_Space characters 200E 200F 2028 2029.
253     */
254    private static final int syntaxOrWhiteSpace2000[]=new int[] {
255        0,
256        -1,
257        0xffffc000,  // 2: 200E..201F
258        0x7fff03ff,  // 3: 2020..2029, 2030..203E
259        0x7feffffe,  // 4: 2041..2053, 2055..205E
260        0xffff0000,  // 5: 2190..219F
261        0x003fffff,  // 6: 2760..2775
262        0xfff00000,  // 7: 2794..279F
263        0xffffff0e,  // 8: 3001..3003, 3008..301F
264        0x00010001   // 9: 3020, 3030
265    };
266}
267