1/*
2*******************************************************************************
3*   Copyright (C) 2011, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5*******************************************************************************
6*   created on: 2011feb25
7*   created by: Markus W. Scherer
8*/
9
10package com.ibm.icu.impl;
11
/**
 * Implements the immutable Unicode properties Pattern_Syntax and Pattern_White_Space.
 * Hardcodes these properties, does not load data, does not depend on other ICU classes.
 * <p>
 * Note: Both properties include ASCII as well as non-ASCII, non-Latin-1 code points,
 * and both properties only include BMP code points (no supplementary ones).
 * Pattern_Syntax includes some unassigned code points.
 * <p>
 * [:Pattern_White_Space:] =
 *   [\u0009-\u000D\ \u0085\u200E\u200F\u2028\u2029]
 * <p>
 * [:Pattern_Syntax:] =
 *   [!-/\:-@\[-\^`\{-~\u00A1-\u00A7\u00A9\u00AB\u00AC\u00AE
 *    \u00B0\u00B1\u00B6\u00BB\u00BF\u00D7\u00F7
 *    \u2010-\u2027\u2030-\u203E\u2041-\u2053\u2055-\u205E
 *    \u2190-\u245F\u2500-\u2775\u2794-\u2BFF\u2E00-\u2E7F
 *    \u3001-\u3003\u3008-\u3020\u3030\uFD3E\uFD3F\uFE45\uFE46]
 * <p>
 * All methods are static. Code points are tested with hardcoded range checks
 * and small lookup tables: one byte per Latin-1 code point, and a two-level
 * bit set for U+2000..U+303F.
 * @author mscherer
 */
public final class PatternProps {
    /** Value stored in {@link #latin1} for a Pattern_Syntax code point. */
    private static final byte LATIN1_SYNTAX=3;

    /** Value stored in {@link #latin1} for a Pattern_White_Space code point. */
    private static final byte LATIN1_WHITE_SPACE=5;

    /**
     * Tests whether a code point has the Pattern_Syntax property.
     * @param c the code point to test; negative values and values above U+FE46
     *          are accepted and yield false
     * @return true if c is a Pattern_Syntax code point.
     */
    public static boolean isSyntax(int c) {
        if(c<0) {
            return false;
        } else if(c<=0xff) {
            return latin1[c]==LATIN1_SYNTAX;
        } else if(c<0x2010) {
            // No Pattern_Syntax code points between Latin-1 and U+2010.
            return false;
        } else if(c<=0x3030) {
            return testBit2000(syntax2000, c);
        } else {
            return isSyntaxAbove3030(c);
        }
    }

    /**
     * Tests whether a code point has either of the two Pattern properties.
     * @param c the code point to test; negative values and values above U+FE46
     *          are accepted and yield false
     * @return true if c is a Pattern_Syntax or Pattern_White_Space code point.
     */
    public static boolean isSyntaxOrWhiteSpace(int c) {
        if(c<0) {
            return false;
        } else if(c<=0xff) {
            // Any non-zero table value means that one of the two properties is true.
            return latin1[c]!=0;
        } else if(c<0x200e) {
            // Neither property is true between Latin-1 and U+200E.
            return false;
        } else if(c<=0x3030) {
            return testBit2000(syntaxOrWhiteSpace2000, c);
        } else {
            // Above U+3030 there is only Pattern_Syntax, no Pattern_White_Space.
            return isSyntaxAbove3030(c);
        }
    }

    /**
     * Tests whether a code point has the Pattern_White_Space property.
     * @param c the code point to test; negative values and values above U+2029
     *          are accepted and yield false
     * @return true if c is a Pattern_White_Space character.
     */
    public static boolean isWhiteSpace(int c) {
        if(c<0) {
            return false;
        } else if(c<=0xff) {
            return latin1[c]==LATIN1_WHITE_SPACE;
        } else if(0x200e<=c && c<=0x2029) {
            // Only the two ends of this range are white space: 200E 200F 2028 2029.
            return c<=0x200f || 0x2028<=c;
        } else {
            return false;
        }
    }

    /**
     * Skips over Pattern_White_Space starting at index i of the CharSequence.
     * @param s the text to scan
     * @param i the index at which to start; a negative value is passed on to
     *          {@code s.charAt()} unchecked
     * @return The smallest index at or after i with a non-white space character,
     *         or i itself if i is at or beyond the end of s,
     *         or {@code s.length()} if only white space follows.
     */
    public static int skipWhiteSpace(CharSequence s, int i) {
        int length=s.length();
        while(i<length && isWhiteSpace(s.charAt(i))) {
            ++i;
        }
        return i;
    }

    /**
     * Removes leading and trailing Pattern_White_Space.
     * @param s the string to trim
     * @return s except with leading and trailing Pattern_White_Space removed;
     *         s itself if there is nothing to remove.
     */
    public static String trimWhiteSpace(String s) {
        int length=s.length();
        int start=0;
        while(start<length && isWhiteSpace(s.charAt(start))) {
            ++start;
        }
        int limit=length;
        // If the string is all white space then start==limit here and this loop does not run.
        while(start<limit && isWhiteSpace(s.charAt(limit-1))) {
            --limit;
        }
        if(start==0 && limit==length) {
            return s;
        }
        return s.substring(start, limit);
    }

    /**
     * Tests whether the CharSequence contains a "pattern identifier", that is,
     * whether it contains only non-Pattern_White_Space, non-Pattern_Syntax characters.
     * An empty CharSequence is not an identifier.
     * @param s the text to test
     * @return true if s is not empty and
     *         there are no Pattern_White_Space or Pattern_Syntax characters in s.
     */
    public static boolean isIdentifier(CharSequence s) {
        return isIdentifier(s, 0, s.length());
    }

    /**
     * Tests whether the CharSequence contains a "pattern identifier", that is,
     * whether it contains only non-Pattern_White_Space, non-Pattern_Syntax characters.
     * An empty range (start&gt;=limit) is not an identifier.
     * @param s the text to test
     * @param start the index of the first character to test; not range-checked here
     * @param limit the index after the last character to test; not range-checked here
     * @return true if the range is not empty and
     *         there are no Pattern_White_Space or Pattern_Syntax characters
     *         in s between start and (exclusive) limit.
     */
    public static boolean isIdentifier(CharSequence s, int start, int limit) {
        if(start>=limit) {
            return false;
        }
        for(int i=start; i<limit; ++i) {
            if(isSyntaxOrWhiteSpace(s.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Skips over a "pattern identifier" starting at index i of the CharSequence.
     * @param s the text to scan
     * @param i the index at which to start; a negative value is passed on to
     *          {@code s.charAt()} unchecked
     * @return The smallest index at or after i with
     *         a Pattern_White_Space or Pattern_Syntax character,
     *         or i itself if i is at or beyond the end of s,
     *         or {@code s.length()} if there is no such character.
     */
    public static int skipIdentifier(CharSequence s, int i) {
        int length=s.length();
        while(i<length && !isSyntaxOrWhiteSpace(s.charAt(i))) {
            ++i;
        }
        return i;
    }

    /**
     * Looks up the bit for c in one of the two bit set tables for U+2000..U+303F.
     * The caller must ensure that c is in that range.
     * @param bitSets either syntax2000 or syntaxOrWhiteSpace2000
     * @param c a code point in U+2000..U+303F
     * @return true if the bit for c is set in the table
     */
    private static boolean testBit2000(int[] bitSets, int c) {
        // index2000 selects the 32-bit word for the block of 32 code points that contains c;
        // the low 5 bits of c select the bit inside that word (U+2000 is a multiple of 32).
        int bits=bitSets[index2000[(c-0x2000)>>5]];
        return ((bits>>(c&0x1f))&1)!=0;
    }

    /**
     * Tests c against the Pattern_Syntax code points above U+3030,
     * which are FD3E FD3F FE45 FE46.
     * @param c a code point
     * @return true if c is one of those four code points
     */
    private static boolean isSyntaxAbove3030(int c) {
        return (0xfd3e<=c && c<=0xfd3f) || (0xfe45<=c && c<=0xfe46);
    }

    /*
     * One byte per Latin-1 character.
     * Bit 0 is set if either Pattern property is true,
     * bit 1 if Pattern_Syntax is true,
     * bit 2 if Pattern_White_Space is true.
     * That is, Pattern_Syntax is encoded as 3 and Pattern_White_Space as 5.
     */
    private static final byte[] latin1={  // 256
        // WS: 9..D
        0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        // WS: 20  Syntax: 21..2F
        5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        // Syntax: 3A..40
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3,
        3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        // Syntax: 5B..5E
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
        // Syntax: 60
        3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        // Syntax: 7B..7E
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
        // WS: 85
        0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        // Syntax: A1..A7, A9, AB, AC, AE
        0, 3, 3, 3, 3, 3, 3, 3, 0, 3, 0, 3, 3, 0, 3, 0,
        // Syntax: B0, B1, B6, BB, BF
        3, 3, 0, 0, 0, 0, 3, 0, 0, 0, 0, 3, 0, 0, 0, 3,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        // Syntax: D7
        0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        // Syntax: F7
        0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0
    };

    /*
     * One byte per 32 characters from U+2000..U+303F indexing into
     * a small table of 32-bit data words.
     * The first two data words are all-zeros and all-ones.
     */
    private static final byte[] index2000={  // 130
        2, 3, 4, 0, 0, 0, 0, 0,  // 20xx
        0, 0, 0, 0, 5, 1, 1, 1,  // 21xx
        1, 1, 1, 1, 1, 1, 1, 1,  // 22xx
        1, 1, 1, 1, 1, 1, 1, 1,  // 23xx
        1, 1, 1, 0, 0, 0, 0, 0,  // 24xx
        1, 1, 1, 1, 1, 1, 1, 1,  // 25xx
        1, 1, 1, 1, 1, 1, 1, 1,  // 26xx
        1, 1, 1, 6, 7, 1, 1, 1,  // 27xx
        1, 1, 1, 1, 1, 1, 1, 1,  // 28xx
        1, 1, 1, 1, 1, 1, 1, 1,  // 29xx
        1, 1, 1, 1, 1, 1, 1, 1,  // 2Axx
        1, 1, 1, 1, 1, 1, 1, 1,  // 2Bxx
        0, 0, 0, 0, 0, 0, 0, 0,  // 2Cxx
        0, 0, 0, 0, 0, 0, 0, 0,  // 2Dxx
        1, 1, 1, 1, 0, 0, 0, 0,  // 2Exx
        0, 0, 0, 0, 0, 0, 0, 0,  // 2Fxx
        8, 9  // 3000..303F
    };

    /*
     * One 32-bit integer per 32 characters. Ranges of all-false and all-true
     * are mapped to the first two values, other ranges map to appropriate bit patterns.
     */
    private static final int[] syntax2000={
        0,
        -1,
        0xffff0000,  // 2: 2010..201F
        0x7fff00ff,  // 3: 2020..2027, 2030..203E
        0x7feffffe,  // 4: 2041..2053, 2055..205E
        0xffff0000,  // 5: 2190..219F
        0x003fffff,  // 6: 2760..2775
        0xfff00000,  // 7: 2794..279F
        0xffffff0e,  // 8: 3001..3003, 3008..301F
        0x00010001   // 9: 3020, 3030
    };

    /*
     * Same as syntax2000, but with additional bits set for the
     * Pattern_White_Space characters 200E 200F 2028 2029.
     */
    private static final int[] syntaxOrWhiteSpace2000={
        0,
        -1,
        0xffffc000,  // 2: 200E..201F
        0x7fff03ff,  // 3: 2020..2029, 2030..203E
        0x7feffffe,  // 4: 2041..2053, 2055..205E
        0xffff0000,  // 5: 2190..219F
        0x003fffff,  // 6: 2760..2775
        0xfff00000,  // 7: 2794..279F
        0xffffff0e,  // 8: 3001..3003, 3008..301F
        0x00010001   // 9: 3020, 3030
    };
}
265