1/*
2 *******************************************************************************
3 * Copyright (C) 2006-2009, Google, International Business Machines Corporation *
4 * and others. All Rights Reserved.                                            *
5 *******************************************************************************
6 */
7package com.ibm.icu.impl;
8
9import com.ibm.icu.text.UTF16;
10import com.ibm.icu.text.UnicodeSet;
11
12/**
13 * A simple parsing class for patterns and rules. Handles '...' quotations, \\uxxxx and \\Uxxxxxxxx, and symple syntax.
14 * The '' (two quotes) is treated as a single quote, inside or outside a quote
15 * <ul>
16 * <li>Any ignorable characters are ignored in parsing.</li>
17 * <li>Any syntax characters are broken into separate tokens</li>
18 * <li>Quote characters can be specified: '...', "...", and \x </li>
19 * <li>Other characters are treated as literals</li>
20 * </ul>
21 */
22public class PatternTokenizer {
23    // settings used in the interpretation of the pattern
24    private UnicodeSet ignorableCharacters = new UnicodeSet();
25    private UnicodeSet syntaxCharacters = new UnicodeSet();
26    private UnicodeSet extraQuotingCharacters = new UnicodeSet();
27    private UnicodeSet escapeCharacters = new UnicodeSet();
28    private boolean usingSlash = false;
29    private boolean usingQuote = false;
30
31    // transient data, set when needed. Null it out for any changes in the above fields.
32    private transient UnicodeSet needingQuoteCharacters = null;
33
34    // data about the current pattern being parsed. start gets moved as we go along.
35    private int start;
36    private int limit;
37    private String pattern;
38
39    public UnicodeSet getIgnorableCharacters() {
40        return (UnicodeSet) ignorableCharacters.clone();
41    }
42    /**
43     * Sets the characters to be ignored in parsing, eg new UnicodeSet("[:pattern_whitespace:]");
44     * @param ignorableCharacters Characters to be ignored.
45     * @return A PatternTokenizer object in which characters are specified as ignored characters.
46     */
47    public PatternTokenizer setIgnorableCharacters(UnicodeSet ignorableCharacters) {
48        this.ignorableCharacters = (UnicodeSet) ignorableCharacters.clone();
49        needingQuoteCharacters = null;
50        return this;
51    }
52    public UnicodeSet getSyntaxCharacters() {
53        return (UnicodeSet) syntaxCharacters.clone();
54    }
55    public UnicodeSet getExtraQuotingCharacters() {
56        return (UnicodeSet) extraQuotingCharacters.clone();
57    }
58    /**
59     *  Sets the characters to be interpreted as syntax characters in parsing, eg new UnicodeSet("[:pattern_syntax:]")
60     * @param syntaxCharacters Characters to be set as syntax characters.
61     * @return A PatternTokenizer object in which characters are specified as syntax characters.
62     */
63    public PatternTokenizer setSyntaxCharacters(UnicodeSet syntaxCharacters) {
64        this.syntaxCharacters = (UnicodeSet) syntaxCharacters.clone();
65        needingQuoteCharacters = null;
66        return this;
67    }
68    /**
69     *  Sets the extra characters to be quoted in literals
70     * @param syntaxCharacters Characters to be set as extra quoting characters.
71     * @return A PatternTokenizer object in which characters are specified as extra quoting characters.
72     */
73    public PatternTokenizer setExtraQuotingCharacters(UnicodeSet syntaxCharacters) {
74        this.extraQuotingCharacters = (UnicodeSet) syntaxCharacters.clone();
75        needingQuoteCharacters = null;
76        return this;
77    }
78
79    public UnicodeSet getEscapeCharacters() {
80        return (UnicodeSet) escapeCharacters.clone();
81    }
82    /**
83     * Set characters to be escaped in literals, in quoteLiteral and normalize, eg new UnicodeSet("[^\\u0020-\\u007E]");
84     * @param escapeCharacters Characters to be set as escape characters.
85     * @return A PatternTokenizer object in which characters are specified as escape characters.
86     */
87    public PatternTokenizer setEscapeCharacters(UnicodeSet escapeCharacters) {
88        this.escapeCharacters = (UnicodeSet) escapeCharacters.clone();
89        return this;
90    }
91    public boolean isUsingQuote() {
92        return usingQuote;
93    }
94    public PatternTokenizer setUsingQuote(boolean usingQuote) {
95        this.usingQuote = usingQuote;
96        needingQuoteCharacters = null;
97        return this;
98    }
99    public boolean isUsingSlash() {
100        return usingSlash;
101    }
102    public PatternTokenizer setUsingSlash(boolean usingSlash) {
103        this.usingSlash = usingSlash;
104        needingQuoteCharacters = null;
105        return this;
106    }
107    //    public UnicodeSet getQuoteCharacters() {
108//  return (UnicodeSet) quoteCharacters.clone();
109//  }
110//  public PatternTokenizer setQuoteCharacters(UnicodeSet quoteCharacters) {
111//  this.quoteCharacters = (UnicodeSet) quoteCharacters.clone();
112//  needingQuoteCharacters = null;
113//  return this;
114//  }
115    public int getLimit() {
116        return limit;
117    }
118    public PatternTokenizer setLimit(int limit) {
119        this.limit = limit;
120        return this;
121    }
122    public int getStart() {
123        return start;
124    }
125    public PatternTokenizer setStart(int start) {
126        this.start = start;
127        return this;
128    }
129
130    public PatternTokenizer setPattern(CharSequence pattern) {
131        return setPattern(pattern.toString());
132    }
133
134    public PatternTokenizer setPattern(String pattern) {
135        if (pattern == null) {
136            throw new IllegalArgumentException("Inconsistent arguments");
137        }
138        this.start = 0;
139        this.limit = pattern.length();
140        this.pattern = pattern;
141        return this;
142    }
143
144    public static final char SINGLE_QUOTE = '\'';
145    public static final char BACK_SLASH = '\\';
146    private static int NO_QUOTE = -1, IN_QUOTE = -2;
147
148    public String quoteLiteral(CharSequence string) {
149        return quoteLiteral(string.toString());
150    }
151
152    /**
153     * Quote a literal string, using the available settings. Thus syntax characters, quote characters, and ignorable characters will be put into quotes.
154     * @param string String passed to quote a literal string.
155     * @return A string using the available settings will place syntax, quote, or ignorable characters into quotes.
156     */
157    public String quoteLiteral(String string) {
158        if (needingQuoteCharacters == null) {
159            needingQuoteCharacters = new UnicodeSet().addAll(syntaxCharacters).addAll(ignorableCharacters).addAll(extraQuotingCharacters); // .addAll(quoteCharacters)
160            if (usingSlash) needingQuoteCharacters.add(BACK_SLASH);
161            if (usingQuote) needingQuoteCharacters.add(SINGLE_QUOTE);
162        }
163        StringBuffer result = new StringBuffer();
164        int quotedChar = NO_QUOTE;
165        int cp;
166        for (int i = 0; i < string.length(); i += UTF16.getCharCount(cp)) {
167            cp = UTF16.charAt(string, i);
168            if (escapeCharacters.contains(cp)) {
169                // we may have to fix up previous characters
170                if (quotedChar == IN_QUOTE) {
171                    result.append(SINGLE_QUOTE);
172                    quotedChar = NO_QUOTE;
173                }
174                appendEscaped(result, cp);
175                continue;
176            }
177
178            if (needingQuoteCharacters.contains(cp)) {
179                // if we have already started a quote
180                if (quotedChar == IN_QUOTE) {
181                    UTF16.append(result, cp);
182                    if (usingQuote && cp == SINGLE_QUOTE) { // double it
183                        result.append(SINGLE_QUOTE);
184                    }
185                    continue;
186                }
187                // otherwise not already in quote
188                if (usingSlash) {
189                    result.append(BACK_SLASH);
190                    UTF16.append(result, cp);
191                    continue;
192                }
193                if (usingQuote) {
194                    if (cp == SINGLE_QUOTE) { // double it and continue
195                        result.append(SINGLE_QUOTE);
196                        result.append(SINGLE_QUOTE);
197                        continue;
198                    }
199                    result.append(SINGLE_QUOTE);
200                    UTF16.append(result, cp);
201                    quotedChar = IN_QUOTE;
202                    continue;
203                }
204                // we have no choice but to use \\u or \\U
205                appendEscaped(result, cp);
206                continue;
207            }
208            // otherwise cp doesn't need quoting
209            // we may have to fix up previous characters
210            if (quotedChar == IN_QUOTE) {
211                result.append(SINGLE_QUOTE);
212                quotedChar = NO_QUOTE;
213            }
214            UTF16.append(result, cp);
215        }
216        // all done.
217        // we may have to fix up previous characters
218        if (quotedChar == IN_QUOTE) {
219            result.append(SINGLE_QUOTE);
220        }
221        return result.toString();
222    }
223
224    private void appendEscaped(StringBuffer result, int cp) {
225        if (cp <= 0xFFFF) {
226            result.append("\\u").append(Utility.hex(cp,4));
227        } else {
228            result.append("\\U").append(Utility.hex(cp,8));
229        }
230    }
231
232    public String normalize() {
233        int oldStart = start;
234        StringBuffer result = new StringBuffer();
235        StringBuffer buffer = new StringBuffer();
236        while (true) {
237            buffer.setLength(0);
238            int status = next(buffer);
239            if (status == DONE) {
240                start = oldStart;
241                return result.toString();
242            }
243            if (status != SYNTAX) {
244                result.append(quoteLiteral(buffer));
245            } else {
246                result.append(buffer);
247            }
248        }
249    }
250
251    public static final int DONE = 0, SYNTAX = 1, LITERAL = 2, BROKEN_QUOTE = 3, BROKEN_ESCAPE = 4, UNKNOWN = 5;
252
253    private static final int AFTER_QUOTE = -1, NONE = 0, START_QUOTE = 1, NORMAL_QUOTE = 2, SLASH_START = 3, HEX = 4;
254
255    public int next(StringBuffer buffer) {
256        if (start >= limit) return DONE;
257        int status = UNKNOWN;
258        int lastQuote = UNKNOWN;
259        int quoteStatus = NONE;
260        int hexCount = 0;
261        int hexValue = 0;
262        int cp;
263        main:
264            for (int i = start; i < limit; i += UTF16.getCharCount(cp)) {
265                cp = UTF16.charAt(pattern, i);
266                // if we are in a quote, then handle it.
267                switch (quoteStatus) {
268                case SLASH_START:
269                    switch (cp) {
270                    case 'u':
271                        quoteStatus = HEX;
272                        hexCount = 4;
273                        hexValue = 0;
274                        continue main;
275                    case 'U':
276                        quoteStatus = HEX;
277                        hexCount = 8;
278                        hexValue = 0;
279                        continue main;
280                    default:
281                        if (usingSlash) {
282                            UTF16.append(buffer, cp);
283                            quoteStatus = NONE;
284                            continue main;
285                        } else {
286                            buffer.append(BACK_SLASH);
287                            quoteStatus = NONE;
288                        }
289                    }
290                    break; // fall through to NONE
291                case HEX:
292                    hexValue <<= 4;
293                    hexValue += cp;
294                    switch (cp) {
295                    case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
296                        hexValue -= '0'; break;
297                    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
298                        hexValue -= 'a' - 10; break;
299                    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
300                        hexValue -= 'A' - 10; break;
301                    default:
302                        start = i;
303                    return BROKEN_ESCAPE;
304                    }
305                    --hexCount;
306                    if (hexCount == 0) {
307                        quoteStatus = NONE;
308                        UTF16.append(buffer, hexValue);
309                    }
310                    continue main;
311                case AFTER_QUOTE:
312                    // see if we get another quote character
313                    // if we just ended a quote BUT the following character is the lastQuote character, then we have a situation like '...''...', so we restart the quote
314                    if (cp == lastQuote) {
315                        UTF16.append(buffer, cp);
316                        quoteStatus = NORMAL_QUOTE;
317                        continue main;
318                    }
319                    quoteStatus = NONE;
320                    break; // fall through to NONE
321                case START_QUOTE:
322                    // if we are at the very start of a quote, and we hit another quote mark then we emit a literal quote character and end the quote
323                    if (cp == lastQuote) {
324                        UTF16.append(buffer, cp);
325                        quoteStatus = NONE; // get out of quote, with no trace remaining
326                        continue;
327                    }
328                    // otherwise get into quote
329                    UTF16.append(buffer, cp);
330                    quoteStatus = NORMAL_QUOTE;
331                    continue main;
332                case NORMAL_QUOTE:
333                    if (cp == lastQuote) {
334                        quoteStatus = AFTER_QUOTE; // get out of quote
335                        continue main;
336                    }
337                    UTF16.append(buffer, cp);
338                    continue main;
339                }
340
341                if (ignorableCharacters.contains(cp)) {
342                    continue;
343                }
344                // do syntax characters
345                if (syntaxCharacters.contains(cp)) {
346                    if (status == UNKNOWN) {
347                        UTF16.append(buffer, cp);
348                        start = i + UTF16.getCharCount(cp);
349                        return SYNTAX;
350                    } else { // LITERAL, so back up and break
351                        start = i;
352                        return status;
353                    }
354                }
355                // otherwise it is a literal; keep on going
356                status = LITERAL;
357                if (cp == BACK_SLASH) {
358                    quoteStatus = SLASH_START;
359                    continue;
360                } else if (usingQuote && cp == SINGLE_QUOTE) {
361                    lastQuote = cp;
362                    quoteStatus = START_QUOTE;
363                    continue;
364                }
365                // normal literals
366                UTF16.append(buffer, cp);
367            }
368        // handle final cleanup
369        start = limit;
370        switch (quoteStatus) {
371        case HEX:
372            status = BROKEN_ESCAPE;
373            break;
374        case SLASH_START:
375            if (usingSlash) {
376                status = BROKEN_ESCAPE;
377            } else {
378                buffer.append(BACK_SLASH);
379            }
380            break;
381        case START_QUOTE: case NORMAL_QUOTE:
382            status = BROKEN_QUOTE;
383            break;
384        }
385        return status;
386    }
387
388
389}
390//eof
391