17935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/*
27935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *******************************************************************************
37935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Copyright (C) 2006-2009, Google, International Business Machines Corporation *
47935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * and others. All Rights Reserved.                                            *
57935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *******************************************************************************
67935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */
77935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertpackage com.ibm.icu.impl;
87935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
97935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.text.UTF16;
107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.text.UnicodeSet;
117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/**
 * A simple parsing class for patterns and rules. Handles '...' quotations, \\uxxxx and \\Uxxxxxxxx, and simple syntax.
 * The '' (two quotes) is treated as a single quote, inside or outside a quote.
157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <ul>
167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <li>Any ignorable characters are ignored in parsing.</li>
177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <li>Any syntax characters are broken into separate tokens</li>
187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <li>Quote characters can be specified: '...', "...", and \x </li>
197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <li>Other characters are treated as literals</li>
207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * </ul>
217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */
227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertpublic class PatternTokenizer {
237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    // settings used in the interpretation of the pattern
247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private UnicodeSet ignorableCharacters = new UnicodeSet();
257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private UnicodeSet syntaxCharacters = new UnicodeSet();
267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private UnicodeSet extraQuotingCharacters = new UnicodeSet();
277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private UnicodeSet escapeCharacters = new UnicodeSet();
287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private boolean usingSlash = false;
297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private boolean usingQuote = false;
307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    // transient data, set when needed. Null it out for any changes in the above fields.
327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private transient UnicodeSet needingQuoteCharacters = null;
337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    // data about the current pattern being parsed. start gets moved as we go along.
357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private int start;
367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private int limit;
377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private String pattern;
387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public UnicodeSet getIgnorableCharacters() {
407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return (UnicodeSet) ignorableCharacters.clone();
417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * Sets the characters to be ignored in parsing, eg new UnicodeSet("[:pattern_whitespace:]");
447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * @param ignorableCharacters Characters to be ignored.
457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * @return A PatternTokenizer object in which characters are specified as ignored characters.
467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     */
477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public PatternTokenizer setIgnorableCharacters(UnicodeSet ignorableCharacters) {
487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        this.ignorableCharacters = (UnicodeSet) ignorableCharacters.clone();
497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        needingQuoteCharacters = null;
507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return this;
517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public UnicodeSet getSyntaxCharacters() {
537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return (UnicodeSet) syntaxCharacters.clone();
547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public UnicodeSet getExtraQuotingCharacters() {
567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return (UnicodeSet) extraQuotingCharacters.clone();
577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     *  Sets the characters to be interpreted as syntax characters in parsing, eg new UnicodeSet("[:pattern_syntax:]")
607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * @param syntaxCharacters Characters to be set as syntax characters.
617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * @return A PatternTokenizer object in which characters are specified as syntax characters.
627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     */
637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public PatternTokenizer setSyntaxCharacters(UnicodeSet syntaxCharacters) {
647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        this.syntaxCharacters = (UnicodeSet) syntaxCharacters.clone();
657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        needingQuoteCharacters = null;
667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return this;
677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     *  Sets the extra characters to be quoted in literals
707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * @param syntaxCharacters Characters to be set as extra quoting characters.
717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * @return A PatternTokenizer object in which characters are specified as extra quoting characters.
727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     */
737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public PatternTokenizer setExtraQuotingCharacters(UnicodeSet syntaxCharacters) {
747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        this.extraQuotingCharacters = (UnicodeSet) syntaxCharacters.clone();
757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        needingQuoteCharacters = null;
767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return this;
777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public UnicodeSet getEscapeCharacters() {
807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return (UnicodeSet) escapeCharacters.clone();
817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * Set characters to be escaped in literals, in quoteLiteral and normalize, eg new UnicodeSet("[^\\u0020-\\u007E]");
847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * @param escapeCharacters Characters to be set as escape characters.
857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * @return A PatternTokenizer object in which characters are specified as escape characters.
867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     */
877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public PatternTokenizer setEscapeCharacters(UnicodeSet escapeCharacters) {
887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        this.escapeCharacters = (UnicodeSet) escapeCharacters.clone();
897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return this;
907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public boolean isUsingQuote() {
927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return usingQuote;
937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public PatternTokenizer setUsingQuote(boolean usingQuote) {
957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        this.usingQuote = usingQuote;
967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        needingQuoteCharacters = null;
977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return this;
987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public boolean isUsingSlash() {
1007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return usingSlash;
1017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
1027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public PatternTokenizer setUsingSlash(boolean usingSlash) {
1037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        this.usingSlash = usingSlash;
1047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        needingQuoteCharacters = null;
1057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return this;
1067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
1077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    //    public UnicodeSet getQuoteCharacters() {
1087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert//  return (UnicodeSet) quoteCharacters.clone();
1097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert//  }
1107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert//  public PatternTokenizer setQuoteCharacters(UnicodeSet quoteCharacters) {
1117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert//  this.quoteCharacters = (UnicodeSet) quoteCharacters.clone();
1127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert//  needingQuoteCharacters = null;
1137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert//  return this;
1147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert//  }
1157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public int getLimit() {
1167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return limit;
1177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
1187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public PatternTokenizer setLimit(int limit) {
1197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        this.limit = limit;
1207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return this;
1217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
1227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public int getStart() {
1237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return start;
1247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
1257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public PatternTokenizer setStart(int start) {
1267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        this.start = start;
1277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return this;
1287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
1297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public PatternTokenizer setPattern(CharSequence pattern) {
1317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return setPattern(pattern.toString());
1327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
1337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public PatternTokenizer setPattern(String pattern) {
1357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if (pattern == null) {
1367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            throw new IllegalArgumentException("Inconsistent arguments");
1377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
1387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        this.start = 0;
1397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        this.limit = pattern.length();
1407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        this.pattern = pattern;
1417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return this;
1427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
1437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public static final char SINGLE_QUOTE = '\'';
1457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public static final char BACK_SLASH = '\\';
1467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private static int NO_QUOTE = -1, IN_QUOTE = -2;
1477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public String quoteLiteral(CharSequence string) {
1497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return quoteLiteral(string.toString());
1507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
1517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
1537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * Quote a literal string, using the available settings. Thus syntax characters, quote characters, and ignorable characters will be put into quotes.
1547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * @param string String passed to quote a literal string.
1557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * @return A string using the available settings will place syntax, quote, or ignorable characters into quotes.
1567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     */
1577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public String quoteLiteral(String string) {
1587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if (needingQuoteCharacters == null) {
1597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            needingQuoteCharacters = new UnicodeSet().addAll(syntaxCharacters).addAll(ignorableCharacters).addAll(extraQuotingCharacters); // .addAll(quoteCharacters)
1607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            if (usingSlash) needingQuoteCharacters.add(BACK_SLASH);
1617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            if (usingQuote) needingQuoteCharacters.add(SINGLE_QUOTE);
1627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
1637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        StringBuffer result = new StringBuffer();
1647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int quotedChar = NO_QUOTE;
1657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int cp;
1667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        for (int i = 0; i < string.length(); i += UTF16.getCharCount(cp)) {
1677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            cp = UTF16.charAt(string, i);
1687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            if (escapeCharacters.contains(cp)) {
1697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                // we may have to fix up previous characters
1707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if (quotedChar == IN_QUOTE) {
1717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    result.append(SINGLE_QUOTE);
1727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    quotedChar = NO_QUOTE;
1737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                }
1747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                appendEscaped(result, cp);
1757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                continue;
1767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
1777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            if (needingQuoteCharacters.contains(cp)) {
1797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                // if we have already started a quote
1807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if (quotedChar == IN_QUOTE) {
1817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    UTF16.append(result, cp);
1827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    if (usingQuote && cp == SINGLE_QUOTE) { // double it
1837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        result.append(SINGLE_QUOTE);
1847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    }
1857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    continue;
1867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                }
1877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                // otherwise not already in quote
1887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if (usingSlash) {
1897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    result.append(BACK_SLASH);
1907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    UTF16.append(result, cp);
1917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    continue;
1927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                }
1937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if (usingQuote) {
1947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    if (cp == SINGLE_QUOTE) { // double it and continue
1957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        result.append(SINGLE_QUOTE);
1967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        result.append(SINGLE_QUOTE);
1977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        continue;
1987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    }
1997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    result.append(SINGLE_QUOTE);
2007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    UTF16.append(result, cp);
2017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    quotedChar = IN_QUOTE;
2027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    continue;
2037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                }
2047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                // we have no choice but to use \\u or \\U
2057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                appendEscaped(result, cp);
2067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                continue;
2077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
2087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // otherwise cp doesn't need quoting
2097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // we may have to fix up previous characters
2107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            if (quotedChar == IN_QUOTE) {
2117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                result.append(SINGLE_QUOTE);
2127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                quotedChar = NO_QUOTE;
2137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
2147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            UTF16.append(result, cp);
2157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
2167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // all done.
2177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // we may have to fix up previous characters
2187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if (quotedChar == IN_QUOTE) {
2197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            result.append(SINGLE_QUOTE);
2207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
2217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return result.toString();
2227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
2237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
2247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private void appendEscaped(StringBuffer result, int cp) {
2257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if (cp <= 0xFFFF) {
2267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            result.append("\\u").append(Utility.hex(cp,4));
2277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        } else {
2287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            result.append("\\U").append(Utility.hex(cp,8));
2297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
2307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
2317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
2327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public String normalize() {
2337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int oldStart = start;
2347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        StringBuffer result = new StringBuffer();
2357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        StringBuffer buffer = new StringBuffer();
2367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        while (true) {
2377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            buffer.setLength(0);
2387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            int status = next(buffer);
2397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            if (status == DONE) {
2407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                start = oldStart;
2417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                return result.toString();
2427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
2437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            if (status != SYNTAX) {
2447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                result.append(quoteLiteral(buffer));
2457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            } else {
2467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                result.append(buffer);
2477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
2487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
2497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
2507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
2517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public static final int DONE = 0, SYNTAX = 1, LITERAL = 2, BROKEN_QUOTE = 3, BROKEN_ESCAPE = 4, UNKNOWN = 5;
2527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
2537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private static final int AFTER_QUOTE = -1, NONE = 0, START_QUOTE = 1, NORMAL_QUOTE = 2, SLASH_START = 3, HEX = 4;
2547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
2557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public int next(StringBuffer buffer) {
2567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if (start >= limit) return DONE;
2577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int status = UNKNOWN;
2587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int lastQuote = UNKNOWN;
2597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int quoteStatus = NONE;
2607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int hexCount = 0;
2617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int hexValue = 0;
2627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int cp;
2637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        main:
2647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            for (int i = start; i < limit; i += UTF16.getCharCount(cp)) {
2657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                cp = UTF16.charAt(pattern, i);
2667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                // if we are in a quote, then handle it.
2677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                switch (quoteStatus) {
2687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                case SLASH_START:
2697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    switch (cp) {
2707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    case 'u':
2717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        quoteStatus = HEX;
2727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        hexCount = 4;
2737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        hexValue = 0;
2747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        continue main;
2757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    case 'U':
2767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        quoteStatus = HEX;
2777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        hexCount = 8;
2787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        hexValue = 0;
2797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        continue main;
2807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    default:
2817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        if (usingSlash) {
2827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                            UTF16.append(buffer, cp);
2837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                            quoteStatus = NONE;
2847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                            continue main;
2857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        } else {
2867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                            buffer.append(BACK_SLASH);
2877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                            quoteStatus = NONE;
2887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        }
2897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    }
2907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    break; // fall through to NONE
2917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                case HEX:
2927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    hexValue <<= 4;
2937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    hexValue += cp;
2947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    switch (cp) {
2957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
2967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        hexValue -= '0'; break;
2977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
2987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        hexValue -= 'a' - 10; break;
2997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
3007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        hexValue -= 'A' - 10; break;
3017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    default:
3027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        start = i;
3037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    return BROKEN_ESCAPE;
3047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    }
3057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    --hexCount;
3067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    if (hexCount == 0) {
3077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        quoteStatus = NONE;
3087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        UTF16.append(buffer, hexValue);
3097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    }
3107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    continue main;
3117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                case AFTER_QUOTE:
3127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    // see if we get another quote character
3137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    // if we just ended a quote BUT the following character is the lastQuote character, then we have a situation like '...''...', so we restart the quote
3147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    if (cp == lastQuote) {
3157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        UTF16.append(buffer, cp);
3167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        quoteStatus = NORMAL_QUOTE;
3177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        continue main;
3187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    }
3197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    quoteStatus = NONE;
3207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    break; // fall through to NONE
3217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                case START_QUOTE:
3227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    // if we are at the very start of a quote, and we hit another quote mark then we emit a literal quote character and end the quote
3237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    if (cp == lastQuote) {
3247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        UTF16.append(buffer, cp);
3257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        quoteStatus = NONE; // get out of quote, with no trace remaining
3267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        continue;
3277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    }
3287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    // otherwise get into quote
3297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    UTF16.append(buffer, cp);
3307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    quoteStatus = NORMAL_QUOTE;
3317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    continue main;
3327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                case NORMAL_QUOTE:
3337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    if (cp == lastQuote) {
3347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        quoteStatus = AFTER_QUOTE; // get out of quote
3357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        continue main;
3367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    }
3377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    UTF16.append(buffer, cp);
3387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    continue main;
3397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                }
3407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
3417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if (ignorableCharacters.contains(cp)) {
3427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    continue;
3437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                }
3447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                // do syntax characters
3457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if (syntaxCharacters.contains(cp)) {
3467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    if (status == UNKNOWN) {
3477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        UTF16.append(buffer, cp);
3487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        start = i + UTF16.getCharCount(cp);
3497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        return SYNTAX;
3507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    } else { // LITERAL, so back up and break
3517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        start = i;
3527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        return status;
3537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    }
3547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                }
3557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                // otherwise it is a literal; keep on going
3567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                status = LITERAL;
3577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if (cp == BACK_SLASH) {
3587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    quoteStatus = SLASH_START;
3597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    continue;
3607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                } else if (usingQuote && cp == SINGLE_QUOTE) {
3617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    lastQuote = cp;
3627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    quoteStatus = START_QUOTE;
3637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    continue;
3647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                }
3657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                // normal literals
3667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                UTF16.append(buffer, cp);
3677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
3687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // handle final cleanup
3697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        start = limit;
3707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        switch (quoteStatus) {
3717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        case HEX:
3727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            status = BROKEN_ESCAPE;
3737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            break;
3747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        case SLASH_START:
3757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            if (usingSlash) {
3767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                status = BROKEN_ESCAPE;
3777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            } else {
3787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                buffer.append(BACK_SLASH);
3797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
3807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            break;
3817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        case START_QUOTE: case NORMAL_QUOTE:
3827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            status = BROKEN_QUOTE;
3837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            break;
3847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
3857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return status;
3867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
3877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
3887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
3897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert}
3907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert//eof
391