icu/impl/PatternTokenizer.java

/*
 *******************************************************************************
 * Copyright (C) 2006-2009, Google, International Business Machines Corporation *
 * and others. All Rights Reserved.                                            *
 *******************************************************************************
 */
package com.ibm.icu.impl;

import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;

/**
 * A simple parsing class for patterns and rules. Handles '...' quotations, \\uxxxx and \\Uxxxxxxxx, and symple syntax.
 * The '' (two quotes) is treated as a single quote, inside or outside a quote
 * <ul>
 * <li>Any ignorable characters are ignored in parsing.</li>
 * <li>Any syntax characters are broken into separate tokens</li>
 * <li>Quote characters can be specified: '...', "...", and \x </li>
 * <li>Other characters are treated as literals</li>
 * </ul>
 */
public class PatternTokenizer {
    // settings used in the interpretation of the pattern
    private UnicodeSet ignorableCharacters = new UnicodeSet();
    private UnicodeSet syntaxCharacters = new UnicodeSet();
    private UnicodeSet extraQuotingCharacters = new UnicodeSet();
    private UnicodeSet escapeCharacters = new UnicodeSet();
    private boolean usingSlash = false;
    private boolean usingQuote = false;

    // transient data, set when needed. Null it out for any changes in the above fields.
    private transient UnicodeSet needingQuoteCharacters = null;

    // data about the current pattern being parsed. start gets moved as we go along.
    private int start;
    private int limit;
    private String pattern;

    public UnicodeSet getIgnorableCharacters() {
        return (UnicodeSet) ignorableCharacters.clone();
    }
    /**
     * Sets the characters to be ignored in parsing, eg new UnicodeSet("[:pattern_whitespace:]");
     * @param ignorableCharacters Characters to be ignored.
     * @return A PatternTokenizer object in which characters are specified as ignored characters.
     */
    public PatternTokenizer setIgnorableCharacters(UnicodeSet ignorableCharacters) {
        this.ignorableCharacters = (UnicodeSet) ignorableCharacters.clone();
        needingQuoteCharacters = null;
        return this;
    }
    public UnicodeSet getSyntaxCharacters() {
        return (UnicodeSet) syntaxCharacters.clone();
    }
    public UnicodeSet getExtraQuotingCharacters() {
        return (UnicodeSet) extraQuotingCharacters.clone();
    }
    /**
     *  Sets the characters to be interpreted as syntax characters in parsing, eg new UnicodeSet("[:pattern_syntax:]")
     * @param syntaxCharacters Characters to be set as syntax characters.
     * @return A PatternTokenizer object in which characters are specified as syntax characters.
     */
    public PatternTokenizer setSyntaxCharacters(UnicodeSet syntaxCharacters) {
        this.syntaxCharacters = (UnicodeSet) syntaxCharacters.clone();
        needingQuoteCharacters = null;
        return this;
    }
    /**
     *  Sets the extra characters to be quoted in literals
     * @param syntaxCharacters Characters to be set as extra quoting characters.
     * @return A PatternTokenizer object in which characters are specified as extra quoting characters.
     */
    public PatternTokenizer setExtraQuotingCharacters(UnicodeSet syntaxCharacters) {
        this.extraQuotingCharacters = (UnicodeSet) syntaxCharacters.clone();
        needingQuoteCharacters = null;
        return this;
    }

    public UnicodeSet getEscapeCharacters() {
        return (UnicodeSet) escapeCharacters.clone();
    }
    /**
     * Set characters to be escaped in literals, in quoteLiteral and normalize, eg new UnicodeSet("[^\\u0020-\\u007E]");
     * @param escapeCharacters Characters to be set as escape characters.
     * @return A PatternTokenizer object in which characters are specified as escape characters.
     */
    public PatternTokenizer setEscapeCharacters(UnicodeSet escapeCharacters) {
        this.escapeCharacters = (UnicodeSet) escapeCharacters.clone();
        return this;
    }
    public boolean isUsingQuote() {
        return usingQuote;
    }
    public PatternTokenizer setUsingQuote(boolean usingQuote) {
        this.usingQuote = usingQuote;
        needingQuoteCharacters = null;
        return this;
    }
    public boolean isUsingSlash() {
        return usingSlash;
    }
    public PatternTokenizer setUsingSlash(boolean usingSlash) {
        this.usingSlash = usingSlash;
        needingQuoteCharacters = null;
        return this;
    }
    //    public UnicodeSet getQuoteCharacters() {
//  return (UnicodeSet) quoteCharacters.clone();
//  }
//  public PatternTokenizer setQuoteCharacters(UnicodeSet quoteCharacters) {
//  this.quoteCharacters = (UnicodeSet) quoteCharacters.clone();
//  needingQuoteCharacters = null;
//  return this;
//  }
    public int getLimit() {
        return limit;
    }
    public PatternTokenizer setLimit(int limit) {
        this.limit = limit;
        return this;
    }
    public int getStart() {
        return start;
    }
    public PatternTokenizer setStart(int start) {
        this.start = start;
        return this;
    }

    public PatternTokenizer setPattern(CharSequence pattern) {
        return setPattern(pattern.toString());
    }

    public PatternTokenizer setPattern(String pattern) {
        if (pattern == null) {
            throw new IllegalArgumentException("Inconsistent arguments");
        }
        this.start = 0;
        this.limit = pattern.length();
        this.pattern = pattern;
        return this;
    }

    public static final char SINGLE_QUOTE = '\'';
    public static final char BACK_SLASH = '\\';
    private static int NO_QUOTE = -1, IN_QUOTE = -2;

    public String quoteLiteral(CharSequence string) {
        return quoteLiteral(string.toString());
    }

    /**
     * Quote a literal string, using the available settings. Thus syntax characters, quote characters, and ignorable characters will be put into quotes.
     * @param string String passed to quote a literal string.
     * @return A string using the available settings will place syntax, quote, or ignorable characters into quotes.
     */
    public String quoteLiteral(String string) {
        if (needingQuoteCharacters == null) {
            needingQuoteCharacters = new UnicodeSet().addAll(syntaxCharacters).addAll(ignorableCharacters).addAll(extraQuotingCharacters); // .addAll(quoteCharacters)
            if (usingSlash) needingQuoteCharacters.add(BACK_SLASH);
            if (usingQuote) needingQuoteCharacters.add(SINGLE_QUOTE);
        }
        StringBuffer result = new StringBuffer();
        int quotedChar = NO_QUOTE;
        int cp;
        for (int i = 0; i < string.length(); i += UTF16.getCharCount(cp)) {
            cp = UTF16.charAt(string, i);
            if (escapeCharacters.contains(cp)) {
                // we may have to fix up previous characters
                if (quotedChar == IN_QUOTE) {
                    result.append(SINGLE_QUOTE);
                    quotedChar = NO_QUOTE;
                }
                appendEscaped(result, cp);
                continue;
            }

            if (needingQuoteCharacters.contains(cp)) {
                // if we have already started a quote
                if (quotedChar == IN_QUOTE) {
                    UTF16.append(result, cp);
                    if (usingQuote && cp == SINGLE_QUOTE) { // double it
                        result.append(SINGLE_QUOTE);
                    }
                    continue;
                }
                // otherwise not already in quote
                if (usingSlash) {
                    result.append(BACK_SLASH);
                    UTF16.append(result, cp);
                    continue;
                }
                if (usingQuote) {
                    if (cp == SINGLE_QUOTE) { // double it and continue
                        result.append(SINGLE_QUOTE);
                        result.append(SINGLE_QUOTE);
                        continue;
                    }
                    result.append(SINGLE_QUOTE);
                    UTF16.append(result, cp);
                    quotedChar = IN_QUOTE;
                    continue;
                }
                // we have no choice but to use \\u or \\U
                appendEscaped(result, cp);
                continue;
            }
            // otherwise cp doesn't need quoting
            // we may have to fix up previous characters
            if (quotedChar == IN_QUOTE) {
                result.append(SINGLE_QUOTE);
                quotedChar = NO_QUOTE;
            }
            UTF16.append(result, cp);
        }
        // all done.
        // we may have to fix up previous characters
        if (quotedChar == IN_QUOTE) {
            result.append(SINGLE_QUOTE);
        }
        return result.toString();
    }

    private void appendEscaped(StringBuffer result, int cp) {
        if (cp <= 0xFFFF) {
            result.append("\\u").append(Utility.hex(cp,4));
        } else {
            result.append("\\U").append(Utility.hex(cp,8));
        }
    }

    public String normalize() {
        int oldStart = start;
        StringBuffer result = new StringBuffer();
        StringBuffer buffer = new StringBuffer();
        while (true) {
            buffer.setLength(0);
            int status = next(buffer);
            if (status == DONE) {
                start = oldStart;
                return result.toString();
            }
            if (status != SYNTAX) {
                result.append(quoteLiteral(buffer));
            } else {
                result.append(buffer);
            }
        }
    }

    public static final int DONE = 0, SYNTAX = 1, LITERAL = 2, BROKEN_QUOTE = 3, BROKEN_ESCAPE = 4, UNKNOWN = 5;

    private static final int AFTER_QUOTE = -1, NONE = 0, START_QUOTE = 1, NORMAL_QUOTE = 2, SLASH_START = 3, HEX = 4;

    public int next(StringBuffer buffer) {
        if (start >= limit) return DONE;
        int status = UNKNOWN;
        int lastQuote = UNKNOWN;
        int quoteStatus = NONE;
        int hexCount = 0;
        int hexValue = 0;
        int cp;
        main:
            for (int i = start; i < limit; i += UTF16.getCharCount(cp)) {
                cp = UTF16.charAt(pattern, i);
                // if we are in a quote, then handle it.
                switch (quoteStatus) {
                case SLASH_START:
                    switch (cp) {
                    case 'u':
                        quoteStatus = HEX;
                        hexCount = 4;
                        hexValue = 0;
                        continue main;
                    case 'U':
                        quoteStatus = HEX;
                        hexCount = 8;
                        hexValue = 0;
                        continue main;
                    default:
                        if (usingSlash) {
                            UTF16.append(buffer, cp);
                            quoteStatus = NONE;
                            continue main;
                        } else {
                            buffer.append(BACK_SLASH);
                            quoteStatus = NONE;
                        }
                    }
                    break; // fall through to NONE
                case HEX:
                    hexValue <<= 4;
                    hexValue += cp;
                    switch (cp) {
                    case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
                        hexValue -= '0'; break;
                    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
                        hexValue -= 'a' - 10; break;
                    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
                        hexValue -= 'A' - 10; break;
                    default:
                        start = i;
                    return BROKEN_ESCAPE;
                    }
                    --hexCount;
                    if (hexCount == 0) {
                        quoteStatus = NONE;
                        UTF16.append(buffer, hexValue);
                    }
                    continue main;
                case AFTER_QUOTE:
                    // see if we get another quote character
                    // if we just ended a quote BUT the following character is the lastQuote character, then we have a situation like '...''...', so we restart the quote
                    if (cp == lastQuote) {
                        UTF16.append(buffer, cp);
                        quoteStatus = NORMAL_QUOTE;
                        continue main;
                    }
                    quoteStatus = NONE;
                    break; // fall through to NONE
                case START_QUOTE:
                    // if we are at the very start of a quote, and we hit another quote mark then we emit a literal quote character and end the quote
                    if (cp == lastQuote) {
                        UTF16.append(buffer, cp);
                        quoteStatus = NONE; // get out of quote, with no trace remaining
                        continue;
                    }
                    // otherwise get into quote
                    UTF16.append(buffer, cp);
                    quoteStatus = NORMAL_QUOTE;
                    continue main;
                case NORMAL_QUOTE:
                    if (cp == lastQuote) {
                        quoteStatus = AFTER_QUOTE; // get out of quote
                        continue main;
                    }
                    UTF16.append(buffer, cp);
                    continue main;
                }

                if (ignorableCharacters.contains(cp)) {
                    continue;
                }
                // do syntax characters
                if (syntaxCharacters.contains(cp)) {
                    if (status == UNKNOWN) {
                        UTF16.append(buffer, cp);
                        start = i + UTF16.getCharCount(cp);
                        return SYNTAX;
                    } else { // LITERAL, so back up and break
                        start = i;
                        return status;
                    }
                }
                // otherwise it is a literal; keep on going
                status = LITERAL;
                if (cp == BACK_SLASH) {
                    quoteStatus = SLASH_START;
                    continue;
                } else if (usingQuote && cp == SINGLE_QUOTE) {
                    lastQuote = cp;
                    quoteStatus = START_QUOTE;
                    continue;
                }
                // normal literals
                UTF16.append(buffer, cp);
            }
        // handle final cleanup
        start = limit;
        switch (quoteStatus) {
        case HEX:
            status = BROKEN_ESCAPE;
            break;
        case SLASH_START:
            if (usingSlash) {
                status = BROKEN_ESCAPE;
            } else {
                buffer.append(BACK_SLASH);
            }
            break;
        case START_QUOTE: case NORMAL_QUOTE:
            status = BROKEN_QUOTE;
            break;
        }
        return status;
    }


}
//eof