17935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/* 27935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ******************************************************************************* 37935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Copyright (C) 2006-2009, Google, International Business Machines Corporation * 47935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * and others. All Rights Reserved. * 57935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ******************************************************************************* 67935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 77935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertpackage com.ibm.icu.impl; 87935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 97935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.text.UTF16; 107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.text.UnicodeSet; 117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/** 137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * A simple parsing class for patterns and rules. Handles '...' quotations, \\uxxxx and \\Uxxxxxxxx, and symple syntax. 147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The '' (two quotes) is treated as a single quote, inside or outside a quote 157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <ul> 167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <li>Any ignorable characters are ignored in parsing.</li> 177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <li>Any syntax characters are broken into separate tokens</li> 187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <li>Quote characters can be specified: '...', "...", and \x </li> 197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <li>Other characters are treated as literals</li> 207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * </ul> 217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertpublic class PatternTokenizer { 237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // settings used in the interpretation of the pattern 247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private UnicodeSet ignorableCharacters = new UnicodeSet(); 257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private UnicodeSet syntaxCharacters = new UnicodeSet(); 267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private UnicodeSet extraQuotingCharacters = new UnicodeSet(); 277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private UnicodeSet escapeCharacters = new UnicodeSet(); 287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private boolean usingSlash = false; 297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private boolean usingQuote = false; 307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // transient data, set when needed. Null it out for any changes in the above fields. 327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private transient UnicodeSet needingQuoteCharacters = null; 337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // data about the current pattern being parsed. start gets moved as we go along. 357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int start; 367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int limit; 377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private String pattern; 387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public UnicodeSet getIgnorableCharacters() { 407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return (UnicodeSet) ignorableCharacters.clone(); 417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Sets the characters to be ignored in parsing, eg new UnicodeSet("[:pattern_whitespace:]"); 447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param ignorableCharacters Characters to be ignored. 457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return A PatternTokenizer object in which characters are specified as ignored characters. 467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public PatternTokenizer setIgnorableCharacters(UnicodeSet ignorableCharacters) { 487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert this.ignorableCharacters = (UnicodeSet) ignorableCharacters.clone(); 497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert needingQuoteCharacters = null; 507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return this; 517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public UnicodeSet getSyntaxCharacters() { 537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return (UnicodeSet) syntaxCharacters.clone(); 547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public UnicodeSet getExtraQuotingCharacters() { 567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return (UnicodeSet) extraQuotingCharacters.clone(); 577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Sets the characters to be interpreted as syntax characters in parsing, eg new UnicodeSet("[:pattern_syntax:]") 607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param syntaxCharacters Characters to be set as syntax characters. 617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return A PatternTokenizer object in which characters are specified as syntax characters. 627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public PatternTokenizer setSyntaxCharacters(UnicodeSet syntaxCharacters) { 647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert this.syntaxCharacters = (UnicodeSet) syntaxCharacters.clone(); 657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert needingQuoteCharacters = null; 667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return this; 677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Sets the extra characters to be quoted in literals 707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param syntaxCharacters Characters to be set as extra quoting characters. 717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return A PatternTokenizer object in which characters are specified as extra quoting characters. 727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public PatternTokenizer setExtraQuotingCharacters(UnicodeSet syntaxCharacters) { 747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert this.extraQuotingCharacters = (UnicodeSet) syntaxCharacters.clone(); 757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert needingQuoteCharacters = null; 767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return this; 777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public UnicodeSet getEscapeCharacters() { 807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return (UnicodeSet) escapeCharacters.clone(); 817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Set characters to be escaped in literals, in quoteLiteral and normalize, eg new UnicodeSet("[^\\u0020-\\u007E]"); 847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param escapeCharacters Characters to be set as escape characters. 857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return A PatternTokenizer object in which characters are specified as escape characters. 867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public PatternTokenizer setEscapeCharacters(UnicodeSet escapeCharacters) { 887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert this.escapeCharacters = (UnicodeSet) escapeCharacters.clone(); 897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return this; 907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public boolean isUsingQuote() { 927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return usingQuote; 937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public PatternTokenizer setUsingQuote(boolean usingQuote) { 957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert this.usingQuote = usingQuote; 967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert needingQuoteCharacters = null; 977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return this; 987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public boolean isUsingSlash() { 1007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return usingSlash; 1017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public PatternTokenizer setUsingSlash(boolean usingSlash) { 1037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert this.usingSlash = usingSlash; 1047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert needingQuoteCharacters = null; 1057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return this; 1067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // public UnicodeSet getQuoteCharacters() { 1087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert// return (UnicodeSet) quoteCharacters.clone(); 1097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert// } 1107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert// public PatternTokenizer setQuoteCharacters(UnicodeSet quoteCharacters) { 1117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert// this.quoteCharacters = (UnicodeSet) quoteCharacters.clone(); 1127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert// needingQuoteCharacters = null; 1137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert// return this; 1147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert// } 1157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int getLimit() { 1167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return limit; 1177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public PatternTokenizer setLimit(int limit) { 1197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert this.limit = limit; 1207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return this; 1217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int getStart() { 1237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return start; 1247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public PatternTokenizer setStart(int start) { 1267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert this.start = start; 1277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return this; 1287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public PatternTokenizer setPattern(CharSequence pattern) { 1317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return setPattern(pattern.toString()); 1327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public PatternTokenizer setPattern(String pattern) { 1357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (pattern == null) { 1367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new IllegalArgumentException("Inconsistent arguments"); 1377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert this.start = 0; 1397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert this.limit = pattern.length(); 1407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert this.pattern = pattern; 1417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return this; 1427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final char SINGLE_QUOTE = '\''; 1457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final char BACK_SLASH = '\\'; 1467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static int NO_QUOTE = -1, IN_QUOTE = -2; 1477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public String quoteLiteral(CharSequence string) { 1497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return quoteLiteral(string.toString()); 1507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 1537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Quote a literal string, using the available settings. Thus syntax characters, quote characters, and ignorable characters will be put into quotes. 1547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param string String passed to quote a literal string. 1557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return A string using the available settings will place syntax, quote, or ignorable characters into quotes. 1567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 1577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public String quoteLiteral(String string) { 1587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (needingQuoteCharacters == null) { 1597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert needingQuoteCharacters = new UnicodeSet().addAll(syntaxCharacters).addAll(ignorableCharacters).addAll(extraQuotingCharacters); // .addAll(quoteCharacters) 1607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (usingSlash) needingQuoteCharacters.add(BACK_SLASH); 1617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (usingQuote) needingQuoteCharacters.add(SINGLE_QUOTE); 1627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert StringBuffer result = new StringBuffer(); 1647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int quotedChar = NO_QUOTE; 1657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int cp; 1667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for (int i = 0; i < string.length(); i += UTF16.getCharCount(cp)) { 1677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert cp = UTF16.charAt(string, i); 1687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (escapeCharacters.contains(cp)) { 1697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // we may have to fix up previous characters 1707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (quotedChar == IN_QUOTE) { 1717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result.append(SINGLE_QUOTE); 1727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert quotedChar = NO_QUOTE; 1737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert appendEscaped(result, cp); 1757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue; 1767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (needingQuoteCharacters.contains(cp)) { 1797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // if we have already started a quote 1807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (quotedChar == IN_QUOTE) { 1817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert UTF16.append(result, cp); 1827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (usingQuote && cp == SINGLE_QUOTE) { // double it 1837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result.append(SINGLE_QUOTE); 1847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue; 1867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // otherwise not already in quote 1887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (usingSlash) { 1897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result.append(BACK_SLASH); 1907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert UTF16.append(result, cp); 1917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue; 1927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (usingQuote) { 1947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (cp == SINGLE_QUOTE) { // double it and continue 1957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result.append(SINGLE_QUOTE); 1967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result.append(SINGLE_QUOTE); 1977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue; 1987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result.append(SINGLE_QUOTE); 2007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert UTF16.append(result, cp); 2017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert quotedChar = IN_QUOTE; 2027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue; 2037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // we have no choice but to use \\u or \\U 2057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert appendEscaped(result, cp); 2067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue; 2077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // otherwise cp doesn't need quoting 2097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // we may have to fix up previous characters 2107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (quotedChar == IN_QUOTE) { 2117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result.append(SINGLE_QUOTE); 2127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert quotedChar = NO_QUOTE; 2137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert UTF16.append(result, cp); 2157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // all done. 2177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // we may have to fix up previous characters 2187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (quotedChar == IN_QUOTE) { 2197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result.append(SINGLE_QUOTE); 2207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return result.toString(); 2227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void appendEscaped(StringBuffer result, int cp) { 2257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (cp <= 0xFFFF) { 2267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result.append("\\u").append(Utility.hex(cp,4)); 2277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 2287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result.append("\\U").append(Utility.hex(cp,8)); 2297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public String normalize() { 2337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int oldStart = start; 2347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert StringBuffer result = new StringBuffer(); 2357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert StringBuffer buffer = new StringBuffer(); 2367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while (true) { 2377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert buffer.setLength(0); 2387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int status = next(buffer); 2397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (status == DONE) { 2407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert start = oldStart; 2417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return result.toString(); 2427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (status != SYNTAX) { 2447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result.append(quoteLiteral(buffer)); 2457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 2467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result.append(buffer); 2477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int DONE = 0, SYNTAX = 1, LITERAL = 2, BROKEN_QUOTE = 3, BROKEN_ESCAPE = 4, UNKNOWN = 5; 2527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final int AFTER_QUOTE = -1, NONE = 0, START_QUOTE = 1, NORMAL_QUOTE = 2, SLASH_START = 3, HEX = 4; 2547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int next(StringBuffer buffer) { 2567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (start >= limit) return DONE; 2577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int status = UNKNOWN; 2587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int lastQuote = UNKNOWN; 2597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int quoteStatus = NONE; 2607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int hexCount = 0; 2617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int hexValue = 0; 2627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int cp; 2637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert main: 2647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for (int i = start; i < limit; i += UTF16.getCharCount(cp)) { 2657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert cp = UTF16.charAt(pattern, i); 2667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // if we are in a quote, then handle it. 2677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert switch (quoteStatus) { 2687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case SLASH_START: 2697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert switch (cp) { 2707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case 'u': 2717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert quoteStatus = HEX; 2727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert hexCount = 4; 2737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert hexValue = 0; 2747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue main; 2757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case 'U': 2767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert quoteStatus = HEX; 2777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert hexCount = 8; 2787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert hexValue = 0; 2797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue main; 2807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert default: 2817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (usingSlash) { 2827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert UTF16.append(buffer, cp); 2837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert quoteStatus = NONE; 2847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue main; 2857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 2867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert buffer.append(BACK_SLASH); 2877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert quoteStatus = NONE; 2887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; // fall through to NONE 2917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case HEX: 2927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert hexValue <<= 4; 2937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert hexValue += cp; 2947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert switch (cp) { 2957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': 2967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert hexValue -= '0'; break; 2977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 2987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert hexValue -= 'a' - 10; break; 2997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 3007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert hexValue -= 'A' - 10; break; 3017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert default: 3027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert start = i; 3037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return BROKEN_ESCAPE; 3047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert --hexCount; 3067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (hexCount == 0) { 3077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert quoteStatus = NONE; 3087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert UTF16.append(buffer, hexValue); 3097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue main; 3117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case AFTER_QUOTE: 3127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // see if we get another quote character 3137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // if we just ended a quote BUT the following character is the lastQuote character, then we have a situation like '...''...', so we restart the quote 3147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (cp == lastQuote) { 3157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert UTF16.append(buffer, cp); 3167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert quoteStatus = NORMAL_QUOTE; 3177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue main; 3187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert quoteStatus = NONE; 3207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; // fall through to NONE 3217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case START_QUOTE: 3227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // if we are at the very start of a quote, and we hit another quote mark then we emit a literal quote character and end the quote 3237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (cp == lastQuote) { 3247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert UTF16.append(buffer, cp); 3257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert quoteStatus = NONE; // get out of quote, with no trace remaining 3267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue; 3277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // otherwise get into quote 3297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert UTF16.append(buffer, cp); 3307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert quoteStatus = NORMAL_QUOTE; 3317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue main; 3327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case NORMAL_QUOTE: 3337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (cp == lastQuote) { 3347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert quoteStatus = AFTER_QUOTE; // get out of quote 3357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue main; 3367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert UTF16.append(buffer, cp); 3387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue main; 3397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (ignorableCharacters.contains(cp)) { 3427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue; 3437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // do syntax characters 3457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (syntaxCharacters.contains(cp)) { 3467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (status == UNKNOWN) { 3477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert UTF16.append(buffer, cp); 3487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert start = i + UTF16.getCharCount(cp); 3497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return SYNTAX; 3507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { // LITERAL, so back up and break 3517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert start = i; 3527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return status; 3537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // otherwise it is a literal; keep on going 3567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert status = LITERAL; 3577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (cp == BACK_SLASH) { 3587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert quoteStatus = SLASH_START; 3597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue; 3607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if (usingQuote && cp == SINGLE_QUOTE) { 3617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert lastQuote = cp; 3627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert quoteStatus = START_QUOTE; 3637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue; 3647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // normal literals 3667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert UTF16.append(buffer, cp); 3677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // handle final cleanup 3697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert start = limit; 3707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert switch (quoteStatus) { 3717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case HEX: 3727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert status = BROKEN_ESCAPE; 3737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 3747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case SLASH_START: 3757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (usingSlash) { 3767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert status = BROKEN_ESCAPE; 3777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 3787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert buffer.append(BACK_SLASH); 3797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 3817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case START_QUOTE: case NORMAL_QUOTE: 3827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert status = BROKEN_QUOTE; 3837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 3847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return status; 3867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert} 3907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert//eof 391