12d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert// © 2016 and later: Unicode, Inc. and others. 22d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert// License & terms of use: http://www.unicode.org/copyright.html#License 37935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/* 47935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ******************************************************************************* 57935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Copyright (C) 2006-2009, Google, International Business Machines Corporation * 67935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * and others. All Rights Reserved. * 77935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ******************************************************************************* 87935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 97935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertpackage com.ibm.icu.impl; 107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.text.UTF16; 127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.text.UnicodeSet; 137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/** 157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * A simple parsing class for patterns and rules. Handles '...' quotations, \\uxxxx and \\Uxxxxxxxx, and symple syntax. 167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The '' (two quotes) is treated as a single quote, inside or outside a quote 177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <ul> 187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <li>Any ignorable characters are ignored in parsing.</li> 197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <li>Any syntax characters are broken into separate tokens</li> 207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <li>Quote characters can be specified: '...', "...", and \x </li> 217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <li>Other characters are treated as literals</li> 227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * </ul> 237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertpublic class PatternTokenizer { 257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // settings used in the interpretation of the pattern 267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private UnicodeSet ignorableCharacters = new UnicodeSet(); 277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private UnicodeSet syntaxCharacters = new UnicodeSet(); 287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private UnicodeSet extraQuotingCharacters = new UnicodeSet(); 297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private UnicodeSet escapeCharacters = new UnicodeSet(); 307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private boolean usingSlash = false; 317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private boolean usingQuote = false; 327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // transient data, set when needed. Null it out for any changes in the above fields. 347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private transient UnicodeSet needingQuoteCharacters = null; 357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // data about the current pattern being parsed. start gets moved as we go along. 377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int start; 387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int limit; 397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private String pattern; 407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public UnicodeSet getIgnorableCharacters() { 427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return (UnicodeSet) ignorableCharacters.clone(); 437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Sets the characters to be ignored in parsing, eg new UnicodeSet("[:pattern_whitespace:]"); 467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param ignorableCharacters Characters to be ignored. 477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return A PatternTokenizer object in which characters are specified as ignored characters. 487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public PatternTokenizer setIgnorableCharacters(UnicodeSet ignorableCharacters) { 507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert this.ignorableCharacters = (UnicodeSet) ignorableCharacters.clone(); 517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert needingQuoteCharacters = null; 527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return this; 537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public UnicodeSet getSyntaxCharacters() { 557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return (UnicodeSet) syntaxCharacters.clone(); 567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public UnicodeSet getExtraQuotingCharacters() { 587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return (UnicodeSet) extraQuotingCharacters.clone(); 597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Sets the characters to be interpreted as syntax characters in parsing, eg new UnicodeSet("[:pattern_syntax:]") 627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param syntaxCharacters Characters to be set as syntax characters. 637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return A PatternTokenizer object in which characters are specified as syntax characters. 647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public PatternTokenizer setSyntaxCharacters(UnicodeSet syntaxCharacters) { 667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert this.syntaxCharacters = (UnicodeSet) syntaxCharacters.clone(); 677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert needingQuoteCharacters = null; 687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return this; 697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Sets the extra characters to be quoted in literals 727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param syntaxCharacters Characters to be set as extra quoting characters. 737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return A PatternTokenizer object in which characters are specified as extra quoting characters. 747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public PatternTokenizer setExtraQuotingCharacters(UnicodeSet syntaxCharacters) { 767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert this.extraQuotingCharacters = (UnicodeSet) syntaxCharacters.clone(); 777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert needingQuoteCharacters = null; 787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return this; 797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public UnicodeSet getEscapeCharacters() { 827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return (UnicodeSet) escapeCharacters.clone(); 837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Set characters to be escaped in literals, in quoteLiteral and normalize, eg new UnicodeSet("[^\\u0020-\\u007E]"); 867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param escapeCharacters Characters to be set as escape characters. 877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return A PatternTokenizer object in which characters are specified as escape characters. 887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public PatternTokenizer setEscapeCharacters(UnicodeSet escapeCharacters) { 907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert this.escapeCharacters = (UnicodeSet) escapeCharacters.clone(); 917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return this; 927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public boolean isUsingQuote() { 947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return usingQuote; 957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public PatternTokenizer setUsingQuote(boolean usingQuote) { 977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert this.usingQuote = usingQuote; 987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert needingQuoteCharacters = null; 997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return this; 1007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public boolean isUsingSlash() { 1027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return usingSlash; 1037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public PatternTokenizer setUsingSlash(boolean usingSlash) { 1057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert this.usingSlash = usingSlash; 1067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert needingQuoteCharacters = null; 1077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return this; 1087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // public UnicodeSet getQuoteCharacters() { 1107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert// return (UnicodeSet) quoteCharacters.clone(); 1117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert// } 1127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert// public PatternTokenizer setQuoteCharacters(UnicodeSet quoteCharacters) { 1137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert// this.quoteCharacters = (UnicodeSet) quoteCharacters.clone(); 1147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert// needingQuoteCharacters = null; 1157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert// return this; 1167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert// } 1177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int getLimit() { 1187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return limit; 1197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public PatternTokenizer setLimit(int limit) { 1217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert this.limit = limit; 1227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return this; 1237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int getStart() { 1257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return start; 1267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public PatternTokenizer setStart(int start) { 1287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert this.start = start; 1297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return this; 1307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public PatternTokenizer setPattern(CharSequence pattern) { 1337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return setPattern(pattern.toString()); 1347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public PatternTokenizer setPattern(String pattern) { 1377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (pattern == null) { 1387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new IllegalArgumentException("Inconsistent arguments"); 1397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert this.start = 0; 1417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert this.limit = pattern.length(); 1427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert this.pattern = pattern; 1437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return this; 1447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final char SINGLE_QUOTE = '\''; 1477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final char BACK_SLASH = '\\'; 1487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static int NO_QUOTE = -1, IN_QUOTE = -2; 1497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public String quoteLiteral(CharSequence string) { 1517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return quoteLiteral(string.toString()); 1527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 1557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Quote a literal string, using the available settings. Thus syntax characters, quote characters, and ignorable characters will be put into quotes. 1567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param string String passed to quote a literal string. 1577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return A string using the available settings will place syntax, quote, or ignorable characters into quotes. 1587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 1597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public String quoteLiteral(String string) { 1607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (needingQuoteCharacters == null) { 1617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert needingQuoteCharacters = new UnicodeSet().addAll(syntaxCharacters).addAll(ignorableCharacters).addAll(extraQuotingCharacters); // .addAll(quoteCharacters) 1627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (usingSlash) needingQuoteCharacters.add(BACK_SLASH); 1637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (usingQuote) needingQuoteCharacters.add(SINGLE_QUOTE); 1647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert StringBuffer result = new StringBuffer(); 1667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int quotedChar = NO_QUOTE; 1677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int cp; 1687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for (int i = 0; i < string.length(); i += UTF16.getCharCount(cp)) { 1697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert cp = UTF16.charAt(string, i); 1707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (escapeCharacters.contains(cp)) { 1717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // we may have to fix up previous characters 1727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (quotedChar == IN_QUOTE) { 1737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result.append(SINGLE_QUOTE); 1747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert quotedChar = NO_QUOTE; 1757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert appendEscaped(result, cp); 1777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue; 1787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (needingQuoteCharacters.contains(cp)) { 1817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // if we have already started a quote 1827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (quotedChar == IN_QUOTE) { 1837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert UTF16.append(result, cp); 1847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (usingQuote && cp == SINGLE_QUOTE) { // double it 1857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result.append(SINGLE_QUOTE); 1867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue; 1887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // otherwise not already in quote 1907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (usingSlash) { 1917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result.append(BACK_SLASH); 1927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert UTF16.append(result, cp); 1937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue; 1947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (usingQuote) { 1967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (cp == SINGLE_QUOTE) { // double it and continue 1977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result.append(SINGLE_QUOTE); 1987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result.append(SINGLE_QUOTE); 1997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue; 2007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result.append(SINGLE_QUOTE); 2027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert UTF16.append(result, cp); 2037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert quotedChar = IN_QUOTE; 2047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue; 2057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // we have no choice but to use \\u or \\U 2077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert appendEscaped(result, cp); 2087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue; 2097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // otherwise cp doesn't need quoting 2117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // we may have to fix up previous characters 2127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (quotedChar == IN_QUOTE) { 2137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result.append(SINGLE_QUOTE); 2147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert quotedChar = NO_QUOTE; 2157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert UTF16.append(result, cp); 2177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // all done. 2197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // we may have to fix up previous characters 2207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (quotedChar == IN_QUOTE) { 2217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result.append(SINGLE_QUOTE); 2227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return result.toString(); 2247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void appendEscaped(StringBuffer result, int cp) { 2277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (cp <= 0xFFFF) { 2287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result.append("\\u").append(Utility.hex(cp,4)); 2297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 2307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result.append("\\U").append(Utility.hex(cp,8)); 2317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public String normalize() { 2357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int oldStart = start; 2367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert StringBuffer result = new StringBuffer(); 2377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert StringBuffer buffer = new StringBuffer(); 2387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while (true) { 2397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert buffer.setLength(0); 2407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int status = next(buffer); 2417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (status == DONE) { 2427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert start = oldStart; 2437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return result.toString(); 2447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (status != SYNTAX) { 2467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result.append(quoteLiteral(buffer)); 2477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 2487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result.append(buffer); 2497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int DONE = 0, SYNTAX = 1, LITERAL = 2, BROKEN_QUOTE = 3, BROKEN_ESCAPE = 4, UNKNOWN = 5; 2547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final int AFTER_QUOTE = -1, NONE = 0, START_QUOTE = 1, NORMAL_QUOTE = 2, SLASH_START = 3, HEX = 4; 2567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int next(StringBuffer buffer) { 2587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (start >= limit) return DONE; 2597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int status = UNKNOWN; 2607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int lastQuote = UNKNOWN; 2617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int quoteStatus = NONE; 2627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int hexCount = 0; 2637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int hexValue = 0; 2647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int cp; 2657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert main: 2667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for (int i = start; i < limit; i += UTF16.getCharCount(cp)) { 2677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert cp = UTF16.charAt(pattern, i); 2687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // if we are in a quote, then handle it. 2697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert switch (quoteStatus) { 2707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case SLASH_START: 2717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert switch (cp) { 2727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case 'u': 2737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert quoteStatus = HEX; 2747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert hexCount = 4; 2757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert hexValue = 0; 2767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue main; 2777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case 'U': 2787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert quoteStatus = HEX; 2797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert hexCount = 8; 2807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert hexValue = 0; 2817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue main; 2827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert default: 2837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (usingSlash) { 2847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert UTF16.append(buffer, cp); 2857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert quoteStatus = NONE; 2867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue main; 2877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 2887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert buffer.append(BACK_SLASH); 2897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert quoteStatus = NONE; 2907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; // fall through to NONE 2937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case HEX: 2947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert hexValue <<= 4; 2957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert hexValue += cp; 2967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert switch (cp) { 2977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': 2987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert hexValue -= '0'; break; 2997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 3007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert hexValue -= 'a' - 10; break; 3017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 3027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert hexValue -= 'A' - 10; break; 3037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert default: 3047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert start = i; 3057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return BROKEN_ESCAPE; 3067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert --hexCount; 3087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (hexCount == 0) { 3097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert quoteStatus = NONE; 3107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert UTF16.append(buffer, hexValue); 3117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue main; 3137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case AFTER_QUOTE: 3147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // see if we get another quote character 3157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // if we just ended a quote BUT the following character is the lastQuote character, then we have a situation like '...''...', so we restart the quote 3167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (cp == lastQuote) { 3177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert UTF16.append(buffer, cp); 3187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert quoteStatus = NORMAL_QUOTE; 3197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue main; 3207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert quoteStatus = NONE; 3227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; // fall through to NONE 3237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case START_QUOTE: 3247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // if we are at the very start of a quote, and we hit another quote mark then we emit a literal quote character and end the quote 3257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (cp == lastQuote) { 3267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert UTF16.append(buffer, cp); 3277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert quoteStatus = NONE; // get out of quote, with no trace remaining 3287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue; 3297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // otherwise get into quote 3317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert UTF16.append(buffer, cp); 3327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert quoteStatus = NORMAL_QUOTE; 3337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue main; 3347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case NORMAL_QUOTE: 3357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (cp == lastQuote) { 3367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert quoteStatus = AFTER_QUOTE; // get out of quote 3377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue main; 3387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert UTF16.append(buffer, cp); 3407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue main; 3417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (ignorableCharacters.contains(cp)) { 3447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue; 3457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // do syntax characters 3477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (syntaxCharacters.contains(cp)) { 3487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (status == UNKNOWN) { 3497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert UTF16.append(buffer, cp); 3507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert start = i + UTF16.getCharCount(cp); 3517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return SYNTAX; 3527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { // LITERAL, so back up and break 3537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert start = i; 3547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return status; 3557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // otherwise it is a literal; keep on going 3587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert status = LITERAL; 3597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (cp == BACK_SLASH) { 3607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert quoteStatus = SLASH_START; 3617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue; 3627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if (usingQuote && cp == SINGLE_QUOTE) { 3637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert lastQuote = cp; 3647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert quoteStatus = START_QUOTE; 3657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue; 3667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // normal literals 3687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert UTF16.append(buffer, cp); 3697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // handle final cleanup 3717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert start = limit; 3727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert switch (quoteStatus) { 3737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case HEX: 3747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert status = BROKEN_ESCAPE; 3757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 3767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case SLASH_START: 3777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (usingSlash) { 3787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert status = BROKEN_ESCAPE; 3797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 3807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert buffer.append(BACK_SLASH); 3817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 3837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case START_QUOTE: case NORMAL_QUOTE: 3847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert status = BROKEN_QUOTE; 3857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 3867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return status; 3887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert} 3927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert//eof 393