1// © 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html#License 3/* 4********************************************************************** 5* Copyright (c) 2001-2011, International Business Machines 6* Corporation and others. All Rights Reserved. 7********************************************************************** 8* Date Name Description 9* 11/19/2001 aliu Creation. 10********************************************************************** 11*/ 12package com.ibm.icu.text; 13import com.ibm.icu.impl.Utility; 14 15/** 16 * A transliterator that converts Unicode characters to an escape 17 * form. Examples of escape forms are "U+4E01" and "". 18 * Escape forms have a prefix and suffix, either of which may be 19 * empty, a radix, typically 16 or 10, a minimum digit count, 20 * typically 1, 4, or 8, and a boolean that specifies whether 21 * supplemental characters are handled as 32-bit code points or as two 22 * 16-bit code units. Most escape forms handle 32-bit code points, 23 * but some, such as the Java form, intentionally break them into two 24 * surrogate pairs, for backward compatibility. 25 * 26 * <p>Some escape forms actually have two different patterns, one for 27 * BMP characters (0..FFFF) and one for supplements (>FFFF). To 28 * handle this, a second EscapeTransliterator may be defined that 29 * specifies the pattern to be produced for supplementals. An example 30 * of a form that requires this is the C form, which uses "\\uFFFF" 31 * for BMP characters and "\\U0010FFFF" for supplementals. 32 * 33 * <p>This class is package private. It registers several standard 34 * variants with the system which are then accessed via their IDs. 35 * 36 * @author Alan Liu 37 */ 38class EscapeTransliterator extends Transliterator { 39 40 /** 41 * The prefix of the escape form; may be empty, but usually isn't. 42 * May not be null. 43 */ 44 private String prefix; 45 46 /** 47 * The prefix of the escape form; often empty. May not be null. 48 */ 49 private String suffix; 50 51 /** 52 * The radix to display the number in. Typically 16 or 10. Must 53 * be in the range 2 to 36. 54 */ 55 private int radix; 56 57 /** 58 * The minimum number of digits. Typically 1, 4, or 8. Values 59 * less than 1 are equivalent to 1. 60 */ 61 private int minDigits; 62 63 /** 64 * If true, supplementals are handled as 32-bit code points. If 65 * false, they are handled as two 16-bit code units. 66 */ 67 private boolean grokSupplementals; 68 69 /** 70 * The form to be used for supplementals. If this is null then 71 * the same form is used for BMP characters and supplementals. If 72 * this is not null and if grokSupplementals is true then the 73 * prefix, suffix, radix, and minDigits of this object are used 74 * for supplementals. 75 */ 76 private EscapeTransliterator supplementalHandler; 77 78 /** 79 * Registers standard variants with the system. Called by 80 * Transliterator during initialization. 81 */ 82 static void register() { 83 // Unicode: "U+10FFFF" hex, min=4, max=6 84 Transliterator.registerFactory("Any-Hex/Unicode", new Transliterator.Factory() { 85 @Override 86 public Transliterator getInstance(String ID) { 87 return new EscapeTransliterator("Any-Hex/Unicode", 88 "U+", "", 16, 4, true, null); 89 } 90 }); 91 92 // Java: "\\uFFFF" hex, min=4, max=4 93 Transliterator.registerFactory("Any-Hex/Java", new Transliterator.Factory() { 94 @Override 95 public Transliterator getInstance(String ID) { 96 return new EscapeTransliterator("Any-Hex/Java", 97 "\\u", "", 16, 4, false, null); 98 } 99 }); 100 101 // C: "\\uFFFF" hex, min=4, max=4; \\U0010FFFF hex, min=8, max=8 102 Transliterator.registerFactory("Any-Hex/C", new Transliterator.Factory() { 103 @Override 104 public Transliterator getInstance(String ID) { 105 return new EscapeTransliterator("Any-Hex/C", 106 "\\u", "", 16, 4, true, 107 new EscapeTransliterator("", "\\U", "", 16, 8, true, null)); 108 } 109 }); 110 111 // XML: "" hex, min=1, max=6 112 Transliterator.registerFactory("Any-Hex/XML", new Transliterator.Factory() { 113 @Override 114 public Transliterator getInstance(String ID) { 115 return new EscapeTransliterator("Any-Hex/XML", 116 "&#x", ";", 16, 1, true, null); 117 } 118 }); 119 120 // XML10: "&1114111;" dec, min=1, max=7 (not really "Any-Hex") 121 Transliterator.registerFactory("Any-Hex/XML10", new Transliterator.Factory() { 122 @Override 123 public Transliterator getInstance(String ID) { 124 return new EscapeTransliterator("Any-Hex/XML10", 125 "&#", ";", 10, 1, true, null); 126 } 127 }); 128 129 // Perl: "\\x{263A}" hex, min=1, max=6 130 Transliterator.registerFactory("Any-Hex/Perl", new Transliterator.Factory() { 131 @Override 132 public Transliterator getInstance(String ID) { 133 return new EscapeTransliterator("Any-Hex/Perl", 134 "\\x{", "}", 16, 1, true, null); 135 } 136 }); 137 138 // Plain: "FFFF" hex, min=4, max=6 139 Transliterator.registerFactory("Any-Hex/Plain", new Transliterator.Factory() { 140 @Override 141 public Transliterator getInstance(String ID) { 142 return new EscapeTransliterator("Any-Hex/Plain", 143 "", "", 16, 4, true, null); 144 } 145 }); 146 147 // Generic 148 Transliterator.registerFactory("Any-Hex", new Transliterator.Factory() { 149 @Override 150 public Transliterator getInstance(String ID) { 151 return new EscapeTransliterator("Any-Hex", 152 "\\u", "", 16, 4, false, null); 153 } 154 }); 155 } 156 157 /** 158 * Constructs an escape transliterator with the given ID and 159 * parameters. See the class member documentation for details. 160 */ 161 EscapeTransliterator(String ID, String prefix, String suffix, 162 int radix, int minDigits, 163 boolean grokSupplementals, 164 EscapeTransliterator supplementalHandler) { 165 super(ID, null); 166 this.prefix = prefix; 167 this.suffix = suffix; 168 this.radix = radix; 169 this.minDigits = minDigits; 170 this.grokSupplementals = grokSupplementals; 171 this.supplementalHandler = supplementalHandler; 172 } 173 174 /** 175 * Implements {@link Transliterator#handleTransliterate}. 176 */ 177 @Override 178 protected void handleTransliterate(Replaceable text, 179 Position pos, boolean incremental) { 180 int start = pos.start; 181 int limit = pos.limit; 182 183 StringBuilder buf = new StringBuilder(prefix); 184 int prefixLen = prefix.length(); 185 boolean redoPrefix = false; 186 187 while (start < limit) { 188 int c = grokSupplementals ? text.char32At(start) : text.charAt(start); 189 int charLen = grokSupplementals ? UTF16.getCharCount(c) : 1; 190 191 if ((c & 0xFFFF0000) != 0 && supplementalHandler != null) { 192 buf.setLength(0); 193 buf.append(supplementalHandler.prefix); 194 Utility.appendNumber(buf, c, supplementalHandler.radix, 195 supplementalHandler.minDigits); 196 buf.append(supplementalHandler.suffix); 197 redoPrefix = true; 198 } else { 199 if (redoPrefix) { 200 buf.setLength(0); 201 buf.append(prefix); 202 redoPrefix = false; 203 } else { 204 buf.setLength(prefixLen); 205 } 206 Utility.appendNumber(buf, c, radix, minDigits); 207 buf.append(suffix); 208 } 209 210 text.replace(start, start + charLen, buf.toString()); 211 start += buf.length(); 212 limit += buf.length() - charLen; 213 } 214 215 pos.contextLimit += limit - pos.limit; 216 pos.limit = limit; 217 pos.start = start; 218 } 219 220 /* (non-Javadoc) 221 * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet) 222 */ 223 @Override 224 public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) { 225 sourceSet.addAll(getFilterAsUnicodeSet(inputFilter)); 226 for (EscapeTransliterator it = this; it != null ; it = it.supplementalHandler) { 227 if (inputFilter.size() != 0) { 228 targetSet.addAll(it.prefix); 229 targetSet.addAll(it.suffix); 230 StringBuilder buffer = new StringBuilder(); 231 for (int i = 0; i < it.radix; ++i) { 232 Utility.appendNumber(buffer, i, it.radix, it.minDigits); 233 } 234 targetSet.addAll(buffer.toString()); // TODO drop once String is changed to CharSequence in UnicodeSet 235 } 236 } 237 } 238} 239