CharsetDetector.java revision bfab1e7fec36dff93fb980c546ad64a565faf9fc
/* GENERATED SOURCE. DO NOT MODIFY. */
/**
*******************************************************************************
* Copyright (C) 2005-2016, International Business Machines Corporation and    *
* others. All Rights Reserved.                                                *
*******************************************************************************
*/
package android.icu.text;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;


/**
 * <code>CharsetDetector</code> provides a facility for detecting the
 * charset or encoding of character data in an unknown format.
 * The input data can either be from an input stream or an array of bytes.
 * The result of the detection operation is a list of possibly matching
 * charsets, or, for simple use, you can just ask for a Java Reader that
 * will work over the input data.
 * <p>
 * Character set detection is at best an imprecise operation.  The detection
 * process will attempt to identify the charset that best matches the characteristics
 * of the byte data, but the process is partly statistical in nature, and
 * the results can not be guaranteed to always be correct.
 * <p>
 * For best accuracy in charset detection, the input data should be primarily
 * in a single language, and a minimum of a few hundred bytes worth of plain text
 * in the language are needed.  The detection process will attempt to
 * ignore html or xml style markup that could otherwise obscure the content.
 * <p>
 * @hide Only a subset of ICU is exposed in Android
 */
public class CharsetDetector {

//   Question: Should we have getters corresponding to the setters for input text
//   and declared encoding?

//   A thought: If we were to create our own type of Java Reader, we could defer
//   figuring out an actual charset for data that starts out with too much English
//   only ASCII until the user actually read through to something that didn't look
//   like 7 bit English.  If nothing else ever appeared, we would never need to
//   actually choose the "real" charset.  All assuming that the application just
//   wants the data, and doesn't care about a char set name.

    /**
     *   Constructor
     */
    public CharsetDetector() {
    }

    /**
     * Set the declared encoding for charset detection.
     *  The declared encoding of an input text is an encoding obtained
     *  from an http header or xml declaration or similar source that
     *  can be provided as additional information to the charset detector.
     *  A match between a declared encoding and a possible detected encoding
     *  will raise the quality of that detected encoding by a small delta,
     *  and will also appear as a "reason" for the match.
     * <p>
     * A declared encoding that is incompatible with the input data being
     * analyzed will not be added to the list of possible encodings.
     *
     *  @param encoding The declared encoding
     *
     *  @return This CharsetDetector
     */
    public CharsetDetector setDeclaredEncoding(String encoding) {
        fDeclaredEncoding = encoding;
        return this;
    }

    /**
     * Set the input text (byte) data whose charset is to be detected.
     * <p>
     * The array is used directly, not copied. Any input stream supplied by an
     * earlier call to {@link #setText(InputStream)} is forgotten.
     *
     * @param in the input text of unknown encoding
     *
     * @return This CharsetDetector
     *
     * @throws NullPointerException if <code>in</code> is null; the detector's
     *         state is left unchanged in that case.
     */
    public CharsetDetector setText(byte [] in) {
        // Dereference first so that a null argument fails before any state is modified.
        int length = in.length;

        fRawInput  = in;
        fRawLength = length;

        // fInputStream is documented as being null whenever the input is a byte array.
        // Without this reset, a detector previously fed from a stream would keep
        // advertising that stale stream alongside the new byte data.
        fInputStream = null;

        return this;
    }

    private static final int kBufSize = 8000;

    /**
     * Set the input text (byte) data whose charset is to be detected.
     *  <p>
     *   The input stream that supplies the character data must have markSupported()
     *   == true; the charset detection process will read a small amount of data,
     *   then return the stream to its original position via
     *   the InputStream.reset() operation.  At most <code>kBufSize</code> (8000)
     *   bytes are read; fewer if the stream ends first.
     *
     * @param in the input text of unknown encoding
     *
     * @return This CharsetDetector
     *
     * @throws IOException if reading from, or resetting, the stream fails.
     */
    public CharsetDetector setText(InputStream in) throws IOException {
        fInputStream = in;
        fInputStream.mark(kBufSize);
        fRawInput = new byte[kBufSize];   // Always make a new buffer because the
                                          //   previous one may have come from the caller,
                                          //   in which case we can't touch it.
        fRawLength = 0;
        int remainingLength = kBufSize;
        while (remainingLength > 0) {
            // read() may give data in smallish chunks, esp. for remote sources.  Hence, this loop.
            int bytesRead = fInputStream.read(fRawInput, fRawLength, remainingLength);
            if (bytesRead <= 0) {
                break;   // end of stream (or nothing delivered): analyze what we have.
            }
            fRawLength += bytesRead;
            remainingLength -= bytesRead;
        }
        fInputStream.reset();

        return this;
    }


    /**
     * Return the charset that best matches the supplied input data.
     *
     * Note though, that because the detection
     * only looks at the start of the input data,
     * there is a possibility that the returned charset will fail to handle
     * the full set of input data.
     *
     * @return a CharsetMatch object representing the best matching charset, or
     *         <code>null</code> if there are no matches.
     */
    public CharsetMatch detect() {
//   TODO:  A better implementation would be to copy the detect loop from
//          detectAll(), and cut it short as soon as a match with a high confidence
//          is found.  This is something to be done later, after things are otherwise
//          working.
        CharsetMatch[] matches = detectAll();

        if (matches == null || matches.length == 0) {
            return null;
        }

        return matches[0];
    }

    /**
     *  Return an array of all charsets that appear to be plausible
     *  matches with the input data.  The array is ordered with the
     *  best quality match first.
     *  <p>
     *  Only recognizers that are currently enabled take part; see
     *  {@link #setDetectableCharset(String, boolean)}.
     *
     * @return An array of CharsetMatch objects representing possibly matching charsets.
     *         The array is empty, never <code>null</code>, when no recognizer
     *         reports a match.
     */
    public CharsetMatch[] detectAll() {
        List<CharsetMatch> matches = new ArrayList<CharsetMatch>();

        MungeInput();  // Strip html markup, collect byte stats.

        //  Iterate over all possible charsets, remember all that
        //    report a match (recognizers return null for "no match").
        for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) {
            if (isRecognizerActive(i)) {
                CharsetMatch m = ALL_CS_RECOGNIZERS.get(i).recognizer.match(this);
                if (m != null) {
                    matches.add(m);
                }
            }
        }

        // Sort ascending and then reverse (rather than sorting with a reversed
        // comparator) so that the relative order of equally ranked matches stays
        // exactly as it has always been.
        Collections.sort(matches);      // CharsetMatch compares on confidence
        Collections.reverse(matches);   //  Put best match first.
        return matches.toArray(new CharsetMatch[matches.size()]);
    }


    /**
     * Autodetect the charset of an inputStream, and return a Java Reader
     * to access the converted input data.
     * <p>
     * This is a convenience method that is equivalent to
     *   <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getReader();</code>
     * except that failures are reported by returning <code>null</code>.
     * <p>
     *   For the input stream that supplies the character data, markSupported()
     *   must be true; the charset detection will read a small amount of data,
     *   then return the stream to its original position via
     *   the InputStream.reset() operation.  The exact amount that will
     *    be read depends on the characteristics of the data itself.
     *
     * @param in The source of the byte data in the unknown charset.
     *
     * @param declaredEncoding  A declared encoding for the data, if available,
     *           or null or an empty string if none is available.
     *
     * @return a Reader over the input data, or <code>null</code> if no charset
     *         matches the data or an IOException occurs while reading the stream.
     */
    public Reader getReader(InputStream in, String declaredEncoding) {
        fDeclaredEncoding = declaredEncoding;

        try {
            setText(in);

            CharsetMatch match = detect();

            if (match == null) {
                return null;
            }

            return match.getReader();
        } catch (IOException e) {
            // Long-standing contract of this convenience method: I/O failures are
            // reported as "no result" rather than propagated.
            return null;
        }
    }

    /**
     * Autodetect the charset of a byte array, and return a String
     * containing the converted input data.
     * <p>
     * This is a convenience method that is equivalent to
     *   <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getString();</code>
     * except that failures are reported by returning <code>null</code>.
     *
     * @param in The source of the byte data in the unknown charset.
     *
     * @param declaredEncoding  A declared encoding for the data, if available,
     *           or null or an empty string if none is available.
     *
     * @return the converted text, or <code>null</code> if no charset matches the
     *         data or an IOException occurs during conversion.
     */
    public String getString(byte[] in, String declaredEncoding)
    {
        fDeclaredEncoding = declaredEncoding;

        try {
            setText(in);

            CharsetMatch match = detect();

            if (match == null) {
                return null;
            }

            return match.getString(-1);
        } catch (IOException e) {
            // Long-standing contract of this convenience method: I/O failures are
            // reported as "no result" rather than propagated.
            return null;
        }
    }


    /**
     * Get the names of all charsets supported by <code>CharsetDetector</code> class.
     * <p>
     * <b>Note:</b> Multiple different charset encodings in a same family may use
     * a single shared name in this implementation. For example, this method returns
     * an array including "ISO-8859-1" (ISO Latin 1), but not including "windows-1252"
     * (Windows Latin 1). However, actual detection result could be "windows-1252"
     * when the input data matches Latin 1 code points with any points only available
     * in "windows-1252".
     *
     * @return an array of the names of all charsets supported by
     * <code>CharsetDetector</code> class.
     */
    public static String[] getAllDetectableCharsets() {
        String[] allCharsetNames = new String[ALL_CS_RECOGNIZERS.size()];
        for (int i = 0; i < allCharsetNames.length; i++) {
            allCharsetNames[i] = ALL_CS_RECOGNIZERS.get(i).recognizer.getName();
        }
        return allCharsetNames;
    }

    /**
     * Test whether or not input filtering is enabled.
     *
     * @return <code>true</code> if input text will be filtered.
     *
     * @see #enableInputFilter
     */
    public boolean inputFilterEnabled()
    {
        return fStripTags;
    }

    /**
     * Enable filtering of input text. If filtering is enabled,
     * text within angle brackets ("&lt;" and "&gt;") will be removed
     * before detection.
     *
     * @param filter <code>true</code> to enable input text filtering.
     *
     * @return The previous setting.
     */
    public boolean enableInputFilter(boolean filter)
    {
        boolean previous = fStripTags;

        fStripTags = filter;

        return previous;
    }

    /*
     *  MungeInput - after getting a set of raw input data to be analyzed, preprocess
     *               it by removing what appears to be html markup, then gather the
     *               byte statistics the recognizers rely on.
     *
     *  Reads fRawInput / fRawLength / fStripTags; writes fInputBytes, fInputLen,
     *  fByteStats and fC1Bytes.
     */
    private void MungeInput() {
        int srci = 0;
        int dsti = 0;
        byte b;
        boolean inMarkup = false;
        int openTags = 0;
        int badTags = 0;

        //
        //  html / xml markup stripping.
        //     quick and dirty, not 100% accurate, but hopefully good enough, statistically.
        //     discard everything within < brackets >
        //     Count how many total '<' and illegal (nested) '<' occur, so we can make some
        //     guess as to whether the input was actually marked up at all.
        if (fStripTags) {
            for (srci = 0; srci < fRawLength && dsti < fInputBytes.length; srci++) {
                b = fRawInput[srci];
                if (b == (byte)'<') {
                    if (inMarkup) {
                        badTags++;
                    }
                    inMarkup = true;
                    openTags++;
                }

                if (! inMarkup) {
                    fInputBytes[dsti++] = b;
                }

                if (b == (byte)'>') {
                    inMarkup = false;
                }
            }

            fInputLen = dsti;
        }

        //
        //  If it looks like this input wasn't marked up, or if it looks like it's
        //    essentially nothing but markup abandon the markup stripping.
        //    Detection will have to work on the unstripped input.
        //    (When stripping is disabled openTags is 0, so this branch always runs.)
        //
        if (openTags<5 || openTags/5 < badTags ||
                (fInputLen < 100 && fRawLength>600)) {
            int limit = Math.min(fRawLength, kBufSize);

            // Guard the copy: with no input set, fRawInput is null and limit is 0,
            // and that case must remain a harmless no-op.
            if (limit > 0) {
                System.arraycopy(fRawInput, 0, fInputBytes, 0, limit);
            }
            fInputLen = limit;
        }

        //
        // Tally up the byte occurrence statistics.
        //   These are available for use by the various detectors.
        //
        Arrays.fill(fByteStats, (short)0);
        for (srci=0; srci<fInputLen; srci++) {
            int val = fInputBytes[srci] & 0x00ff;
            fByteStats[val]++;
        }

        fC1Bytes = false;
        for (int i = 0x80; i <= 0x9F; i += 1) {
            if (fByteStats[i] != 0) {
                fC1Bytes = true;
                break;
            }
        }
    }

    /*
     * Tells whether the recognizer at the given index of ALL_CS_RECOGNIZERS takes
     * part in detection: the per-instance override if one exists, otherwise the
     * recognizer's default setting.
     */
    private boolean isRecognizerActive(int index) {
        if (fEnabledRecognizers != null) {
            return fEnabledRecognizers[index];
        }
        return ALL_CS_RECOGNIZERS.get(index).isDefaultEnabled;
    }

    /*
     *  The following items are accessed by individual CharsetRecognizers during
     *     the recognition process
     *
     */
    byte[]      fInputBytes =       // The text to be checked.  Markup will have been
                   new byte[kBufSize];  //   removed if appropriate.

    int         fInputLen;          // Length of the byte data in fInputBytes.

    short       fByteStats[] =      // byte frequency statistics for the input text.
                   new short[256];  //   Value is the absolute number of occurrences of each
                                    //   byte value in fInputBytes (at most kBufSize, so it
                                    //   always fits in a short).  Zero means zero occurrences.

    boolean     fC1Bytes =          // True if any bytes in the range 0x80 - 0x9F are in the input;
                   false;

    String      fDeclaredEncoding;


    byte[]      fRawInput;          // Original, untouched input bytes.
                                    //  If user gave us a byte array, this is it.
                                    //  If user gave us a stream, it's read to a
                                    //  buffer here.
    int         fRawLength;         // Length of data in fRawInput array.

    InputStream fInputStream;       // User's input stream, or null if the user
                                    //   gave us a byte array.

    //
    //  Stuff private to CharsetDetector
    //
    private boolean      fStripTags =   // If true, detectAll() strips markup from the input
                   false;               //   text (see MungeInput()) before analysis.

    private boolean[]    fEnabledRecognizers;   // If not null, active set of charset recognizers had
                                                // been changed from the default. The array index is
                                                // corresponding to ALL_CS_RECOGNIZERS. See setDetectableCharset().

    /*
     * Pairs a recognizer with the flag saying whether it is enabled by default.
     */
    private static class CSRecognizerInfo {
        final CharsetRecognizer recognizer;
        final boolean isDefaultEnabled;

        CSRecognizerInfo(CharsetRecognizer recognizer, boolean isDefaultEnabled) {
            this.recognizer = recognizer;
            this.isDefaultEnabled = isDefaultEnabled;
        }
    }

    /*
     * List of recognizers for all charsets known to the implementation.
     */
    private static final List<CSRecognizerInfo> ALL_CS_RECOGNIZERS;

    static {
        List<CSRecognizerInfo> list = new ArrayList<CSRecognizerInfo>();

        list.add(new CSRecognizerInfo(new CharsetRecog_UTF8(), true));
        list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_16_BE(), true));
        list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_16_LE(), true));
        list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_32_BE(), true));
        list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_32_LE(), true));

        list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_sjis(), true));
        list.add(new CSRecognizerInfo(new CharsetRecog_2022.CharsetRecog_2022JP(), true));
        list.add(new CSRecognizerInfo(new CharsetRecog_2022.CharsetRecog_2022CN(), true));
        list.add(new CSRecognizerInfo(new CharsetRecog_2022.CharsetRecog_2022KR(), true));
        list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_gb_18030(), true));
        list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_jp(), true));
        list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_kr(), true));
        list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_big5(), true));

        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_1(), true));
        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_2(), true));
        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_5_ru(), true));
        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_6_ar(), true));
        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_7_el(), true));
        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_8_I_he(), true));
        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_8_he(), true));
        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_windows_1251(), true));
        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_windows_1256(), true));
        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_KOI8_R(), true));
        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_9_tr(), true));

        // IBM 420/424 recognizers are disabled by default
        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_rtl(), false));
        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_ltr(), false));
        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_rtl(), false));
        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_ltr(), false));

        ALL_CS_RECOGNIZERS = Collections.unmodifiableList(list);
    }

    /**
     * Get the names of charsets that can be recognized by this CharsetDetector instance.
     *
     * @return an array of the names of charsets that can be recognized by this CharsetDetector
     * instance.
     *
     * @deprecated This API is ICU internal only.
     * @hide draft / provisional / internal are hidden on Android
     */
    @Deprecated
    public String[] getDetectableCharsets() {
        List<String> csnames = new ArrayList<String>(ALL_CS_RECOGNIZERS.size());
        for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) {
            if (isRecognizerActive(i)) {
                csnames.add(ALL_CS_RECOGNIZERS.get(i).recognizer.getName());
            }
        }
        return csnames.toArray(new String[csnames.size()]);
    }

    /**
     * Enable or disable individual charset encoding.
     * A name of charset encoding must be included in the names returned by
     * {@link #getAllDetectableCharsets()}.
     *
     * @param encoding the name of charset encoding.
     * @param enabled <code>true</code> to enable, or <code>false</code> to disable the
     * charset encoding.
     * @return A reference to this <code>CharsetDetector</code>.
     * @throws IllegalArgumentException when the name of charset encoding is
     * not supported.
     *
     * @deprecated This API is ICU internal only.
     * @hide draft / provisional / internal are hidden on Android
     */
    @Deprecated
    public CharsetDetector setDetectableCharset(String encoding, boolean enabled) {
        int modIdx = -1;
        boolean isDefaultVal = false;
        // Only the first recognizer carrying this name is affected.
        for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) {
            CSRecognizerInfo csrinfo = ALL_CS_RECOGNIZERS.get(i);
            if (csrinfo.recognizer.getName().equals(encoding)) {
                modIdx = i;
                isDefaultVal = (csrinfo.isDefaultEnabled == enabled);
                break;
            }
        }
        if (modIdx < 0) {
            // No matching encoding found
            throw new IllegalArgumentException("Invalid encoding: " + "\"" + encoding + "\"");
        }

        // The override array is created lazily: while every request so far matches
        // the defaults there is nothing to record.
        if (fEnabledRecognizers == null && !isDefaultVal) {
            // Create an array storing the non default setting
            fEnabledRecognizers = new boolean[ALL_CS_RECOGNIZERS.size()];

            // Initialize the array with default info
            for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) {
                fEnabledRecognizers[i] = ALL_CS_RECOGNIZERS.get(i).isDefaultEnabled;
            }
        }

        if (fEnabledRecognizers != null) {
            fEnabledRecognizers[modIdx] = enabled;
        }

        return this;
    }
}