// CharsetDetector.java revision 1537b2f39245c07b00aa78c3600f7aebcb172490
1/* GENERATED SOURCE. DO NOT MODIFY. */ 2/** 3******************************************************************************* 4* Copyright (C) 2005-2014, International Business Machines Corporation and * 5* others. All Rights Reserved. * 6******************************************************************************* 7*/ 8package android.icu.text; 9 10import java.io.IOException; 11import java.io.InputStream; 12import java.io.Reader; 13import java.util.ArrayList; 14import java.util.Arrays; 15import java.util.Collections; 16import java.util.List; 17 18 19/** 20 * <code>CharsetDetector</code> provides a facility for detecting the 21 * charset or encoding of character data in an unknown format. 22 * The input data can either be from an input stream or an array of bytes. 23 * The result of the detection operation is a list of possibly matching 24 * charsets, or, for simple use, you can just ask for a Java Reader that 25 * will will work over the input data. 26 * <p/> 27 * Character set detection is at best an imprecise operation. The detection 28 * process will attempt to identify the charset that best matches the characteristics 29 * of the byte data, but the process is partly statistical in nature, and 30 * the results can not be guaranteed to always be correct. 31 * <p/> 32 * For best accuracy in charset detection, the input data should be primarily 33 * in a single language, and a minimum of a few hundred bytes worth of plain text 34 * in the language are needed. The detection process will attempt to 35 * ignore html or xml style markup that could otherwise obscure the content. 36 * <p/> 37 * @hide Only a subset of ICU is exposed in Android 38 * @hide All android.icu classes are currently hidden 39 */ 40public class CharsetDetector { 41 42// Question: Should we have getters corresponding to the setters for input text 43// and declared encoding? 
44 45// A thought: If we were to create our own type of Java Reader, we could defer 46// figuring out an actual charset for data that starts out with too much English 47// only ASCII until the user actually read through to something that didn't look 48// like 7 bit English. If nothing else ever appeared, we would never need to 49// actually choose the "real" charset. All assuming that the application just 50// wants the data, and doesn't care about a char set name. 51 52 /** 53 * Constructor 54 */ 55 public CharsetDetector() { 56 } 57 58 /** 59 * Set the declared encoding for charset detection. 60 * The declared encoding of an input text is an encoding obtained 61 * from an http header or xml declaration or similar source that 62 * can be provided as additional information to the charset detector. 63 * A match between a declared encoding and a possible detected encoding 64 * will raise the quality of that detected encoding by a small delta, 65 * and will also appear as a "reason" for the match. 66 * <p/> 67 * A declared encoding that is incompatible with the input data being 68 * analyzed will not be added to the list of possible encodings. 69 * 70 * @param encoding The declared encoding 71 */ 72 public CharsetDetector setDeclaredEncoding(String encoding) { 73 fDeclaredEncoding = encoding; 74 return this; 75 } 76 77 /** 78 * Set the input text (byte) data whose charset is to be detected. 79 * 80 * @param in the input text of unknown encoding 81 * 82 * @return This CharsetDetector 83 */ 84 public CharsetDetector setText(byte [] in) { 85 fRawInput = in; 86 fRawLength = in.length; 87 88 return this; 89 } 90 91 private static final int kBufSize = 8000; 92 93 /** 94 * Set the input text (byte) data whose charset is to be detected. 
95 * <p/> 96 * The input stream that supplies the character data must have markSupported() 97 * == true; the charset detection process will read a small amount of data, 98 * then return the stream to its original position via 99 * the InputStream.reset() operation. The exact amount that will 100 * be read depends on the characteristics of the data itself. 101 * 102 * @param in the input text of unknown encoding 103 * 104 * @return This CharsetDetector 105 */ 106 107 public CharsetDetector setText(InputStream in) throws IOException { 108 fInputStream = in; 109 fInputStream.mark(kBufSize); 110 fRawInput = new byte[kBufSize]; // Always make a new buffer because the 111 // previous one may have come from the caller, 112 // in which case we can't touch it. 113 fRawLength = 0; 114 int remainingLength = kBufSize; 115 while (remainingLength > 0 ) { 116 // read() may give data in smallish chunks, esp. for remote sources. Hence, this loop. 117 int bytesRead = fInputStream.read(fRawInput, fRawLength, remainingLength); 118 if (bytesRead <= 0) { 119 break; 120 } 121 fRawLength += bytesRead; 122 remainingLength -= bytesRead; 123 } 124 fInputStream.reset(); 125 126 return this; 127 } 128 129 130 /** 131 * Return the charset that best matches the supplied input data. 132 * 133 * Note though, that because the detection 134 * only looks at the start of the input data, 135 * there is a possibility that the returned charset will fail to handle 136 * the full set of input data. 137 * <p/> 138 * Raise an exception if 139 * <ul> 140 * <li>no charset appears to match the data.</li> 141 * <li>no input text has been provided</li> 142 * </ul> 143 * 144 * @return a CharsetMatch object representing the best matching charset, or 145 * <code>null</code> if there are no matches. 146 */ 147 public CharsetMatch detect() { 148// TODO: A better implementation would be to copy the detect loop from 149// detectAll(), and cut it short as soon as a match with a high confidence 150// is found. 
This is something to be done later, after things are otherwise 151// working. 152 CharsetMatch matches[] = detectAll(); 153 154 if (matches == null || matches.length == 0) { 155 return null; 156 } 157 158 return matches[0]; 159 } 160 161 /** 162 * Return an array of all charsets that appear to be plausible 163 * matches with the input data. The array is ordered with the 164 * best quality match first. 165 * <p/> 166 * Raise an exception if 167 * <ul> 168 * <li>no charsets appear to match the input data.</li> 169 * <li>no input text has been provided</li> 170 * </ul> 171 * 172 * @return An array of CharsetMatch objects representing possibly matching charsets. 173 */ 174 public CharsetMatch[] detectAll() { 175 ArrayList<CharsetMatch> matches = new ArrayList<CharsetMatch>(); 176 177 MungeInput(); // Strip html markup, collect byte stats. 178 179 // Iterate over all possible charsets, remember all that 180 // give a match quality > 0. 181 for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) { 182 CSRecognizerInfo rcinfo = ALL_CS_RECOGNIZERS.get(i); 183 boolean active = (fEnabledRecognizers != null) ? fEnabledRecognizers[i] : rcinfo.isDefaultEnabled; 184 if (active) { 185 CharsetMatch m = rcinfo.recognizer.match(this); 186 if (m != null) { 187 matches.add(m); 188 } 189 } 190 } 191 Collections.sort(matches); // CharsetMatch compares on confidence 192 Collections.reverse(matches); // Put best match first. 193 CharsetMatch [] resultArray = new CharsetMatch[matches.size()]; 194 resultArray = matches.toArray(resultArray); 195 return resultArray; 196 } 197 198 199 /** 200 * Autodetect the charset of an inputStream, and return a Java Reader 201 * to access the converted input data. 
202 * <p/> 203 * This is a convenience method that is equivalent to 204 * <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getReader();</code> 205 * <p/> 206 * For the input stream that supplies the character data, markSupported() 207 * must be true; the charset detection will read a small amount of data, 208 * then return the stream to its original position via 209 * the InputStream.reset() operation. The exact amount that will 210 * be read depends on the characteristics of the data itself. 211 *<p/> 212 * Raise an exception if no charsets appear to match the input data. 213 * 214 * @param in The source of the byte data in the unknown charset. 215 * 216 * @param declaredEncoding A declared encoding for the data, if available, 217 * or null or an empty string if none is available. 218 */ 219 public Reader getReader(InputStream in, String declaredEncoding) { 220 fDeclaredEncoding = declaredEncoding; 221 222 try { 223 setText(in); 224 225 CharsetMatch match = detect(); 226 227 if (match == null) { 228 return null; 229 } 230 231 return match.getReader(); 232 } catch (IOException e) { 233 return null; 234 } 235 } 236 237 /** 238 * Autodetect the charset of an inputStream, and return a String 239 * containing the converted input data. 240 * <p/> 241 * This is a convenience method that is equivalent to 242 * <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getString();</code> 243 *<p/> 244 * Raise an exception if no charsets appear to match the input data. 245 * 246 * @param in The source of the byte data in the unknown charset. 247 * 248 * @param declaredEncoding A declared encoding for the data, if available, 249 * or null or an empty string if none is available. 
250 */ 251 public String getString(byte[] in, String declaredEncoding) 252 { 253 fDeclaredEncoding = declaredEncoding; 254 255 try { 256 setText(in); 257 258 CharsetMatch match = detect(); 259 260 if (match == null) { 261 return null; 262 } 263 264 return match.getString(-1); 265 } catch (IOException e) { 266 return null; 267 } 268 } 269 270 271 /** 272 * Get the names of all charsets supported by <code>CharsetDetector</code> class. 273 * <p> 274 * <b>Note:</b> Multiple different charset encodings in a same family may use 275 * a single shared name in this implementation. For example, this method returns 276 * an array including "ISO-8859-1" (ISO Latin 1), but not including "windows-1252" 277 * (Windows Latin 1). However, actual detection result could be "windows-1252" 278 * when the input data matches Latin 1 code points with any points only available 279 * in "windows-1252". 280 * 281 * @return an array of the names of all charsets supported by 282 * <code>CharsetDetector</code> class. 283 */ 284 public static String[] getAllDetectableCharsets() { 285 String[] allCharsetNames = new String[ALL_CS_RECOGNIZERS.size()]; 286 for (int i = 0; i < allCharsetNames.length; i++) { 287 allCharsetNames[i] = ALL_CS_RECOGNIZERS.get(i).recognizer.getName(); 288 } 289 return allCharsetNames; 290 } 291 292 /** 293 * Test whether or not input filtering is enabled. 294 * 295 * @return <code>true</code> if input text will be filtered. 296 * 297 * @see #enableInputFilter 298 */ 299 public boolean inputFilterEnabled() 300 { 301 return fStripTags; 302 } 303 304 /** 305 * Enable filtering of input text. If filtering is enabled, 306 * text within angle brackets ("<" and ">") will be removed 307 * before detection. 308 * 309 * @param filter <code>true</code> to enable input text filtering. 310 * 311 * @return The previous setting. 
312 */ 313 public boolean enableInputFilter(boolean filter) 314 { 315 boolean previous = fStripTags; 316 317 fStripTags = filter; 318 319 return previous; 320 } 321 322 /* 323 * MungeInput - after getting a set of raw input data to be analyzed, preprocess 324 * it by removing what appears to be html markup. 325 */ 326 private void MungeInput() { 327 int srci = 0; 328 int dsti = 0; 329 byte b; 330 boolean inMarkup = false; 331 int openTags = 0; 332 int badTags = 0; 333 334 // 335 // html / xml markup stripping. 336 // quick and dirty, not 100% accurate, but hopefully good enough, statistically. 337 // discard everything within < brackets > 338 // Count how many total '<' and illegal (nested) '<' occur, so we can make some 339 // guess as to whether the input was actually marked up at all. 340 if (fStripTags) { 341 for (srci = 0; srci < fRawLength && dsti < fInputBytes.length; srci++) { 342 b = fRawInput[srci]; 343 if (b == (byte)'<') { 344 if (inMarkup) { 345 badTags++; 346 } 347 inMarkup = true; 348 openTags++; 349 } 350 351 if (! inMarkup) { 352 fInputBytes[dsti++] = b; 353 } 354 355 if (b == (byte)'>') { 356 inMarkup = false; 357 } 358 } 359 360 fInputLen = dsti; 361 } 362 363 // 364 // If it looks like this input wasn't marked up, or if it looks like it's 365 // essentially nothing but markup abandon the markup stripping. 366 // Detection will have to work on the unstripped input. 367 // 368 if (openTags<5 || openTags/5 < badTags || 369 (fInputLen < 100 && fRawLength>600)) { 370 int limit = fRawLength; 371 372 if (limit > kBufSize) { 373 limit = kBufSize; 374 } 375 376 for (srci=0; srci<limit; srci++) { 377 fInputBytes[srci] = fRawInput[srci]; 378 } 379 fInputLen = srci; 380 } 381 382 // 383 // Tally up the byte occurence statistics. 384 // These are available for use by the various detectors. 
385 // 386 Arrays.fill(fByteStats, (short)0); 387 for (srci=0; srci<fInputLen; srci++) { 388 int val = fInputBytes[srci] & 0x00ff; 389 fByteStats[val]++; 390 } 391 392 fC1Bytes = false; 393 for (int i = 0x80; i <= 0x9F; i += 1) { 394 if (fByteStats[i] != 0) { 395 fC1Bytes = true; 396 break; 397 } 398 } 399 } 400 401 /* 402 * The following items are accessed by individual CharsetRecongizers during 403 * the recognition process 404 * 405 */ 406 byte[] fInputBytes = // The text to be checked. Markup will have been 407 new byte[kBufSize]; // removed if appropriate. 408 409 int fInputLen; // Length of the byte data in fInputBytes. 410 411 short fByteStats[] = // byte frequency statistics for the input text. 412 new short[256]; // Value is percent, not absolute. 413 // Value is rounded up, so zero really means zero occurences. 414 415 boolean fC1Bytes = // True if any bytes in the range 0x80 - 0x9F are in the input; 416 false; 417 418 String fDeclaredEncoding; 419 420 421 byte[] fRawInput; // Original, untouched input bytes. 422 // If user gave us a byte array, this is it. 423 // If user gave us a stream, it's read to a 424 // buffer here. 425 int fRawLength; // Length of data in fRawInput array. 426 427 InputStream fInputStream; // User's input stream, or null if the user 428 // gave us a byte array. 429 430 // 431 // Stuff private to CharsetDetector 432 // 433 private boolean fStripTags = // If true, setText() will strip tags from input text. 434 false; 435 436 private boolean[] fEnabledRecognizers; // If not null, active set of charset recognizers had 437 // been changed from the default. The array index is 438 // corresponding to ALL_RECOGNIZER. See setDetectableCharset(). 
439 440 private static class CSRecognizerInfo { 441 CharsetRecognizer recognizer; 442 boolean isDefaultEnabled; 443 444 CSRecognizerInfo(CharsetRecognizer recognizer, boolean isDefaultEnabled) { 445 this.recognizer = recognizer; 446 this.isDefaultEnabled = isDefaultEnabled; 447 } 448 } 449 450 /* 451 * List of recognizers for all charsets known to the implementation. 452 */ 453 private static final List<CSRecognizerInfo> ALL_CS_RECOGNIZERS; 454 455 static { 456 List<CSRecognizerInfo> list = new ArrayList<CSRecognizerInfo>(); 457 458 list.add(new CSRecognizerInfo(new CharsetRecog_UTF8(), true)); 459 list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_16_BE(), true)); 460 list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_16_LE(), true)); 461 list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_32_BE(), true)); 462 list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_32_LE(), true)); 463 464 list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_sjis(), true)); 465 list.add(new CSRecognizerInfo(new CharsetRecog_2022.CharsetRecog_2022JP(), true)); 466 list.add(new CSRecognizerInfo(new CharsetRecog_2022.CharsetRecog_2022CN(), true)); 467 list.add(new CSRecognizerInfo(new CharsetRecog_2022.CharsetRecog_2022KR(), true)); 468 list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_gb_18030(), true)); 469 list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_jp(), true)); 470 list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_kr(), true)); 471 list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_big5(), true)); 472 473 list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_1(), true)); 474 list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_2(), true)); 475 list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_5_ru(), true)); 476 
list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_6_ar(), true)); 477 list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_7_el(), true)); 478 list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_8_I_he(), true)); 479 list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_8_he(), true)); 480 list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_windows_1251(), true)); 481 list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_windows_1256(), true)); 482 list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_KOI8_R(), true)); 483 list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_9_tr(), true)); 484 485 // IBM 420/424 recognizers are disabled by default 486 list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_rtl(), false)); 487 list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_ltr(), false)); 488 list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_rtl(), false)); 489 list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_ltr(), false)); 490 491 ALL_CS_RECOGNIZERS = Collections.unmodifiableList(list); 492 } 493 494 /** 495 * Get the names of charsets that can be recognized by this CharsetDetector instance. 496 * 497 * @return an array of the names of charsets that can be recognized by this CharsetDetector 498 * instance. 499 * 500 * @deprecated This API is ICU internal only. 501 * @hide draft / provisional / internal are hidden on Android 502 */ 503 @Deprecated 504 public String[] getDetectableCharsets() { 505 List<String> csnames = new ArrayList<String>(ALL_CS_RECOGNIZERS.size()); 506 for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) { 507 CSRecognizerInfo rcinfo = ALL_CS_RECOGNIZERS.get(i); 508 boolean active = (fEnabledRecognizers == null) ? 
rcinfo.isDefaultEnabled : fEnabledRecognizers[i]; 509 if (active) { 510 csnames.add(rcinfo.recognizer.getName()); 511 } 512 } 513 return csnames.toArray(new String[csnames.size()]); 514 } 515 516 /** 517 * Enable or disable individual charset encoding. 518 * A name of charset encoding must be included in the names returned by 519 * {@link #getAllDetectableCharsets()}. 520 * 521 * @param encoding the name of charset encoding. 522 * @param enabled <code>true</code> to enable, or <code>false</code> to disable the 523 * charset encoding. 524 * @return A reference to this <code>CharsetDetector</code>. 525 * @throws IllegalArgumentException when the name of charset encoding is 526 * not supported. 527 * 528 * @deprecated This API is ICU internal only. 529 * @hide draft / provisional / internal are hidden on Android 530 */ 531 @Deprecated 532 public CharsetDetector setDetectableCharset(String encoding, boolean enabled) { 533 int modIdx = -1; 534 boolean isDefaultVal = false; 535 for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) { 536 CSRecognizerInfo csrinfo = ALL_CS_RECOGNIZERS.get(i); 537 if (csrinfo.recognizer.getName().equals(encoding)) { 538 modIdx = i; 539 isDefaultVal = (csrinfo.isDefaultEnabled == enabled); 540 break; 541 } 542 } 543 if (modIdx < 0) { 544 // No matching encoding found 545 throw new IllegalArgumentException("Invalid encoding: " + "\"" + encoding + "\""); 546 } 547 548 if (fEnabledRecognizers == null && !isDefaultVal) { 549 // Create an array storing the non default setting 550 fEnabledRecognizers = new boolean[ALL_CS_RECOGNIZERS.size()]; 551 552 // Initialize the array with default info 553 for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) { 554 fEnabledRecognizers[i] = ALL_CS_RECOGNIZERS.get(i).isDefaultEnabled; 555 } 556 } 557 558 if (fEnabledRecognizers != null) { 559 fEnabledRecognizers[modIdx] = enabled; 560 } 561 562 return this; 563 } 564} 565