17935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/** 27935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert******************************************************************************* 37935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* Copyright (C) 2005 - 2014, International Business Machines Corporation and * 47935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* others. All Rights Reserved. * 57935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert******************************************************************************* 67935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert*/ 77935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertpackage com.ibm.icu.text; 87935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 97935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/** 107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Charset recognizer for UTF-8 117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertclass CharsetRecog_UTF8 extends CharsetRecognizer { 137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String getName() { 157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return "UTF-8"; 167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* (non-Javadoc) 197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector) 207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CharsetMatch match(CharsetDetector det) { 227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert boolean hasBOM = false; 237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int numValid = 0; 247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int numInvalid = 0; 257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert byte input[] = det.fRawInput; 267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int i; 277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int trailBytes = 0; 287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int confidence; 297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (det.fRawLength >= 3 && 317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert (input[0] & 0xFF) == 0xef && (input[1] & 0xFF) == 0xbb && (input[2] & 0xFF) == 0xbf) { 327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert hasBOM = true; 337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Scan for multi-byte sequences 367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for (i=0; i<det.fRawLength; i++) { 377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int b = input[i]; 387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if ((b & 0x80) == 0) { 397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue; // ASCII 407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Hi bit on char found. Figure out how long the sequence should be 437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if ((b & 0x0e0) == 0x0c0) { 447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert trailBytes = 1; 457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if ((b & 0x0f0) == 0x0e0) { 467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert trailBytes = 2; 477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if ((b & 0x0f8) == 0xf0) { 487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert trailBytes = 3; 497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert numInvalid++; 517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue; 527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Verify that we've got the right number of trail bytes in the sequence 557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for (;;) { 567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert i++; 577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (i>=det.fRawLength) { 587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert b = input[i]; 617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if ((b & 0xc0) != 0x080) { 627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert numInvalid++; 637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (--trailBytes == 0) { 667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert numValid++; 677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Cook up some sort of confidence score, based on presense of a BOM 737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // and the existence of valid and/or invalid multi-byte sequences. 747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert confidence = 0; 757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (hasBOM && numInvalid==0) { 767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert confidence = 100; 777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if (hasBOM && numValid > numInvalid*10) { 787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert confidence = 80; 797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if (numValid > 3 && numInvalid == 0) { 807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert confidence = 100; 817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if (numValid > 0 && numInvalid == 0) { 827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert confidence = 80; 837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if (numValid == 0 && numInvalid == 0) { 847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Plain ASCII. Confidence must be > 10, it's more likely than UTF-16, which 857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // accepts ASCII with confidence = 10. 867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // TODO: add plain ASCII as an explicitly detected type. 877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert confidence = 15; 887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if (numValid > numInvalid*10) { 897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Probably corruput utf-8 data. Valid sequences aren't likely by chance. 907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert confidence = 25; 917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return confidence == 0 ? null : new CharsetMatch(det, this, confidence); 937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert} 96