1// © 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html#License 3/** 4******************************************************************************* 5* Copyright (C) 2005 - 2014, International Business Machines Corporation and * 6* others. All Rights Reserved. * 7******************************************************************************* 8*/ 9package com.ibm.icu.text; 10 11/** 12 * Charset recognizer for UTF-8 13 */ 14class CharsetRecog_UTF8 extends CharsetRecognizer { 15 16 @Override 17 String getName() { 18 return "UTF-8"; 19 } 20 21 /* (non-Javadoc) 22 * @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector) 23 */ 24 @Override 25 CharsetMatch match(CharsetDetector det) { 26 boolean hasBOM = false; 27 int numValid = 0; 28 int numInvalid = 0; 29 byte input[] = det.fRawInput; 30 int i; 31 int trailBytes = 0; 32 int confidence; 33 34 if (det.fRawLength >= 3 && 35 (input[0] & 0xFF) == 0xef && (input[1] & 0xFF) == 0xbb && (input[2] & 0xFF) == 0xbf) { 36 hasBOM = true; 37 } 38 39 // Scan for multi-byte sequences 40 for (i=0; i<det.fRawLength; i++) { 41 int b = input[i]; 42 if ((b & 0x80) == 0) { 43 continue; // ASCII 44 } 45 46 // Hi bit on char found. Figure out how long the sequence should be 47 if ((b & 0x0e0) == 0x0c0) { 48 trailBytes = 1; 49 } else if ((b & 0x0f0) == 0x0e0) { 50 trailBytes = 2; 51 } else if ((b & 0x0f8) == 0xf0) { 52 trailBytes = 3; 53 } else { 54 numInvalid++; 55 continue; 56 } 57 58 // Verify that we've got the right number of trail bytes in the sequence 59 for (;;) { 60 i++; 61 if (i>=det.fRawLength) { 62 break; 63 } 64 b = input[i]; 65 if ((b & 0xc0) != 0x080) { 66 numInvalid++; 67 break; 68 } 69 if (--trailBytes == 0) { 70 numValid++; 71 break; 72 } 73 } 74 } 75 76 // Cook up some sort of confidence score, based on presense of a BOM 77 // and the existence of valid and/or invalid multi-byte sequences. 78 confidence = 0; 79 if (hasBOM && numInvalid==0) { 80 confidence = 100; 81 } else if (hasBOM && numValid > numInvalid*10) { 82 confidence = 80; 83 } else if (numValid > 3 && numInvalid == 0) { 84 confidence = 100; 85 } else if (numValid > 0 && numInvalid == 0) { 86 confidence = 80; 87 } else if (numValid == 0 && numInvalid == 0) { 88 // Plain ASCII. Confidence must be > 10, it's more likely than UTF-16, which 89 // accepts ASCII with confidence = 10. 90 // TODO: add plain ASCII as an explicitly detected type. 91 confidence = 15; 92 } else if (numValid > numInvalid*10) { 93 // Probably corruput utf-8 data. Valid sequences aren't likely by chance. 94 confidence = 25; 95 } 96 return confidence == 0 ? null : new CharsetMatch(det, this, confidence); 97 } 98 99} 100