17935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/**
27935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert*******************************************************************************
37935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* Copyright (C) 2005 - 2014, International Business Machines Corporation and  *
47935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* others. All Rights Reserved.                                                *
57935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert*******************************************************************************
67935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert*/
77935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertpackage com.ibm.icu.text;
87935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
97935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/**
107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Charset recognizer for UTF-8
117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */
127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertclass CharsetRecog_UTF8 extends CharsetRecognizer {
137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    String getName() {
157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return "UTF-8";
167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /* (non-Javadoc)
197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector)
207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     */
217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    CharsetMatch match(CharsetDetector det) {
227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        boolean     hasBOM = false;
237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int         numValid = 0;
247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int         numInvalid = 0;
257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        byte        input[] = det.fRawInput;
267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int         i;
277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int         trailBytes = 0;
287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int         confidence;
297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if (det.fRawLength >= 3 &&
317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                (input[0] & 0xFF) == 0xef && (input[1] & 0xFF) == 0xbb && (input[2] & 0xFF) == 0xbf) {
327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            hasBOM = true;
337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Scan for multi-byte sequences
367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        for (i=0; i<det.fRawLength; i++) {
377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            int b = input[i];
387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            if ((b & 0x80) == 0) {
397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                continue;   // ASCII
407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // Hi bit on char found.  Figure out how long the sequence should be
437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            if ((b & 0x0e0) == 0x0c0) {
447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                trailBytes = 1;
457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            } else if ((b & 0x0f0) == 0x0e0) {
467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                trailBytes = 2;
477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            } else if ((b & 0x0f8) == 0xf0) {
487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                trailBytes = 3;
497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            } else {
507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                numInvalid++;
517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                continue;
527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // Verify that we've got the right number of trail bytes in the sequence
557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            for (;;) {
567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                i++;
577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if (i>=det.fRawLength) {
587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    break;
597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                }
607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                b = input[i];
617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if ((b & 0xc0) != 0x080) {
627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    numInvalid++;
637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    break;
647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                }
657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if (--trailBytes == 0) {
667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    numValid++;
677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    break;
687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                }
697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Cook up some sort of confidence score, based on presense of a BOM
737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        //    and the existence of valid and/or invalid multi-byte sequences.
747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        confidence = 0;
757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if (hasBOM && numInvalid==0) {
767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            confidence = 100;
777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        } else if (hasBOM && numValid > numInvalid*10) {
787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            confidence = 80;
797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        } else if (numValid > 3 && numInvalid == 0) {
807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            confidence = 100;
817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        } else if (numValid > 0 && numInvalid == 0) {
827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            confidence = 80;
837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        } else if (numValid == 0 && numInvalid == 0) {
847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // Plain ASCII. Confidence must be > 10, it's more likely than UTF-16, which
857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            //              accepts ASCII with confidence = 10.
867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // TODO: add plain ASCII as an explicitly detected type.
877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            confidence = 15;
887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        } else if (numValid > numInvalid*10) {
897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // Probably corruput utf-8 data.  Valid sequences aren't likely by chance.
907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            confidence = 25;
917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert}
96