1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html#License
3/**
4*******************************************************************************
5* Copyright (C) 2005 - 2014, International Business Machines Corporation and  *
6* others. All Rights Reserved.                                                *
7*******************************************************************************
8*/
9package com.ibm.icu.text;
10
11/**
12 * Charset recognizer for UTF-8
13 */
14class CharsetRecog_UTF8 extends CharsetRecognizer {
15
16    @Override
17    String getName() {
18        return "UTF-8";
19    }
20
21    /* (non-Javadoc)
22     * @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector)
23     */
24    @Override
25    CharsetMatch match(CharsetDetector det) {
26        boolean     hasBOM = false;
27        int         numValid = 0;
28        int         numInvalid = 0;
29        byte        input[] = det.fRawInput;
30        int         i;
31        int         trailBytes = 0;
32        int         confidence;
33
34        if (det.fRawLength >= 3 &&
35                (input[0] & 0xFF) == 0xef && (input[1] & 0xFF) == 0xbb && (input[2] & 0xFF) == 0xbf) {
36            hasBOM = true;
37        }
38
39        // Scan for multi-byte sequences
40        for (i=0; i<det.fRawLength; i++) {
41            int b = input[i];
42            if ((b & 0x80) == 0) {
43                continue;   // ASCII
44            }
45
46            // Hi bit on char found.  Figure out how long the sequence should be
47            if ((b & 0x0e0) == 0x0c0) {
48                trailBytes = 1;
49            } else if ((b & 0x0f0) == 0x0e0) {
50                trailBytes = 2;
51            } else if ((b & 0x0f8) == 0xf0) {
52                trailBytes = 3;
53            } else {
54                numInvalid++;
55                continue;
56            }
57
58            // Verify that we've got the right number of trail bytes in the sequence
59            for (;;) {
60                i++;
61                if (i>=det.fRawLength) {
62                    break;
63                }
64                b = input[i];
65                if ((b & 0xc0) != 0x080) {
66                    numInvalid++;
67                    break;
68                }
69                if (--trailBytes == 0) {
70                    numValid++;
71                    break;
72                }
73            }
74        }
75
76        // Cook up some sort of confidence score, based on presense of a BOM
77        //    and the existence of valid and/or invalid multi-byte sequences.
78        confidence = 0;
79        if (hasBOM && numInvalid==0) {
80            confidence = 100;
81        } else if (hasBOM && numValid > numInvalid*10) {
82            confidence = 80;
83        } else if (numValid > 3 && numInvalid == 0) {
84            confidence = 100;
85        } else if (numValid > 0 && numInvalid == 0) {
86            confidence = 80;
87        } else if (numValid == 0 && numInvalid == 0) {
88            // Plain ASCII. Confidence must be > 10, it's more likely than UTF-16, which
89            //              accepts ASCII with confidence = 10.
90            // TODO: add plain ASCII as an explicitly detected type.
91            confidence = 15;
92        } else if (numValid > numInvalid*10) {
93            // Probably corruput utf-8 data.  Valid sequences aren't likely by chance.
94            confidence = 25;
95        }
96        return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
97    }
98
99}
100