1/* GENERATED SOURCE. DO NOT MODIFY. */
2// © 2016 and later: Unicode, Inc. and others.
3// License & terms of use: http://www.unicode.org/copyright.html#License
4/*
5 *******************************************************************************
6 * Copyright (C) 1996-2013, International Business Machines Corporation and    *
7 * others. All Rights Reserved.                                                *
8 *******************************************************************************
9 *
10 */
11
12package android.icu.text;
13
14/**
15 * This class matches UTF-16 and UTF-32, both big- and little-endian. The
16 * BOM will be used if it is present.
17 */
18abstract class CharsetRecog_Unicode extends CharsetRecognizer {
19
    /**
     * Returns the name of the charset this recognizer identifies. The concrete
     * recognizers declared in this file return "UTF-16BE", "UTF-16LE",
     * "UTF-32BE" and "UTF-32LE".
     *
     * @return the charset name
     * @see android.icu.text.CharsetRecognizer#getName()
     */
    @Override
    abstract String getName();
25
    /**
     * Examines the detector's raw input and reports whether it looks like the
     * encoding handled by this recognizer.
     *
     * @param det the detector whose raw input bytes are inspected
     * @return in the implementations in this file, a CharsetMatch carrying a
     *         confidence greater than zero (at most 100), or null when the
     *         computed confidence is zero
     * @see android.icu.text.CharsetRecognizer#match(android.icu.text.CharsetDetector)
     */
    @Override
    abstract CharsetMatch match(CharsetDetector det);
31
32    static int codeUnit16FromBytes(byte hi, byte lo) {
33        return ((hi & 0xff) << 8) | (lo & 0xff);
34    }
35
36    // UTF-16 confidence calculation. Very simple minded, but better than nothing.
37    //   Any 8 bit non-control characters bump the confidence up. These have a zero high byte,
38    //     and are very likely to be UTF-16, although they could also be part of a UTF-32 code.
39    //   NULs are a contra-indication, they will appear commonly if the actual encoding is UTF-32.
40    //   NULs should be rare in actual text.
41    static int adjustConfidence(int codeUnit, int confidence) {
42        if (codeUnit == 0) {
43            confidence -= 10;
44        } else if ((codeUnit >= 0x20 && codeUnit <= 0xff) || codeUnit == 0x0a) {
45            confidence += 10;
46        }
47        if (confidence < 0) {
48            confidence = 0;
49        } else if (confidence > 100) {
50            confidence = 100;
51        }
52        return confidence;
53    }
54
55    static class CharsetRecog_UTF_16_BE extends CharsetRecog_Unicode
56    {
57        @Override
58        String getName()
59        {
60            return "UTF-16BE";
61        }
62
63        @Override
64        CharsetMatch match(CharsetDetector det)
65        {
66            byte[] input = det.fRawInput;
67            int confidence = 10;
68
69            int bytesToCheck = Math.min(input.length, 30);
70            for (int charIndex=0; charIndex<bytesToCheck-1; charIndex+=2) {
71                int codeUnit = codeUnit16FromBytes(input[charIndex], input[charIndex + 1]);
72                if (charIndex == 0 && codeUnit == 0xFEFF) {
73                    confidence = 100;
74                    break;
75                }
76                confidence = adjustConfidence(codeUnit, confidence);
77                if (confidence == 0 || confidence == 100) {
78                    break;
79                }
80            }
81            if (bytesToCheck < 4 && confidence < 100) {
82                confidence = 0;
83            }
84            if (confidence > 0) {
85                return new CharsetMatch(det, this, confidence);
86            }
87            return null;
88        }
89    }
90
91    static class CharsetRecog_UTF_16_LE extends CharsetRecog_Unicode
92    {
93        @Override
94        String getName()
95        {
96            return "UTF-16LE";
97        }
98
99        @Override
100        CharsetMatch match(CharsetDetector det)
101        {
102            byte[] input = det.fRawInput;
103            int confidence = 10;
104
105            int bytesToCheck = Math.min(input.length, 30);
106            for (int charIndex=0; charIndex<bytesToCheck-1; charIndex+=2) {
107                int codeUnit = codeUnit16FromBytes(input[charIndex+1], input[charIndex]);
108                if (charIndex == 0 && codeUnit == 0xFEFF) {
109                    confidence = 100;
110                    break;
111                }
112                confidence = adjustConfidence(codeUnit, confidence);
113                if (confidence == 0 || confidence == 100) {
114                    break;
115                }
116            }
117            if (bytesToCheck < 4 && confidence < 100) {
118                confidence = 0;
119            }
120            if (confidence > 0) {
121                return new CharsetMatch(det, this, confidence);
122            }
123            return null;
124        }
125    }
126
127    static abstract class CharsetRecog_UTF_32 extends CharsetRecog_Unicode
128    {
129        abstract int getChar(byte[] input, int index);
130
131        @Override
132        abstract String getName();
133
134        @Override
135        CharsetMatch match(CharsetDetector det)
136        {
137            byte[] input   = det.fRawInput;
138            int limit      = (det.fRawLength / 4) * 4;
139            int numValid   = 0;
140            int numInvalid = 0;
141            boolean hasBOM = false;
142            int confidence = 0;
143
144            if (limit==0) {
145                return null;
146            }
147            if (getChar(input, 0) == 0x0000FEFF) {
148                hasBOM = true;
149            }
150
151            for(int i = 0; i < limit; i += 4) {
152                int ch = getChar(input, i);
153
154                if (ch < 0 || ch >= 0x10FFFF || (ch >= 0xD800 && ch <= 0xDFFF)) {
155                    numInvalid += 1;
156                } else {
157                    numValid += 1;
158                }
159            }
160
161
162            // Cook up some sort of confidence score, based on presence of a BOM
163            //    and the existence of valid and/or invalid multi-byte sequences.
164            if (hasBOM && numInvalid==0) {
165                confidence = 100;
166            } else if (hasBOM && numValid > numInvalid*10) {
167                confidence = 80;
168            } else if (numValid > 3 && numInvalid == 0) {
169                confidence = 100;
170            } else if (numValid > 0 && numInvalid == 0) {
171                confidence = 80;
172            } else if (numValid > numInvalid*10) {
173                // Probably corrupt UTF-32BE data.  Valid sequences aren't likely by chance.
174                confidence = 25;
175            }
176
177            return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
178        }
179    }
180
181    static class CharsetRecog_UTF_32_BE extends CharsetRecog_UTF_32
182    {
183        @Override
184        int getChar(byte[] input, int index)
185        {
186            return (input[index + 0] & 0xFF) << 24 | (input[index + 1] & 0xFF) << 16 |
187                   (input[index + 2] & 0xFF) <<  8 | (input[index + 3] & 0xFF);
188        }
189
190        @Override
191        String getName()
192        {
193            return "UTF-32BE";
194        }
195    }
196
197
198    static class CharsetRecog_UTF_32_LE extends CharsetRecog_UTF_32
199    {
200        @Override
201        int getChar(byte[] input, int index)
202        {
203            return (input[index + 3] & 0xFF) << 24 | (input[index + 2] & 0xFF) << 16 |
204                   (input[index + 1] & 0xFF) <<  8 | (input[index + 0] & 0xFF);
205        }
206
207        @Override
208        String getName()
209        {
210            return "UTF-32LE";
211        }
212    }
213}
214