1/*
2 **********************************************************************
3 *   Copyright (C) 2005-2012, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 **********************************************************************
6 */
7
8#include "unicode/utypes.h"
9
10#include "cmemory.h"
11
12#if !UCONFIG_NO_CONVERSION
13#include "csrsbcs.h"
14#include "csmatch.h"
15
16#define N_GRAM_SIZE 3
17#define N_GRAM_MASK 0xFFFFFF
18#define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
19
20U_NAMESPACE_BEGIN
21
22NGramParser::NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap)
23  :byteIndex(0), ngram(0)
24{
25    ngramList = theNgramList;
26    charMap   = theCharMap;
27
28    ngramCount = hitCount = 0;
29}
30
31/*
32 * Binary search for value in table, which must have exactly 64 entries.
33 */
34
35int32_t NGramParser::search(const int32_t *table, int32_t value)
36{
37    int32_t index = 0;
38
39    if (table[index + 32] <= value) {
40        index += 32;
41    }
42
43    if (table[index + 16] <= value) {
44        index += 16;
45    }
46
47    if (table[index + 8] <= value) {
48        index += 8;
49    }
50
51    if (table[index + 4] <= value) {
52        index += 4;
53    }
54
55    if (table[index + 2] <= value) {
56        index += 2;
57    }
58
59    if (table[index + 1] <= value) {
60        index += 1;
61    }
62
63    if (table[index] > value) {
64        index -= 1;
65    }
66
67    if (index < 0 || table[index] != value) {
68        return -1;
69    }
70
71    return index;
72}
73
74void NGramParser::lookup(int32_t thisNgram)
75{
76    ngramCount += 1;
77
78    if (search(ngramList, thisNgram) >= 0) {
79        hitCount += 1;
80    }
81
82}
83
84void NGramParser::addByte(int32_t b)
85{
86    ngram = ((ngram << 8) + b) & N_GRAM_MASK;
87    lookup(ngram);
88}
89
90int32_t NGramParser::nextByte(InputText *det)
91{
92    if (byteIndex >= det->fInputLen) {
93        return -1;
94    }
95
96    return det->fInputBytes[byteIndex++];
97}
98
99int32_t NGramParser::parse(InputText *det)
100{
101    int32_t b;
102    bool ignoreSpace = FALSE;
103
104    while ((b = nextByte(det)) >= 0) {
105        uint8_t mb = charMap[b];
106
107        // TODO: 0x20 might not be a space in all character sets...
108        if (mb != 0) {
109            if (!(mb == 0x20 && ignoreSpace)) {
110                addByte(mb);
111            }
112
113            ignoreSpace = (mb == 0x20);
114        }
115    }
116
117    // TODO: Is this OK? The buffer could have ended in the middle of a word...
118    addByte(0x20);
119
120    double rawPercent = (double) hitCount / (double) ngramCount;
121
122    //            if (rawPercent <= 2.0) {
123    //                return 0;
124    //            }
125
126    // TODO - This is a bit of a hack to take care of a case
127    // were we were getting a confidence of 135...
128    if (rawPercent > 0.33) {
129        return 98;
130    }
131
132    return (int32_t) (rawPercent * 300.0);
133}
134
135CharsetRecog_sbcs::CharsetRecog_sbcs()
136{
137    // nothing else to do
138}
139
140CharsetRecog_sbcs::~CharsetRecog_sbcs()
141{
142    // nothing to do
143}
144
145int32_t CharsetRecog_sbcs::match_sbcs(InputText *det, const int32_t ngrams[],  const uint8_t byteMap[]) const
146{
147    NGramParser parser(ngrams, byteMap);
148    int32_t result;
149
150    result = parser.parse(det);
151
152    return result;
153}
154
/*
 * Byte map named for ISO-8859-1: one output byte per possible input byte
 * (row = high nibble, column = low nibble).
 *
 * NGramParser::parse() indexes a map of this shape with every raw byte; an
 * entry of 0x00 makes it drop the byte and 0x20 is handled as a space.
 * In this table 0x41-0x5A and 0x61-0x7A both map to 0x61-0x7A, 0x27 maps to
 * 0x00, and rows C-/D- largely map onto the values of rows E-/F-.
 * NOTE(review): the code that passes this table to match_sbcs() is outside
 * this part of the file.
 */
static const uint8_t charMap_8859_1[] = {
/*           -0    -1    -2    -3    -4    -5    -6    -7    -8    -9    -A    -B    -C    -D    -E    -F   */
/* 0- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 1- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 2- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 3- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 4- */    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
/* 5- */    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 6- */    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
/* 7- */    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 8- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 9- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* A- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20,
/* B- */    0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20, 0x20, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0x20,
/* C- */    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
/* D- */    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF,
/* E- */    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
/* F- */    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
};
189
/*
 * Byte map named for ISO-8859-2: one output byte per possible input byte
 * (row = high nibble, column = low nibble).
 *
 * NGramParser::parse() indexes a map of this shape with every raw byte; an
 * entry of 0x00 makes it drop the byte and 0x20 is handled as a space.
 * Rows 0-/9- match the other ISO-8859 maps in this file; row A- mostly maps
 * onto the values of row B-, and rows C-/D- onto those of rows E-/F-.
 * NOTE(review): the code that passes this table to match_sbcs() is outside
 * this part of the file.
 */
static const uint8_t charMap_8859_2[] = {
/*           -0    -1    -2    -3    -4    -5    -6    -7    -8    -9    -A    -B    -C    -D    -E    -F   */
/* 0- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 1- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 2- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 3- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 4- */    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
/* 5- */    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 6- */    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
/* 7- */    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 8- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 9- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* A- */    0x20, 0xB1, 0x20, 0xB3, 0x20, 0xB5, 0xB6, 0x20, 0x20, 0xB9, 0xBA, 0xBB, 0xBC, 0x20, 0xBE, 0xBF,
/* B- */    0x20, 0xB1, 0x20, 0xB3, 0x20, 0xB5, 0xB6, 0xB7, 0x20, 0xB9, 0xBA, 0xBB, 0xBC, 0x20, 0xBE, 0xBF,
/* C- */    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
/* D- */    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF,
/* E- */    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
/* F- */    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0x20,
};
224
/*
 * Byte map named for ISO-8859-5: one output byte per possible input byte
 * (row = high nibble, column = low nibble).
 *
 * NGramParser::parse() indexes a map of this shape with every raw byte; an
 * entry of 0x00 makes it drop the byte and 0x20 is handled as a space.
 * In the upper half, rows A-/F- share one set of output values (0xF1-0xFF
 * with gaps), rows B-/D- both yield 0xD0-0xDF and rows C-/E- both yield
 * 0xE0-0xEF.
 * NOTE(review): the code that passes this table to match_sbcs() is outside
 * this part of the file.
 */
static const uint8_t charMap_8859_5[] = {
/*           -0    -1    -2    -3    -4    -5    -6    -7    -8    -9    -A    -B    -C    -D    -E    -F   */
/* 0- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 1- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 2- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 3- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 4- */    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
/* 5- */    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 6- */    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
/* 7- */    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 8- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 9- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* A- */    0x20, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x20, 0xFE, 0xFF,
/* B- */    0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
/* C- */    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
/* D- */    0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
/* E- */    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
/* F- */    0x20, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x20, 0xFE, 0xFF,
};
259
/*
 * Byte map named for ISO-8859-6: one output byte per possible input byte
 * (row = high nibble, column = low nibble).
 *
 * NGramParser::parse() indexes a map of this shape with every raw byte; an
 * entry of 0x00 makes it drop the byte and 0x20 is handled as a space.
 * In the upper half only 0xC1-0xDA and 0xE0-0xEA survive (each mapping to
 * itself); every other byte from 0x80 up becomes 0x20.
 * NOTE(review): the code that passes this table to match_sbcs() is outside
 * this part of the file.
 */
static const uint8_t charMap_8859_6[] = {
/*           -0    -1    -2    -3    -4    -5    -6    -7    -8    -9    -A    -B    -C    -D    -E    -F   */
/* 0- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 1- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 2- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 3- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 4- */    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
/* 5- */    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 6- */    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
/* 7- */    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 8- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 9- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* A- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* B- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* C- */    0x20, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
/* D- */    0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0x20, 0x20, 0x20, 0x20, 0x20,
/* E- */    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0x20, 0x20, 0x20, 0x20, 0x20,
/* F- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
};
294
/*
 * Byte map named for ISO-8859-7: one output byte per possible input byte
 * (row = high nibble, column = low nibble).
 *
 * NGramParser::parse() indexes a map of this shape with every raw byte; an
 * entry of 0x00 makes it drop the byte and 0x20 is handled as a space.
 * Several bytes in row B- map to values in the 0xDC-0xDF / 0xFC-0xFE range,
 * and row C-/D- mostly map onto the values of rows E-/F- (0xC0 maps to
 * itself, 0xD2 maps to 0x20).
 * NOTE(review): the code that passes this table to match_sbcs() is outside
 * this part of the file.
 */
static const uint8_t charMap_8859_7[] = {
/*           -0    -1    -2    -3    -4    -5    -6    -7    -8    -9    -A    -B    -C    -D    -E    -F   */
/* 0- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 1- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 2- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 3- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 4- */    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
/* 5- */    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 6- */    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
/* 7- */    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 8- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 9- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* A- */    0x20, 0xA1, 0xA2, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* B- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xDC, 0x20, 0xDD, 0xDE, 0xDF, 0x20, 0xFC, 0x20, 0xFD, 0xFE,
/* C- */    0xC0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
/* D- */    0xF0, 0xF1, 0x20, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xDC, 0xDD, 0xDE, 0xDF,
/* E- */    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
/* F- */    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0x20,
};
329
/*
 * Byte map named for ISO-8859-8: one output byte per possible input byte
 * (row = high nibble, column = low nibble).
 *
 * NGramParser::parse() indexes a map of this shape with every raw byte; an
 * entry of 0x00 makes it drop the byte and 0x20 is handled as a space.
 * In the upper half only 0xB5 and 0xE0-0xFA survive (each mapping to
 * itself); every other byte from 0x80 up becomes 0x20.
 * NOTE(review): the code that passes this table to match_sbcs() is outside
 * this part of the file.
 */
static const uint8_t charMap_8859_8[] = {
/*           -0    -1    -2    -3    -4    -5    -6    -7    -8    -9    -A    -B    -C    -D    -E    -F   */
/* 0- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 1- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 2- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 3- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 4- */    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
/* 5- */    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 6- */    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
/* 7- */    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 8- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 9- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* A- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* B- */    0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* C- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* D- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* E- */    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
/* F- */    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0x20, 0x20, 0x20, 0x20, 0x20,
};
364
/*
 * Byte map named for ISO-8859-9: one output byte per possible input byte
 * (row = high nibble, column = low nibble).
 *
 * NGramParser::parse() indexes a map of this shape with every raw byte; an
 * entry of 0x00 makes it drop the byte and 0x20 is handled as a space.
 * The values equal those of charMap_8859_1 except at 0xDD, which maps to
 * 0x69 here.
 * NOTE(review): the code that passes this table to match_sbcs() is outside
 * this part of the file.
 */
static const uint8_t charMap_8859_9[] = {
/*           -0    -1    -2    -3    -4    -5    -6    -7    -8    -9    -A    -B    -C    -D    -E    -F   */
/* 0- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 1- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 2- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 3- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 4- */    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
/* 5- */    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 6- */    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
/* 7- */    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 8- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 9- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* A- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20,
/* B- */    0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20, 0x20, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0x20,
/* C- */    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
/* D- */    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x69, 0xFE, 0xDF,
/* E- */    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
/* F- */    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
};
399
/*
 * Trigram table named for windows-1251.
 *
 * 64 values in ascending order - the shape NGramParser::search() expects.
 * Each value packs three bytes (most significant first) in the form
 * NGramParser::addByte() builds them, i.e. after byte-map translation.
 */
static const int32_t ngrams_windows_1251[] = {
    0x20E220, 0x20E2EE, 0x20E4EE, 0x20E7E0, 0x20E820, 0x20EAE0, 0x20EAEE, 0x20EDE0,
    0x20EDE5, 0x20EEE1, 0x20EFEE, 0x20EFF0, 0x20F0E0, 0x20F1EE, 0x20F1F2, 0x20F2EE,
    0x20F7F2, 0x20FDF2, 0xE0EDE8, 0xE0F2FC, 0xE3EE20, 0xE5EBFC, 0xE5EDE8, 0xE5F1F2,
    0xE5F220, 0xE820EF, 0xE8E520, 0xE8E820, 0xE8FF20, 0xEBE5ED, 0xEBE820, 0xEBFCED,
    0xEDE020, 0xEDE520, 0xEDE8E5, 0xEDE8FF, 0xEDEE20, 0xEDEEE2, 0xEE20E2, 0xEE20EF,
    0xEE20F1, 0xEEE220, 0xEEE2E0, 0xEEE3EE, 0xEEE920, 0xEEEBFC, 0xEEEC20, 0xEEF1F2,
    0xEFEEEB, 0xEFF0E5, 0xEFF0E8, 0xEFF0EE, 0xF0E0E2, 0xF0E5E4, 0xF1F2E0, 0xF1F2E2,
    0xF1F2E8, 0xF1FF20, 0xF2E5EB, 0xF2EE20, 0xF2EEF0, 0xF2FC20, 0xF7F2EE, 0xFBF520,
};
406
/*
 * Byte map named for windows-1251: one output byte per possible input byte
 * (row = high nibble, column = low nibble).
 *
 * NGramParser::parse() indexes a map of this shape with every raw byte; an
 * entry of 0x00 makes it drop the byte and 0x20 is handled as a space.
 * Rows C-/D- map onto the values of rows E-/F- (0xE0-0xFF); rows 8- to B-
 * keep a scattering of values and turn the rest into 0x20.
 * NOTE(review): the code that passes this table to match_sbcs() is outside
 * this part of the file.
 */
static const uint8_t charMap_windows_1251[] = {
/*           -0    -1    -2    -3    -4    -5    -6    -7    -8    -9    -A    -B    -C    -D    -E    -F   */
/* 0- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 1- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 2- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 3- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 4- */    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
/* 5- */    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 6- */    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
/* 7- */    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 8- */    0x90, 0x83, 0x20, 0x83, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x9A, 0x20, 0x9C, 0x9D, 0x9E, 0x9F,
/* 9- */    0x90, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x9A, 0x20, 0x9C, 0x9D, 0x9E, 0x9F,
/* A- */    0x20, 0xA2, 0xA2, 0xBC, 0x20, 0xB4, 0x20, 0x20, 0xB8, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0xBF,
/* B- */    0x20, 0x20, 0xB3, 0xB3, 0xB4, 0xB5, 0x20, 0x20, 0xB8, 0x20, 0xBA, 0x20, 0xBC, 0xBE, 0xBE, 0xBF,
/* C- */    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
/* D- */    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
/* E- */    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
/* F- */    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
};
441
/*
 * Trigram table named for windows-1256.
 *
 * 64 values in ascending order - the shape NGramParser::search() expects.
 * Each value packs three bytes (most significant first) in the form
 * NGramParser::addByte() builds them, i.e. after byte-map translation.
 */
static const int32_t ngrams_windows_1256[] = {
    0x20C7E1, 0x20C7E4, 0x20C8C7, 0x20DAE1, 0x20DDED, 0x20E1E1, 0x20E3E4, 0x20E6C7,
    0xC720C7, 0xC7C120, 0xC7CA20, 0xC7D120, 0xC7E120, 0xC7E1C3, 0xC7E1C7, 0xC7E1C8,
    0xC7E1CA, 0xC7E1CC, 0xC7E1CD, 0xC7E1CF, 0xC7E1D3, 0xC7E1DA, 0xC7E1DE, 0xC7E1E3,
    0xC7E1E6, 0xC7E1ED, 0xC7E320, 0xC7E420, 0xC7E4CA, 0xC820C7, 0xC920C7, 0xC920DD,
    0xC920E1, 0xC920E3, 0xC920E6, 0xCA20C7, 0xCF20C7, 0xCFC920, 0xD120C7, 0xD1C920,
    0xD320C7, 0xDA20C7, 0xDAE1EC, 0xDDED20, 0xE120C7, 0xE1C920, 0xE1EC20, 0xE1ED20,
    0xE320C7, 0xE3C720, 0xE3C920, 0xE3E420, 0xE420C7, 0xE520C7, 0xE5C720, 0xE6C7E1,
    0xE6E420, 0xEC20C7, 0xED20C7, 0xED20E3, 0xED20E6, 0xEDC920, 0xEDD120, 0xEDE420,
};
448
/*
 * Byte map named for windows-1256: one output byte per possible input byte
 * (row = high nibble, column = low nibble).
 *
 * NGramParser::parse() indexes a map of this shape with every raw byte; an
 * entry of 0x00 makes it drop the byte and 0x20 is handled as a space.
 * Rows C- to E- map almost every byte to itself (0xD7 becomes 0x20); rows
 * 8-, 9-, A-, B- and F- keep a scattering of values and turn the rest into
 * 0x20. 0x8C maps to 0x9C.
 * NOTE(review): the code that passes this table to match_sbcs() is outside
 * this part of the file.
 */
static const uint8_t charMap_windows_1256[] = {
/*           -0    -1    -2    -3    -4    -5    -6    -7    -8    -9    -A    -B    -C    -D    -E    -F   */
/* 0- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 1- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 2- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 3- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 4- */    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
/* 5- */    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 6- */    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
/* 7- */    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 8- */    0x20, 0x81, 0x20, 0x83, 0x20, 0x20, 0x20, 0x20, 0x88, 0x20, 0x8A, 0x20, 0x9C, 0x8D, 0x8E, 0x8F,
/* 9- */    0x90, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x98, 0x20, 0x9A, 0x20, 0x9C, 0x20, 0x20, 0x9F,
/* A- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20,
/* B- */    0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* C- */    0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
/* D- */    0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0x20, 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
/* E- */    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
/* F- */    0x20, 0x20, 0x20, 0x20, 0xF4, 0x20, 0x20, 0x20, 0x20, 0xF9, 0x20, 0xFB, 0xFC, 0x20, 0x20, 0xFF,
};
483
/*
 * Trigram table named for KOI8-R.
 *
 * 64 values in ascending order - the shape NGramParser::search() expects.
 * Each value packs three bytes (most significant first) in the form
 * NGramParser::addByte() builds them, i.e. after byte-map translation.
 */
static const int32_t ngrams_KOI8_R[] = {
    0x20C4CF, 0x20C920, 0x20CBC1, 0x20CBCF, 0x20CEC1, 0x20CEC5, 0x20CFC2, 0x20D0CF,
    0x20D0D2, 0x20D2C1, 0x20D3CF, 0x20D3D4, 0x20D4CF, 0x20D720, 0x20D7CF, 0x20DAC1,
    0x20DCD4, 0x20DED4, 0xC1CEC9, 0xC1D4D8, 0xC5CCD8, 0xC5CEC9, 0xC5D3D4, 0xC5D420,
    0xC7CF20, 0xC920D0, 0xC9C520, 0xC9C920, 0xC9D120, 0xCCC5CE, 0xCCC920, 0xCCD8CE,
    0xCEC120, 0xCEC520, 0xCEC9C5, 0xCEC9D1, 0xCECF20, 0xCECFD7, 0xCF20D0, 0xCF20D3,
    0xCF20D7, 0xCFC7CF, 0xCFCA20, 0xCFCCD8, 0xCFCD20, 0xCFD3D4, 0xCFD720, 0xCFD7C1,
    0xD0CFCC, 0xD0D2C5, 0xD0D2C9, 0xD0D2CF, 0xD2C1D7, 0xD2C5C4, 0xD3D120, 0xD3D4C1,
    0xD3D4C9, 0xD3D4D7, 0xD4C5CC, 0xD4CF20, 0xD4CFD2, 0xD4D820, 0xD9C820, 0xDED4CF,
};
490
/*
 * Byte map named for KOI8-R: one output byte per possible input byte
 * (row = high nibble, column = low nibble).
 *
 * NGramParser::parse() indexes a map of this shape with every raw byte; an
 * entry of 0x00 makes it drop the byte and 0x20 is handled as a space.
 * Rows C-/D- map each byte to itself and rows E-/F- map onto the same
 * 0xC0-0xDF values; 0xA3 and 0xB3 both map to 0xA3.
 * NOTE(review): the code that passes this table to match_sbcs() is outside
 * this part of the file.
 */
static const uint8_t charMap_KOI8_R[] = {
/*           -0    -1    -2    -3    -4    -5    -6    -7    -8    -9    -A    -B    -C    -D    -E    -F   */
/* 0- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 1- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 2- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 3- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 4- */    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
/* 5- */    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 6- */    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
/* 7- */    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 8- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* 9- */    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* A- */    0x20, 0x20, 0x20, 0xA3, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* B- */    0x20, 0x20, 0x20, 0xA3, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
/* C- */    0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
/* D- */    0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
/* E- */    0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
/* F- */    0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
};
525
/*
 * Trigram table named for IBM424, Hebrew, right-to-left.
 *
 * 64 values in ascending order - the shape NGramParser::search() expects.
 * Each value packs three bytes (most significant first) in the form
 * NGramParser::addByte() builds them, i.e. after byte-map translation.
 */
static const int32_t ngrams_IBM424_he_rtl[] = {
    0x404146, 0x404148, 0x404151, 0x404171, 0x404251, 0x404256, 0x404541, 0x404546,
    0x404551, 0x404556, 0x404562, 0x404569, 0x404571, 0x405441, 0x405445, 0x405641,
    0x406254, 0x406954, 0x417140, 0x454041, 0x454042, 0x454045, 0x454054, 0x454056,
    0x454069, 0x454641, 0x464140, 0x465540, 0x465740, 0x466840, 0x467140, 0x514045,
    0x514540, 0x514671, 0x515155, 0x515540, 0x515740, 0x516840, 0x517140, 0x544041,
    0x544045, 0x544140, 0x544540, 0x554041, 0x554042, 0x554045, 0x554054, 0x554056,
    0x554069, 0x564540, 0x574045, 0x584540, 0x585140, 0x585155, 0x625440, 0x684045,
    0x685155, 0x695440, 0x714041, 0x714042, 0x714045, 0x714054, 0x714056, 0x714069,
};
532
/*
 * Trigram table named for IBM424, Hebrew, left-to-right.
 *
 * 64 values in ascending order - the shape NGramParser::search() expects.
 * Each value packs three bytes (most significant first) in the form
 * NGramParser::addByte() builds them, i.e. after byte-map translation.
 */
static const int32_t ngrams_IBM424_he_ltr[] = {
    0x404146, 0x404154, 0x404551, 0x404554, 0x404556, 0x404558, 0x405158, 0x405462,
    0x405469, 0x405546, 0x405551, 0x405746, 0x405751, 0x406846, 0x406851, 0x407141,
    0x407146, 0x407151, 0x414045, 0x414054, 0x414055, 0x414071, 0x414540, 0x414645,
    0x415440, 0x415640, 0x424045, 0x424055, 0x424071, 0x454045, 0x454051, 0x454054,
    0x454055, 0x454057, 0x454068, 0x454071, 0x455440, 0x464140, 0x464540, 0x484140,
    0x514140, 0x514240, 0x514540, 0x544045, 0x544055, 0x544071, 0x546240, 0x546940,
    0x555151, 0x555158, 0x555168, 0x564045, 0x564055, 0x564071, 0x564240, 0x564540,
    0x624540, 0x694045, 0x694055, 0x694071, 0x694540, 0x714140, 0x714540, 0x714651,
};
539
/*
 * Byte map named for IBM424 (Hebrew): one output byte per possible input
 * byte, sixteen entries per row; the comment after each row gives the input
 * range it covers.
 *
 * The filler value is 0x40 where the other byte maps in this file use 0x20,
 * and 0x7D is the one input mapped to 0x00. Rows 0xC0-0xEF map onto the
 * values of rows 0x80-0xAF.
 * NOTE(review): NGramParser::parse() in this file special-cases 0x20, not
 * 0x40 - confirm that is intended for whichever parser consumes this table.
 */
static const uint8_t charMap_IBM424_he[] = {
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x00-0x0F
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x10-0x1F
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x20-0x2F
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x30-0x3F
    0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x40-0x4F
    0x40, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x50-0x5F
    0x40, 0x40, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x60-0x6F
    0x40, 0x71, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x00, 0x40, 0x40,  // 0x70-0x7F
    0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x80-0x8F
    0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x90-0x9F
    0xA0, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0xA0-0xAF
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0xB0-0xBF
    0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0xC0-0xCF
    0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0xD0-0xDF
    0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0xE0-0xEF
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0xF0-0xFF
};
559
/*
 * Trigram table named for IBM420, Arabic, right-to-left.
 *
 * 64 values in ascending order - the shape NGramParser::search() expects.
 * Each value packs three bytes (most significant first) in the form
 * NGramParser::addByte() builds them, i.e. after byte-map translation.
 */
static const int32_t ngrams_IBM420_ar_rtl[] = {
    0x4056B1, 0x4056BD, 0x405856, 0x409AB1, 0x40ABDC, 0x40B1B1, 0x40BBBD, 0x40CF56,
    0x564056, 0x564640, 0x566340, 0x567540, 0x56B140, 0x56B149, 0x56B156, 0x56B158,
    0x56B163, 0x56B167, 0x56B169, 0x56B173, 0x56B178, 0x56B19A, 0x56B1AD, 0x56B1BB,
    0x56B1CF, 0x56B1DC, 0x56BB40, 0x56BD40, 0x56BD63, 0x584056, 0x624056, 0x6240AB,
    0x6240B1, 0x6240BB, 0x6240CF, 0x634056, 0x734056, 0x736240, 0x754056, 0x756240,
    0x784056, 0x9A4056, 0x9AB1DA, 0xABDC40, 0xB14056, 0xB16240, 0xB1DA40, 0xB1DC40,
    0xBB4056, 0xBB5640, 0xBB6240, 0xBBBD40, 0xBD4056, 0xBF4056, 0xBF5640, 0xCF56B1,
    0xCFBD40, 0xDA4056, 0xDC4056, 0xDC40BB, 0xDC40CF, 0xDC6240, 0xDC7540, 0xDCBD40,
};
566
/*
 * Trigram table named for IBM420, Arabic, left-to-right.
 *
 * 64 values in ascending order - the shape NGramParser::search() expects.
 * Each value packs three bytes (most significant first) in the form
 * NGramParser::addByte() builds them, i.e. after byte-map translation.
 */
static const int32_t ngrams_IBM420_ar_ltr[] = {
    0x404656, 0x4056BB, 0x4056BF, 0x406273, 0x406275, 0x4062B1, 0x4062BB, 0x4062DC,
    0x406356, 0x407556, 0x4075DC, 0x40B156, 0x40BB56, 0x40BD56, 0x40BDBB, 0x40BDCF,
    0x40BDDC, 0x40DAB1, 0x40DCAB, 0x40DCB1, 0x49B156, 0x564056, 0x564058, 0x564062,
    0x564063, 0x564073, 0x564075, 0x564078, 0x56409A, 0x5640B1, 0x5640BB, 0x5640BD,
    0x5640BF, 0x5640DA, 0x5640DC, 0x565840, 0x56B156, 0x56CF40, 0x58B156, 0x63B156,
    0x63BD56, 0x67B156, 0x69B156, 0x73B156, 0x78B156, 0x9AB156, 0xAB4062, 0xADB156,
    0xB14062, 0xB15640, 0xB156CF, 0xB19A40, 0xB1B140, 0xBB4062, 0xBB40DC, 0xBBB156,
    0xBD5640, 0xBDBB40, 0xCF4062, 0xCF40DC, 0xCFB156, 0xDAB19A, 0xDCAB40, 0xDCB156
};
573
/*
 * Byte map named for IBM420 (Arabic): one output byte per possible input
 * byte, sixteen entries per row; the comment after each row gives the input
 * range it covers.
 *
 * The filler value is 0x40 where most other byte maps in this file use 0x20,
 * and no input maps to 0x00. 0xC1-0xC9, 0xD1-0xD9 and 0xE2-0xE9 map onto the
 * values of 0x81-0x89, 0x91-0x99 and 0xA2-0xA9.
 * NOTE(review): NGramParser::parse() in this file special-cases 0x20, not
 * 0x40 - confirm that is intended for whichever parser consumes this table.
 */
static const uint8_t charMap_IBM420_ar[]= {
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x00-0x0F
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x10-0x1F
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x20-0x2F
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x30-0x3F
    0x40, 0x40, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x40-0x4F
    0x40, 0x51, 0x52, 0x40, 0x40, 0x55, 0x56, 0x57, 0x58, 0x59, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x50-0x5F
    0x40, 0x40, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x60-0x6F
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x70-0x7F
    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,  // 0x80-0x8F
    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F,  // 0x90-0x9F
    0xA0, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF,  // 0xA0-0xAF
    0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0x40, 0x40, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,  // 0xB0-0xBF
    0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0xCB, 0x40, 0xCD, 0x40, 0xCF,  // 0xC0-0xCF
    0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,  // 0xD0-0xDF
    0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xEA, 0xEB, 0x40, 0xED, 0xEE, 0xEF,  // 0xE0-0xEF
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0xFB, 0xFC, 0xFD, 0xFE, 0x40,  // 0xF0-0xFF
};
593
// N-gram tables for ISO-8859-1, -2, -5, -6, -7, -8 and -9
595
/*
 * One trigram table paired with the language it belongs to.
 */
struct NGramsPlusLang {
    // Exactly 64 trigram values - the table size NGramParser::search() reads.
    const int32_t ngrams[64];

    // Short language tag for the table, e.g. "en" or "da".
    const char *lang;
};
600
/*
 * Per-language trigram tables tried by CharsetRecog_8859_1::match().
 * Each entry holds 64 values plus a language tag; every value packs three
 * bytes, most significant first (e.g. 0x206120 is 0x20 0x61 0x20), the same
 * layout NGramParser::addByte() builds.
 * Assuming these tables end up in NGramParser::search() (a fixed 64-entry
 * binary search), each list must stay exactly 64 long and in ascending order.
 */
601static const NGramsPlusLang ngrams_8859_1[] =  {
602  {
603    {
604    0x206120, 0x20616E, 0x206265, 0x20636F, 0x20666F, 0x206861, 0x206865, 0x20696E, 0x206D61, 0x206F66, 0x207072, 0x207265, 0x207361, 0x207374, 0x207468, 0x20746F,
605    0x207768, 0x616964, 0x616C20, 0x616E20, 0x616E64, 0x617320, 0x617420, 0x617465, 0x617469, 0x642061, 0x642074, 0x652061, 0x652073, 0x652074, 0x656420, 0x656E74,
606    0x657220, 0x657320, 0x666F72, 0x686174, 0x686520, 0x686572, 0x696420, 0x696E20, 0x696E67, 0x696F6E, 0x697320, 0x6E2061, 0x6E2074, 0x6E6420, 0x6E6720, 0x6E7420,
607    0x6F6620, 0x6F6E20, 0x6F7220, 0x726520, 0x727320, 0x732061, 0x732074, 0x736169, 0x737420, 0x742074, 0x746572, 0x746861, 0x746865, 0x74696F, 0x746F20, 0x747320,
608    },
609    "en"
610  },
611  {
612    {
613    0x206166, 0x206174, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207369, 0x207374, 0x207469, 0x207669, 0x616620,
614    0x616E20, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646572, 0x646574, 0x652073, 0x656420, 0x656465, 0x656E20, 0x656E64, 0x657220, 0x657265, 0x657320,
615    0x657420, 0x666F72, 0x676520, 0x67656E, 0x676572, 0x696765, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6572, 0x6C6967, 0x6C6C65, 0x6D6564, 0x6E6465, 0x6E6520,
616    0x6E6720, 0x6E6765, 0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722064, 0x722065, 0x722073, 0x726520, 0x737465, 0x742073, 0x746520, 0x746572, 0x74696C, 0x766572,
617    },
618    "da"
619  },
620  {
621    {
622    0x20616E, 0x206175, 0x206265, 0x206461, 0x206465, 0x206469, 0x206569, 0x206765, 0x206861, 0x20696E, 0x206D69, 0x207363, 0x207365, 0x20756E, 0x207665, 0x20766F,
623    0x207765, 0x207A75, 0x626572, 0x636820, 0x636865, 0x636874, 0x646173, 0x64656E, 0x646572, 0x646965, 0x652064, 0x652073, 0x65696E, 0x656974, 0x656E20, 0x657220,
624    0x657320, 0x67656E, 0x68656E, 0x687420, 0x696368, 0x696520, 0x696E20, 0x696E65, 0x697420, 0x6C6963, 0x6C6C65, 0x6E2061, 0x6E2064, 0x6E2073, 0x6E6420, 0x6E6465,
625    0x6E6520, 0x6E6720, 0x6E6765, 0x6E7465, 0x722064, 0x726465, 0x726569, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x756E64, 0x756E67, 0x766572,
626    },
627    "de"
628  },
629  {
630    {
631    0x206120, 0x206361, 0x20636F, 0x206465, 0x20656C, 0x20656E, 0x206573, 0x20696E, 0x206C61, 0x206C6F, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365,
632    0x20756E, 0x207920, 0x612063, 0x612064, 0x612065, 0x61206C, 0x612070, 0x616369, 0x61646F, 0x616C20, 0x617220, 0x617320, 0x6369F3, 0x636F6E, 0x646520, 0x64656C,
633    0x646F20, 0x652064, 0x652065, 0x65206C, 0x656C20, 0x656E20, 0x656E74, 0x657320, 0x657374, 0x69656E, 0x69F36E, 0x6C6120, 0x6C6F73, 0x6E2065, 0x6E7465, 0x6F2064,
634    0x6F2065, 0x6F6E20, 0x6F7220, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732064, 0x732065, 0x732070, 0x736520, 0x746520, 0x746F20, 0x756520, 0xF36E20,
635    },
636    "es"
637  },
638  {
639    {
640    0x206175, 0x20636F, 0x206461, 0x206465, 0x206475, 0x20656E, 0x206574, 0x206C61, 0x206C65, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207365, 0x20736F, 0x20756E,
641    0x20E020, 0x616E74, 0x617469, 0x636520, 0x636F6E, 0x646520, 0x646573, 0x647520, 0x652061, 0x652063, 0x652064, 0x652065, 0x65206C, 0x652070, 0x652073, 0x656E20,
642    0x656E74, 0x657220, 0x657320, 0x657420, 0x657572, 0x696F6E, 0x697320, 0x697420, 0x6C6120, 0x6C6520, 0x6C6573, 0x6D656E, 0x6E2064, 0x6E6520, 0x6E7320, 0x6E7420,
643    0x6F6E20, 0x6F6E74, 0x6F7572, 0x717565, 0x72206C, 0x726520, 0x732061, 0x732064, 0x732065, 0x73206C, 0x732070, 0x742064, 0x746520, 0x74696F, 0x756520, 0x757220,
644    },
645    "fr"
646  },
647  {
648    {
649    0x20616C, 0x206368, 0x20636F, 0x206465, 0x206469, 0x206520, 0x20696C, 0x20696E, 0x206C61, 0x207065, 0x207072, 0x20756E, 0x612063, 0x612064, 0x612070, 0x612073,
650    0x61746F, 0x636865, 0x636F6E, 0x64656C, 0x646920, 0x652061, 0x652063, 0x652064, 0x652069, 0x65206C, 0x652070, 0x652073, 0x656C20, 0x656C6C, 0x656E74, 0x657220,
651    0x686520, 0x692061, 0x692063, 0x692064, 0x692073, 0x696120, 0x696C20, 0x696E20, 0x696F6E, 0x6C6120, 0x6C6520, 0x6C6920, 0x6C6C61, 0x6E6520, 0x6E6920, 0x6E6F20,
652    0x6E7465, 0x6F2061, 0x6F2064, 0x6F2069, 0x6F2073, 0x6F6E20, 0x6F6E65, 0x706572, 0x726120, 0x726520, 0x736920, 0x746120, 0x746520, 0x746920, 0x746F20, 0x7A696F,
653    },
654    "it"
655  },
656  {
657    {
658    0x20616C, 0x206265, 0x206461, 0x206465, 0x206469, 0x206565, 0x20656E, 0x206765, 0x206865, 0x20696E, 0x206D61, 0x206D65, 0x206F70, 0x207465, 0x207661, 0x207665,
659    0x20766F, 0x207765, 0x207A69, 0x61616E, 0x616172, 0x616E20, 0x616E64, 0x617220, 0x617420, 0x636874, 0x646520, 0x64656E, 0x646572, 0x652062, 0x652076, 0x65656E,
660    0x656572, 0x656E20, 0x657220, 0x657273, 0x657420, 0x67656E, 0x686574, 0x696520, 0x696E20, 0x696E67, 0x697320, 0x6E2062, 0x6E2064, 0x6E2065, 0x6E2068, 0x6E206F,
661    0x6E2076, 0x6E6465, 0x6E6720, 0x6F6E64, 0x6F6F72, 0x6F7020, 0x6F7220, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x76616E, 0x766572, 0x766F6F,
662    },
663    "nl"
664  },
665  {
666    {
667    0x206174, 0x206176, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207365, 0x20736B, 0x20736F, 0x207374, 0x207469,
668    0x207669, 0x20E520, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646574, 0x652073, 0x656420, 0x656E20, 0x656E65, 0x657220, 0x657265, 0x657420, 0x657474,
669    0x666F72, 0x67656E, 0x696B6B, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6520, 0x6C6C65, 0x6D6564, 0x6D656E, 0x6E2073, 0x6E6520, 0x6E6720, 0x6E6765, 0x6E6E65,
670    0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722073, 0x726520, 0x736F6D, 0x737465, 0x742073, 0x746520, 0x74656E, 0x746572, 0x74696C, 0x747420, 0x747465, 0x766572,
671    },
672    "no"
673  },
674  {
675    {
676    0x206120, 0x20636F, 0x206461, 0x206465, 0x20646F, 0x206520, 0x206573, 0x206D61, 0x206E6F, 0x206F20, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365,
677    0x20756D, 0x612061, 0x612063, 0x612064, 0x612070, 0x616465, 0x61646F, 0x616C20, 0x617220, 0x617261, 0x617320, 0x636F6D, 0x636F6E, 0x646120, 0x646520, 0x646F20,
678    0x646F73, 0x652061, 0x652064, 0x656D20, 0x656E74, 0x657320, 0x657374, 0x696120, 0x696361, 0x6D656E, 0x6E7465, 0x6E746F, 0x6F2061, 0x6F2063, 0x6F2064, 0x6F2065,
679    0x6F2070, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732061, 0x732064, 0x732065, 0x732070, 0x737461, 0x746520, 0x746F20, 0x756520, 0xE36F20, 0xE7E36F,
680    },
681    "pt"
682  },
683  {
684    {
685    0x206174, 0x206176, 0x206465, 0x20656E, 0x2066F6, 0x206861, 0x206920, 0x20696E, 0x206B6F, 0x206D65, 0x206F63, 0x2070E5, 0x20736B, 0x20736F, 0x207374, 0x207469,
686    0x207661, 0x207669, 0x20E472, 0x616465, 0x616E20, 0x616E64, 0x617220, 0x617474, 0x636820, 0x646520, 0x64656E, 0x646572, 0x646574, 0x656420, 0x656E20, 0x657220,
687    0x657420, 0x66F672, 0x67656E, 0x696C6C, 0x696E67, 0x6B6120, 0x6C6C20, 0x6D6564, 0x6E2073, 0x6E6120, 0x6E6465, 0x6E6720, 0x6E6765, 0x6E696E, 0x6F6368, 0x6F6D20,
688    0x6F6E20, 0x70E520, 0x722061, 0x722073, 0x726120, 0x736B61, 0x736F6D, 0x742073, 0x746120, 0x746520, 0x746572, 0x74696C, 0x747420, 0x766172, 0xE47220, 0xF67220,
689    },
690    "sv"
691  }
692};
693
694
/*
 * Per-language trigram tables tried by CharsetRecog_8859_2::match().
 * Each entry holds 64 packed three-byte values (most significant byte first)
 * plus the language tag reported when that entry scores best.
 * Assuming these tables end up in NGramParser::search() (a fixed 64-entry
 * binary search), each list must stay exactly 64 long and in ascending order.
 */
695static const NGramsPlusLang ngrams_8859_2[] =  {
696  {
697    {
698    0x206120, 0x206279, 0x20646F, 0x206A65, 0x206E61, 0x206E65, 0x206F20, 0x206F64, 0x20706F, 0x207072, 0x2070F8, 0x20726F, 0x207365, 0x20736F, 0x207374, 0x20746F,
699    0x207620, 0x207679, 0x207A61, 0x612070, 0x636520, 0x636820, 0x652070, 0x652073, 0x652076, 0x656D20, 0x656EED, 0x686F20, 0x686F64, 0x697374, 0x6A6520, 0x6B7465,
700    0x6C6520, 0x6C6920, 0x6E6120, 0x6EE920, 0x6EEC20, 0x6EED20, 0x6F2070, 0x6F646E, 0x6F6A69, 0x6F7374, 0x6F7520, 0x6F7661, 0x706F64, 0x706F6A, 0x70726F, 0x70F865,
701    0x736520, 0x736F75, 0x737461, 0x737469, 0x73746E, 0x746572, 0x746EED, 0x746F20, 0x752070, 0xBE6520, 0xE16EED, 0xE9686F, 0xED2070, 0xED2073, 0xED6D20, 0xF86564,
702    },
703    "cs"
704  },
705  {
706    {
707    0x206120, 0x20617A, 0x206265, 0x206567, 0x20656C, 0x206665, 0x206861, 0x20686F, 0x206973, 0x206B65, 0x206B69, 0x206BF6, 0x206C65, 0x206D61, 0x206D65, 0x206D69,
708    0x206E65, 0x20737A, 0x207465, 0x20E973, 0x612061, 0x61206B, 0x61206D, 0x612073, 0x616B20, 0x616E20, 0x617A20, 0x62616E, 0x62656E, 0x656779, 0x656B20, 0x656C20,
709    0x656C65, 0x656D20, 0x656E20, 0x657265, 0x657420, 0x657465, 0x657474, 0x677920, 0x686F67, 0x696E74, 0x697320, 0x6B2061, 0x6BF67A, 0x6D6567, 0x6D696E, 0x6E2061,
710    0x6E616B, 0x6E656B, 0x6E656D, 0x6E7420, 0x6F6779, 0x732061, 0x737A65, 0x737A74, 0x737AE1, 0x73E967, 0x742061, 0x747420, 0x74E173, 0x7A6572, 0xE16E20, 0xE97320,
711    },
712    "hu"
713  },
714  {
715    {
716    0x20637A, 0x20646F, 0x206920, 0x206A65, 0x206B6F, 0x206D61, 0x206D69, 0x206E61, 0x206E69, 0x206F64, 0x20706F, 0x207072, 0x207369, 0x207720, 0x207769, 0x207779,
717    0x207A20, 0x207A61, 0x612070, 0x612077, 0x616E69, 0x636820, 0x637A65, 0x637A79, 0x646F20, 0x647A69, 0x652070, 0x652073, 0x652077, 0x65207A, 0x65676F, 0x656A20,
718    0x656D20, 0x656E69, 0x676F20, 0x696120, 0x696520, 0x69656A, 0x6B6120, 0x6B6920, 0x6B6965, 0x6D6965, 0x6E6120, 0x6E6961, 0x6E6965, 0x6F2070, 0x6F7761, 0x6F7769,
719    0x706F6C, 0x707261, 0x70726F, 0x70727A, 0x727A65, 0x727A79, 0x7369EA, 0x736B69, 0x737461, 0x776965, 0x796368, 0x796D20, 0x7A6520, 0x7A6965, 0x7A7920, 0xF37720,
720    },
721    "pl"
722  },
723  {
724    {
725    0x206120, 0x206163, 0x206361, 0x206365, 0x20636F, 0x206375, 0x206465, 0x206469, 0x206C61, 0x206D61, 0x207065, 0x207072, 0x207365, 0x2073E3, 0x20756E, 0x20BA69,
726    0x20EE6E, 0x612063, 0x612064, 0x617265, 0x617420, 0x617465, 0x617520, 0x636172, 0x636F6E, 0x637520, 0x63E320, 0x646520, 0x652061, 0x652063, 0x652064, 0x652070,
727    0x652073, 0x656120, 0x656920, 0x656C65, 0x656E74, 0x657374, 0x692061, 0x692063, 0x692064, 0x692070, 0x696520, 0x696920, 0x696E20, 0x6C6120, 0x6C6520, 0x6C6F72,
728    0x6C7569, 0x6E6520, 0x6E7472, 0x6F7220, 0x70656E, 0x726520, 0x726561, 0x727520, 0x73E320, 0x746520, 0x747275, 0x74E320, 0x756920, 0x756C20, 0xBA6920, 0xEE6E20,
729    },
730    "ro"
731  }
732};
733
/*
 * Trigram table passed to match_sbcs() by CharsetRecog_8859_5_ru::match().
 * 64 packed three-byte values, most significant byte first; keep the list
 * 64 long and ascending (assumed to feed NGramParser::search()).
 */
734static const int32_t ngrams_8859_5_ru[] = {
735    0x20D220, 0x20D2DE, 0x20D4DE, 0x20D7D0, 0x20D820, 0x20DAD0, 0x20DADE, 0x20DDD0, 0x20DDD5, 0x20DED1, 0x20DFDE, 0x20DFE0, 0x20E0D0, 0x20E1DE, 0x20E1E2, 0x20E2DE,
736    0x20E7E2, 0x20EDE2, 0xD0DDD8, 0xD0E2EC, 0xD3DE20, 0xD5DBEC, 0xD5DDD8, 0xD5E1E2, 0xD5E220, 0xD820DF, 0xD8D520, 0xD8D820, 0xD8EF20, 0xDBD5DD, 0xDBD820, 0xDBECDD,
737    0xDDD020, 0xDDD520, 0xDDD8D5, 0xDDD8EF, 0xDDDE20, 0xDDDED2, 0xDE20D2, 0xDE20DF, 0xDE20E1, 0xDED220, 0xDED2D0, 0xDED3DE, 0xDED920, 0xDEDBEC, 0xDEDC20, 0xDEE1E2,
738    0xDFDEDB, 0xDFE0D5, 0xDFE0D8, 0xDFE0DE, 0xE0D0D2, 0xE0D5D4, 0xE1E2D0, 0xE1E2D2, 0xE1E2D8, 0xE1EF20, 0xE2D5DB, 0xE2DE20, 0xE2DEE0, 0xE2EC20, 0xE7E2DE, 0xEBE520,
739};
740
/*
 * Trigram table passed to match_sbcs() by CharsetRecog_8859_6_ar::match().
 * 64 packed three-byte values, most significant byte first; keep the list
 * 64 long and ascending (assumed to feed NGramParser::search()).
 */
741static const int32_t ngrams_8859_6_ar[] = {
742    0x20C7E4, 0x20C7E6, 0x20C8C7, 0x20D9E4, 0x20E1EA, 0x20E4E4, 0x20E5E6, 0x20E8C7, 0xC720C7, 0xC7C120, 0xC7CA20, 0xC7D120, 0xC7E420, 0xC7E4C3, 0xC7E4C7, 0xC7E4C8,
743    0xC7E4CA, 0xC7E4CC, 0xC7E4CD, 0xC7E4CF, 0xC7E4D3, 0xC7E4D9, 0xC7E4E2, 0xC7E4E5, 0xC7E4E8, 0xC7E4EA, 0xC7E520, 0xC7E620, 0xC7E6CA, 0xC820C7, 0xC920C7, 0xC920E1,
744    0xC920E4, 0xC920E5, 0xC920E8, 0xCA20C7, 0xCF20C7, 0xCFC920, 0xD120C7, 0xD1C920, 0xD320C7, 0xD920C7, 0xD9E4E9, 0xE1EA20, 0xE420C7, 0xE4C920, 0xE4E920, 0xE4EA20,
745    0xE520C7, 0xE5C720, 0xE5C920, 0xE5E620, 0xE620C7, 0xE720C7, 0xE7C720, 0xE8C7E4, 0xE8E620, 0xE920C7, 0xEA20C7, 0xEA20E5, 0xEA20E8, 0xEAC920, 0xEAD120, 0xEAE620,
746};
747
/*
 * Trigram table passed to match_sbcs() by CharsetRecog_8859_7_el::match().
 * 64 packed three-byte values, most significant byte first; keep the list
 * 64 long and ascending (assumed to feed NGramParser::search()).
 */
748static const int32_t ngrams_8859_7_el[] = {
749    0x20E1ED, 0x20E1F0, 0x20E3E9, 0x20E4E9, 0x20E5F0, 0x20E720, 0x20EAE1, 0x20ECE5, 0x20EDE1, 0x20EF20, 0x20F0E1, 0x20F0EF, 0x20F0F1, 0x20F3F4, 0x20F3F5, 0x20F4E7,
750    0x20F4EF, 0xDFE120, 0xE120E1, 0xE120F4, 0xE1E920, 0xE1ED20, 0xE1F0FC, 0xE1F220, 0xE3E9E1, 0xE5E920, 0xE5F220, 0xE720F4, 0xE7ED20, 0xE7F220, 0xE920F4, 0xE9E120,
751    0xE9EADE, 0xE9F220, 0xEAE1E9, 0xEAE1F4, 0xECE520, 0xED20E1, 0xED20E5, 0xED20F0, 0xEDE120, 0xEFF220, 0xEFF520, 0xF0EFF5, 0xF0F1EF, 0xF0FC20, 0xF220E1, 0xF220E5,
752    0xF220EA, 0xF220F0, 0xF220F4, 0xF3E520, 0xF3E720, 0xF3F4EF, 0xF4E120, 0xF4E1E9, 0xF4E7ED, 0xF4E7F2, 0xF4E9EA, 0xF4EF20, 0xF4EFF5, 0xF4F9ED, 0xF9ED20, 0xFEED20,
753};
754
/*
 * Trigram table passed to match_sbcs() by CharsetRecog_8859_8_I_he::match().
 * 64 packed three-byte values, most significant byte first; keep the list
 * 64 long and ascending (assumed to feed NGramParser::search()).
 */
755static const int32_t ngrams_8859_8_I_he[] = {
756    0x20E0E5, 0x20E0E7, 0x20E0E9, 0x20E0FA, 0x20E1E9, 0x20E1EE, 0x20E4E0, 0x20E4E5, 0x20E4E9, 0x20E4EE, 0x20E4F2, 0x20E4F9, 0x20E4FA, 0x20ECE0, 0x20ECE4, 0x20EEE0,
757    0x20F2EC, 0x20F9EC, 0xE0FA20, 0xE420E0, 0xE420E1, 0xE420E4, 0xE420EC, 0xE420EE, 0xE420F9, 0xE4E5E0, 0xE5E020, 0xE5ED20, 0xE5EF20, 0xE5F820, 0xE5FA20, 0xE920E4,
758    0xE9E420, 0xE9E5FA, 0xE9E9ED, 0xE9ED20, 0xE9EF20, 0xE9F820, 0xE9FA20, 0xEC20E0, 0xEC20E4, 0xECE020, 0xECE420, 0xED20E0, 0xED20E1, 0xED20E4, 0xED20EC, 0xED20EE,
759    0xED20F9, 0xEEE420, 0xEF20E4, 0xF0E420, 0xF0E920, 0xF0E9ED, 0xF2EC20, 0xF820E4, 0xF8E9ED, 0xF9EC20, 0xFA20E0, 0xFA20E1, 0xFA20E4, 0xFA20EC, 0xFA20EE, 0xFA20F9,
760};
761
/*
 * Trigram table passed to match_sbcs() by CharsetRecog_8859_8_he::match().
 * 64 packed three-byte values, most significant byte first; keep the list
 * 64 long and ascending (assumed to feed NGramParser::search()).
 */
762static const int32_t ngrams_8859_8_he[] = {
763    0x20E0E5, 0x20E0EC, 0x20E4E9, 0x20E4EC, 0x20E4EE, 0x20E4F0, 0x20E9F0, 0x20ECF2, 0x20ECF9, 0x20EDE5, 0x20EDE9, 0x20EFE5, 0x20EFE9, 0x20F8E5, 0x20F8E9, 0x20FAE0,
764    0x20FAE5, 0x20FAE9, 0xE020E4, 0xE020EC, 0xE020ED, 0xE020FA, 0xE0E420, 0xE0E5E4, 0xE0EC20, 0xE0EE20, 0xE120E4, 0xE120ED, 0xE120FA, 0xE420E4, 0xE420E9, 0xE420EC,
765    0xE420ED, 0xE420EF, 0xE420F8, 0xE420FA, 0xE4EC20, 0xE5E020, 0xE5E420, 0xE7E020, 0xE9E020, 0xE9E120, 0xE9E420, 0xEC20E4, 0xEC20ED, 0xEC20FA, 0xECF220, 0xECF920,
766    0xEDE9E9, 0xEDE9F0, 0xEDE9F8, 0xEE20E4, 0xEE20ED, 0xEE20FA, 0xEEE120, 0xEEE420, 0xF2E420, 0xF920E4, 0xF920ED, 0xF920FA, 0xF9E420, 0xFAE020, 0xFAE420, 0xFAE5E9,
767};
768
/*
 * Trigram table passed to match_sbcs() by CharsetRecog_8859_9_tr::match().
 * 64 packed three-byte values, most significant byte first; keep the list
 * 64 long and ascending (assumed to feed NGramParser::search()).
 */
769static const int32_t ngrams_8859_9_tr[] = {
770    0x206261, 0x206269, 0x206275, 0x206461, 0x206465, 0x206765, 0x206861, 0x20696C, 0x206B61, 0x206B6F, 0x206D61, 0x206F6C, 0x207361, 0x207461, 0x207665, 0x207961,
771    0x612062, 0x616B20, 0x616C61, 0x616D61, 0x616E20, 0x616EFD, 0x617220, 0x617261, 0x6172FD, 0x6173FD, 0x617961, 0x626972, 0x646120, 0x646520, 0x646920, 0x652062,
772    0x65206B, 0x656469, 0x656E20, 0x657220, 0x657269, 0x657369, 0x696C65, 0x696E20, 0x696E69, 0x697220, 0x6C616E, 0x6C6172, 0x6C6520, 0x6C6572, 0x6E2061, 0x6E2062,
773    0x6E206B, 0x6E6461, 0x6E6465, 0x6E6520, 0x6E6920, 0x6E696E, 0x6EFD20, 0x72696E, 0x72FD6E, 0x766520, 0x796120, 0x796F72, 0xFD6E20, 0xFD6E64, 0xFD6EFD, 0xFDF0FD,
774};
775
776CharsetRecog_8859_1::~CharsetRecog_8859_1()
777{
778    // nothing to do
779}
780
781UBool CharsetRecog_8859_1::match(InputText *textIn, CharsetMatch *results) const {
782    const char *name = textIn->fC1Bytes? "windows-1252" : "ISO-8859-1";
783    uint32_t i;
784    int32_t bestConfidenceSoFar = -1;
785    for (i=0; i < ARRAY_SIZE(ngrams_8859_1) ; i++) {
786        const int32_t *ngrams = ngrams_8859_1[i].ngrams;
787        const char    *lang   = ngrams_8859_1[i].lang;
788        int32_t confidence = match_sbcs(textIn, ngrams, charMap_8859_1);
789        if (confidence > bestConfidenceSoFar) {
790            results->set(textIn, this, confidence, name, lang);
791            bestConfidenceSoFar = confidence;
792        }
793    }
794    return (bestConfidenceSoFar > 0);
795}
796
797const char *CharsetRecog_8859_1::getName() const
798{
799    return "ISO-8859-1";
800}
801
802
803CharsetRecog_8859_2::~CharsetRecog_8859_2()
804{
805    // nothing to do
806}
807
808UBool CharsetRecog_8859_2::match(InputText *textIn, CharsetMatch *results) const {
809    const char *name = textIn->fC1Bytes? "windows-1250" : "ISO-8859-2";
810    uint32_t i;
811    int32_t bestConfidenceSoFar = -1;
812    for (i=0; i < ARRAY_SIZE(ngrams_8859_2) ; i++) {
813        const int32_t *ngrams = ngrams_8859_2[i].ngrams;
814        const char    *lang   = ngrams_8859_2[i].lang;
815        int32_t confidence = match_sbcs(textIn, ngrams, charMap_8859_2);
816        if (confidence > bestConfidenceSoFar) {
817            results->set(textIn, this, confidence, name, lang);
818            bestConfidenceSoFar = confidence;
819        }
820    }
821    return (bestConfidenceSoFar > 0);
822}
823
824const char *CharsetRecog_8859_2::getName() const
825{
826    return "ISO-8859-2";
827}
828
829
830CharsetRecog_8859_5::~CharsetRecog_8859_5()
831{
832    // nothing to do
833}
834
835const char *CharsetRecog_8859_5::getName() const
836{
837    return "ISO-8859-5";
838}
839
840CharsetRecog_8859_5_ru::~CharsetRecog_8859_5_ru()
841{
842    // nothing to do
843}
844
845const char *CharsetRecog_8859_5_ru::getLanguage() const
846{
847    return "ru";
848}
849
850UBool CharsetRecog_8859_5_ru::match(InputText *textIn, CharsetMatch *results) const
851{
852    int32_t confidence = match_sbcs(textIn, ngrams_8859_5_ru, charMap_8859_5);
853    results->set(textIn, this, confidence);
854    return (confidence > 0);
855}
856
857CharsetRecog_8859_6::~CharsetRecog_8859_6()
858{
859    // nothing to do
860}
861
862const char *CharsetRecog_8859_6::getName() const
863{
864    return "ISO-8859-6";
865}
866
867CharsetRecog_8859_6_ar::~CharsetRecog_8859_6_ar()
868{
869    // nothing to do
870}
871
872const char *CharsetRecog_8859_6_ar::getLanguage() const
873{
874    return "ar";
875}
876
877UBool CharsetRecog_8859_6_ar::match(InputText *textIn, CharsetMatch *results) const
878{
879    int32_t confidence = match_sbcs(textIn, ngrams_8859_6_ar, charMap_8859_6);
880    results->set(textIn, this, confidence);
881    return (confidence > 0);
882}
883
884CharsetRecog_8859_7::~CharsetRecog_8859_7()
885{
886    // nothing to do
887}
888
889const char *CharsetRecog_8859_7::getName() const
890{
891    return "ISO-8859-7";
892}
893
894CharsetRecog_8859_7_el::~CharsetRecog_8859_7_el()
895{
896    // nothing to do
897}
898
899const char *CharsetRecog_8859_7_el::getLanguage() const
900{
901    return "el";
902}
903
904UBool CharsetRecog_8859_7_el::match(InputText *textIn, CharsetMatch *results) const
905{
906    const char *name = textIn->fC1Bytes? "windows-1253" : "ISO-8859-7";
907    int32_t confidence = match_sbcs(textIn, ngrams_8859_7_el, charMap_8859_7);
908    results->set(textIn, this, confidence, name, "el");
909    return (confidence > 0);
910}
911
912CharsetRecog_8859_8::~CharsetRecog_8859_8()
913{
914    // nothing to do
915}
916
917const char *CharsetRecog_8859_8::getName() const
918{
919    return "ISO-8859-8";
920}
921
922CharsetRecog_8859_8_I_he::~CharsetRecog_8859_8_I_he ()
923{
924    // nothing to do
925}
926
927const char *CharsetRecog_8859_8_I_he::getName() const
928{
929    return "ISO-8859-8-I";
930}
931
932const char *CharsetRecog_8859_8_I_he::getLanguage() const
933{
934    return "he";
935}
936
937UBool CharsetRecog_8859_8_I_he::match(InputText *textIn, CharsetMatch *results) const
938{
939    const char *name = textIn->fC1Bytes? "windows-1255" : "ISO-8859-8-I";
940    int32_t confidence = match_sbcs(textIn, ngrams_8859_8_I_he, charMap_8859_8);
941    results->set(textIn, this, confidence, name, "he");
942    return (confidence > 0);
943}
944
945CharsetRecog_8859_8_he::~CharsetRecog_8859_8_he()
946{
947    // od ot gnihton
948}
949
950const char *CharsetRecog_8859_8_he::getLanguage() const
951{
952    return "he";
953}
954
955UBool CharsetRecog_8859_8_he::match(InputText *textIn, CharsetMatch *results) const
956{
957    const char *name = textIn->fC1Bytes? "windows-1255" : "ISO-8859-8";
958    int32_t confidence = match_sbcs(textIn, ngrams_8859_8_he, charMap_8859_8);
959    results->set(textIn, this, confidence, name, "he");
960    return (confidence > 0);
961}
962
963CharsetRecog_8859_9::~CharsetRecog_8859_9()
964{
965    // nothing to do
966}
967
968const char *CharsetRecog_8859_9::getName() const
969{
970    return "ISO-8859-9";
971}
972
973CharsetRecog_8859_9_tr::~CharsetRecog_8859_9_tr ()
974{
975    // nothing to do
976}
977
978const char *CharsetRecog_8859_9_tr::getLanguage() const
979{
980    return "tr";
981}
982
983UBool CharsetRecog_8859_9_tr::match(InputText *textIn, CharsetMatch *results) const
984{
985    const char *name = textIn->fC1Bytes? "windows-1254" : "ISO-8859-9";
986    int32_t confidence = match_sbcs(textIn, ngrams_8859_9_tr, charMap_8859_9);
987    results->set(textIn, this, confidence, name, "tr");
988    return (confidence > 0);
989}
990
991CharsetRecog_windows_1256::~CharsetRecog_windows_1256()
992{
993    // nothing to do
994}
995
996const char *CharsetRecog_windows_1256::getName() const
997{
998    return  "windows-1256";
999}
1000
1001const char *CharsetRecog_windows_1256::getLanguage() const
1002{
1003    return "ar";
1004}
1005
1006UBool CharsetRecog_windows_1256::match(InputText *textIn, CharsetMatch *results) const
1007{
1008    int32_t confidence = match_sbcs(textIn, ngrams_windows_1256, charMap_windows_1256);
1009    results->set(textIn, this, confidence);
1010    return (confidence > 0);
1011}
1012
1013CharsetRecog_windows_1251::~CharsetRecog_windows_1251()
1014{
1015    // nothing to do
1016}
1017
1018const char *CharsetRecog_windows_1251::getName() const
1019{
1020    return  "windows-1251";
1021}
1022
1023const char *CharsetRecog_windows_1251::getLanguage() const
1024{
1025    return "ru";
1026}
1027
1028UBool CharsetRecog_windows_1251::match(InputText *textIn, CharsetMatch *results) const
1029{
1030    int32_t confidence = match_sbcs(textIn, ngrams_windows_1251, charMap_windows_1251);
1031    results->set(textIn, this, confidence);
1032    return (confidence > 0);
1033}
1034
1035CharsetRecog_KOI8_R::~CharsetRecog_KOI8_R()
1036{
1037    // nothing to do
1038}
1039
1040const char *CharsetRecog_KOI8_R::getName() const
1041{
1042    return  "KOI8-R";
1043}
1044
1045const char *CharsetRecog_KOI8_R::getLanguage() const
1046{
1047    return "ru";
1048}
1049
1050UBool CharsetRecog_KOI8_R::match(InputText *textIn, CharsetMatch *results) const
1051{
1052    int32_t confidence = match_sbcs(textIn, ngrams_KOI8_R, charMap_KOI8_R);
1053    results->set(textIn, this, confidence);
1054    return (confidence > 0);
1055}
1056
1057CharsetRecog_IBM424_he::~CharsetRecog_IBM424_he()
1058{
1059    // nothing to do
1060}
1061
1062const char *CharsetRecog_IBM424_he::getLanguage() const
1063{
1064    return "he";
1065}
1066
1067CharsetRecog_IBM424_he_rtl::~CharsetRecog_IBM424_he_rtl()
1068{
1069    // nothing to do
1070}
1071
1072const char *CharsetRecog_IBM424_he_rtl::getName() const
1073{
1074    return  "IBM424_rtl";
1075}
1076
1077UBool CharsetRecog_IBM424_he_rtl::match(InputText *textIn, CharsetMatch *results) const
1078{
1079    int32_t confidence = match_sbcs(textIn, ngrams_IBM424_he_rtl, charMap_IBM424_he);
1080    results->set(textIn, this, confidence);
1081    return (confidence > 0);
1082}
1083
1084CharsetRecog_IBM424_he_ltr::~CharsetRecog_IBM424_he_ltr()
1085{
1086    // nothing to do
1087}
1088
1089const char *CharsetRecog_IBM424_he_ltr::getName() const
1090{
1091    return  "IBM424_ltr";
1092}
1093
1094UBool CharsetRecog_IBM424_he_ltr::match(InputText *textIn, CharsetMatch *results) const
1095{
1096    int32_t confidence = match_sbcs(textIn, ngrams_IBM424_he_ltr, charMap_IBM424_he);
1097    results->set(textIn, this, confidence);
1098    return (confidence > 0);
1099}
1100
/*
 * 256-entry byte-to-byte table, indexed by a byte value;
 * CharsetRecog_IBM420_ar::unshape() replaces each buffer byte with its entry.
 * Rows are the high nibble, columns the low nibble. Most bytes map to
 * themselves; every byte in 0x00-0x3F maps to 0x40.
 * NOTE(review): the non-identity entries presumably fold positional (shaped)
 * Arabic forms onto one base form - confirm against the IBM420 code chart.
 */
1101static const uint8_t unshapeMap_IBM420[] = {
1102/*           -0    -1    -2    -3    -4    -5    -6    -7    -8    -9    -A    -B    -C    -D    -E    -F   */
1103/* 0- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
1104/* 1- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
1105/* 2- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
1106/* 3- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
1107/* 4- */    0x40, 0x40, 0x42, 0x42, 0x44, 0x45, 0x46, 0x47, 0x47, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F,
1108/* 5- */    0x50, 0x49, 0x52, 0x53, 0x54, 0x55, 0x56, 0x56, 0x58, 0x58, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,
1109/* 6- */    0x60, 0x61, 0x62, 0x63, 0x63, 0x65, 0x65, 0x67, 0x67, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
1110/* 7- */    0x69, 0x71, 0x71, 0x73, 0x74, 0x75, 0x76, 0x77, 0x77, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F,
1111/* 8- */    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x80, 0x8B, 0x8B, 0x8D, 0x8D, 0x8F,
1112/* 9- */    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9A, 0x9A, 0x9A, 0x9E, 0x9E,
1113/* A- */    0x9E, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x9E, 0xAB, 0xAB, 0xAD, 0xAD, 0xAF,
1114/* B- */    0xAF, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xB1, 0xBB, 0xBB, 0xBD, 0xBD, 0xBF,
1115/* C- */    0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xBF, 0xCC, 0xBF, 0xCE, 0xCF,
1116/* D- */    0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDA, 0xDC, 0xDC, 0xDC, 0xDF,
1117/* E- */    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
1118/* F- */    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
1119};
1120
1121CharsetRecog_IBM420_ar::~CharsetRecog_IBM420_ar()
1122{
1123    // nothing to do
1124}
1125
1126const char *CharsetRecog_IBM420_ar::getLanguage() const
1127{
1128    return "ar";
1129}
1130
1131void CharsetRecog_IBM420_ar::matchInit(InputText *textIn) {
1132    prev_fInputBytesLength = textIn->fInputLen;
1133    prev_fInputBytes = textIn->fInputBytes;
1134
1135    int32_t length = 0;
1136    uint8_t *bb = unshape(prev_fInputBytes, prev_fInputBytesLength, length);
1137
1138    if (bb != NULL) {
1139        textIn->fInputBytes = bb;
1140        textIn->fInputLen = length;
1141
1142        deleteBuffer = TRUE;
1143    } else {
1144        deleteBuffer = FALSE;
1145    }
1146}
1147
1148uint8_t *CharsetRecog_IBM420_ar::unshape(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length) {
1149    uint8_t *resultArray = unshapeLamAlef(inputBytes, inputBytesLength, length);
1150
1151    if (resultArray != NULL) {
1152        for (int32_t i = 0; i < inputBytesLength; i++) {
1153            resultArray[i] = unshapeMap_IBM420[resultArray[i]];
1154        }
1155    }
1156
1157    return resultArray;
1158}
1159
1160uint8_t *CharsetRecog_IBM420_ar::unshapeLamAlef(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length) {
1161    int32_t bigBufferLength = inputBytesLength * 2;
1162    uint8_t *bigBuffer = (uint8_t *)uprv_malloc(bigBufferLength);
1163    uint8_t *resultBuffer = NULL;
1164
1165    if (bigBuffer != NULL) {
1166        int32_t bufferIndex;
1167        static const uint8_t unshapedLamAlef[] = { 0xb1, 0x56 };
1168
1169        for (int32_t i = bufferIndex = 0; i < inputBytesLength; i++) {
1170            if (isLamAlef(inputBytes[i])) {
1171                bigBuffer[bufferIndex++] = unshapedLamAlef[0];
1172                bigBuffer[bufferIndex++] = unshapedLamAlef[1];
1173            } else {
1174                bigBuffer[bufferIndex++] = inputBytes[i];
1175            }
1176        }
1177
1178        length = bufferIndex;
1179        resultBuffer = (uint8_t *)uprv_malloc(length);
1180        if (resultBuffer != NULL) {
1181            uprv_memcpy(resultBuffer, bigBuffer, length);
1182        }
1183    }
1184
1185    if (bigBuffer != NULL) {
1186        uprv_free(bigBuffer);
1187    }
1188
1189    return resultBuffer;
1190}
1191
1192void CharsetRecog_IBM420_ar::matchFinish(InputText *textIn) {
1193    if (deleteBuffer) {
1194        uprv_free(textIn->fInputBytes);
1195
1196        textIn->fInputBytes = prev_fInputBytes;
1197        textIn->fInputLen = prev_fInputBytesLength;
1198    }
1199}
1200
1201UBool CharsetRecog_IBM420_ar::isLamAlef(uint8_t b) {
1202    static const uint8_t shapedLamAlef[] = {
1203        0xb2, 0xb3, 0xb4, 0xb5, 0xb7, 0xb8
1204    };
1205
1206    for (uint32_t i = 0; i < sizeof(shapedLamAlef); i++) {
1207        if (b == shapedLamAlef[i]) {
1208            return TRUE;
1209        }
1210    }
1211
1212    return FALSE;
1213}
1214
1215CharsetRecog_IBM420_ar_rtl::~CharsetRecog_IBM420_ar_rtl()
1216{
1217    // nothing to do
1218}
1219
1220const char *CharsetRecog_IBM420_ar_rtl::getName() const
1221{
1222    return  "IBM420_rtl";
1223}
1224
1225UBool CharsetRecog_IBM420_ar_rtl::match(InputText *textIn, CharsetMatch *results) const
1226{
1227    int32_t confidence = match_sbcs(textIn, ngrams_IBM420_ar_rtl, charMap_IBM420_ar);
1228    results->set(textIn, this, confidence);
1229    return (confidence > 0);
1230}
1231
1232CharsetRecog_IBM420_ar_ltr::~CharsetRecog_IBM420_ar_ltr()
1233{
1234    // nothing to do
1235}
1236
1237const char *CharsetRecog_IBM420_ar_ltr::getName() const
1238{
1239    return  "IBM420_ltr";
1240}
1241
1242UBool CharsetRecog_IBM420_ar_ltr::match(InputText *textIn, CharsetMatch *results) const
1243{
1244    int32_t confidence = match_sbcs(textIn, ngrams_IBM420_ar_ltr, charMap_IBM420_ar);
1245    results->set(textIn, this, confidence);
1246    return (confidence > 0);
1247}
1248
1249U_NAMESPACE_END
1250#endif
1251
1252