1/*
2 **********************************************************************
3 *   Copyright (C) 2005-2012, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 **********************************************************************
6 */
7
8#ifndef __CSRSBCS_H
9#define __CSRSBCS_H
10
11#include "unicode/uobject.h"
12
13#if !UCONFIG_NO_CONVERSION
14
15#include "csrecog.h"
16
17U_NAMESPACE_BEGIN
18
19class NGramParser : public UMemory
20{
21private:
22    int32_t byteIndex;
23    int32_t ngram;
24
25    const int32_t *ngramList;
26    const uint8_t *charMap;
27
28    int32_t ngramCount;
29    int32_t hitCount;
30
31public:
32    NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap);
33
34private:
35    /*
36    * Binary search for value in table, which must have exactly 64 entries.
37    */
38    int32_t search(const int32_t *table, int32_t value);
39
40    void lookup(int32_t thisNgram);
41    void addByte(int32_t b);
42    int32_t nextByte(InputText *det);
43
44public:
45    int32_t parse(InputText *det);
46
47};
48
49
50class CharsetRecog_sbcs : public CharsetRecognizer
51{
52public:
53    CharsetRecog_sbcs();
54    virtual ~CharsetRecog_sbcs();
55    virtual const char *getName() const = 0;
56    virtual UBool match(InputText *det, CharsetMatch *results) const = 0;
57    virtual int32_t match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t charMap[]) const;
58};
59
60class CharsetRecog_8859_1 : public CharsetRecog_sbcs
61{
62public:
63    virtual ~CharsetRecog_8859_1();
64    const char *getName() const;
65    virtual UBool match(InputText *det, CharsetMatch *results) const;
66};
67
68class CharsetRecog_8859_2 : public CharsetRecog_sbcs
69{
70public:
71    virtual ~CharsetRecog_8859_2();
72    const char *getName() const;
73    virtual UBool match(InputText *det, CharsetMatch *results) const;
74};
75
76class CharsetRecog_8859_5 : public CharsetRecog_sbcs
77{
78public:
79    virtual ~CharsetRecog_8859_5();
80    const char *getName() const;
81};
82
83class CharsetRecog_8859_6 : public CharsetRecog_sbcs
84{
85public:
86    virtual ~CharsetRecog_8859_6();
87
88    const char *getName() const;
89};
90
91class CharsetRecog_8859_7 : public CharsetRecog_sbcs
92{
93public:
94    virtual ~CharsetRecog_8859_7();
95
96    const char *getName() const;
97};
98
99class CharsetRecog_8859_8 : public CharsetRecog_sbcs
100{
101public:
102    virtual ~CharsetRecog_8859_8();
103
104    virtual const char *getName() const;
105};
106
107class CharsetRecog_8859_9 : public CharsetRecog_sbcs
108{
109public:
110    virtual ~CharsetRecog_8859_9();
111
112    const char *getName() const;
113};
114
115
116
117class CharsetRecog_8859_5_ru : public CharsetRecog_8859_5
118{
119public:
120    virtual ~CharsetRecog_8859_5_ru();
121
122    const char *getLanguage() const;
123
124    virtual UBool match(InputText *det, CharsetMatch *results) const;
125};
126
127class CharsetRecog_8859_6_ar : public CharsetRecog_8859_6
128{
129public:
130    virtual ~CharsetRecog_8859_6_ar();
131
132    const char *getLanguage() const;
133
134    virtual UBool match(InputText *det, CharsetMatch *results) const;
135};
136
137class CharsetRecog_8859_7_el : public CharsetRecog_8859_7
138{
139public:
140    virtual ~CharsetRecog_8859_7_el();
141
142    const char *getLanguage() const;
143
144    virtual UBool match(InputText *det, CharsetMatch *results) const;
145};
146
147class CharsetRecog_8859_8_I_he : public CharsetRecog_8859_8
148{
149public:
150    virtual ~CharsetRecog_8859_8_I_he();
151
152    const char *getName() const;
153
154    const char *getLanguage() const;
155
156    virtual UBool match(InputText *det, CharsetMatch *results) const;
157};
158
159class CharsetRecog_8859_8_he : public CharsetRecog_8859_8
160{
161public:
162    virtual ~CharsetRecog_8859_8_he ();
163
164    const char *getLanguage() const;
165
166    virtual UBool match(InputText *det, CharsetMatch *results) const;
167};
168
169class CharsetRecog_8859_9_tr : public CharsetRecog_8859_9
170{
171public:
172    virtual ~CharsetRecog_8859_9_tr ();
173
174    const char *getLanguage() const;
175
176    virtual UBool match(InputText *det, CharsetMatch *results) const;
177};
178
179class CharsetRecog_windows_1256 : public CharsetRecog_sbcs
180{
181public:
182    virtual ~CharsetRecog_windows_1256();
183
184    const char *getName() const;
185
186    const char *getLanguage() const;
187
188    virtual UBool match(InputText *det, CharsetMatch *results) const;
189};
190
191class CharsetRecog_windows_1251 : public CharsetRecog_sbcs
192{
193public:
194    virtual ~CharsetRecog_windows_1251();
195
196    const char *getName() const;
197
198    const char *getLanguage() const;
199
200    virtual UBool match(InputText *det, CharsetMatch *results) const;
201};
202
203
204class CharsetRecog_KOI8_R : public CharsetRecog_sbcs
205{
206public:
207    virtual ~CharsetRecog_KOI8_R();
208
209    const char *getName() const;
210
211    const char *getLanguage() const;
212
213    virtual UBool match(InputText *det, CharsetMatch *results) const;
214};
215
216class CharsetRecog_IBM424_he : public CharsetRecog_sbcs
217{
218public:
219    virtual ~CharsetRecog_IBM424_he();
220
221    const char *getLanguage() const;
222};
223
224class CharsetRecog_IBM424_he_rtl : public CharsetRecog_IBM424_he {
225public:
226    virtual ~CharsetRecog_IBM424_he_rtl();
227
228    const char *getName() const;
229
230    virtual UBool match(InputText *det, CharsetMatch *results) const;
231};
232
233class CharsetRecog_IBM424_he_ltr : public CharsetRecog_IBM424_he {
234    virtual ~CharsetRecog_IBM424_he_ltr();
235
236    const char *getName() const;
237
238    virtual UBool match(InputText *det, CharsetMatch *results) const;
239};
240
241class CharsetRecog_IBM420_ar : public CharsetRecog_sbcs
242{
243public:
244    virtual ~CharsetRecog_IBM420_ar();
245
246    const char *getLanguage() const;
247
248protected:
249    void matchInit(InputText *textIn);
250    void matchFinish(InputText *textIn);
251
252private:
253    uint8_t *prev_fInputBytes;
254    int32_t prev_fInputBytesLength;
255    UBool deleteBuffer;
256
257    UBool isLamAlef(uint8_t b);
258    uint8_t *unshapeLamAlef(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length);
259    uint8_t *unshape(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length);
260};
261
262class CharsetRecog_IBM420_ar_rtl : public CharsetRecog_IBM420_ar {
263public:
264    virtual ~CharsetRecog_IBM420_ar_rtl();
265
266    const char *getName() const;
267
268    virtual UBool match(InputText *det, CharsetMatch *results) const;
269};
270
271class CharsetRecog_IBM420_ar_ltr : public CharsetRecog_IBM420_ar {
272    virtual ~CharsetRecog_IBM420_ar_ltr();
273
274    const char *getName() const;
275
276    virtual UBool match(InputText *det, CharsetMatch *results) const;
277};
278
279U_NAMESPACE_END
280
281#endif /* !UCONFIG_NO_CONVERSION */
282#endif /* __CSRSBCS_H */
283