csrsbcs.h revision 1b7d32f919554dda9c193b32188251337bc756f1
1/*
2 **********************************************************************
3 *   Copyright (C) 2005-2015, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 **********************************************************************
6 */
7
8#ifndef __CSRSBCS_H
9#define __CSRSBCS_H
10
11#include "unicode/uobject.h"
12
13#if !UCONFIG_NO_CONVERSION
14
15#include "csrecog.h"
16
17U_NAMESPACE_BEGIN
18
19class NGramParser : public UMemory
20{
21private:
22    int32_t ngram;
23    const int32_t *ngramList;
24
25    int32_t ngramCount;
26    int32_t hitCount;
27
28protected:
29	int32_t byteIndex;
30    const uint8_t *charMap;
31
32	void addByte(int32_t b);
33
34public:
35    NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap);
36    virtual ~NGramParser();
37
38private:
39    /*
40    * Binary search for value in table, which must have exactly 64 entries.
41    */
42    int32_t search(const int32_t *table, int32_t value);
43
44    void lookup(int32_t thisNgram);
45
46    virtual int32_t nextByte(InputText *det);
47	virtual void parseCharacters(InputText *det);
48
49public:
50    int32_t parse(InputText *det);
51
52};
53
54#if !UCONFIG_ONLY_HTML_CONVERSION
55class NGramParser_IBM420 : public NGramParser
56{
57private:
58	int32_t alef;
59	int32_t isLamAlef(int32_t b);
60	int32_t nextByte(InputText *det);
61	void parseCharacters(InputText *det);
62
63public:
64    NGramParser_IBM420(const int32_t *theNgramList, const uint8_t *theCharMap);
65};
66#endif
67
68
69class CharsetRecog_sbcs : public CharsetRecognizer
70{
71public:
72    CharsetRecog_sbcs();
73    virtual ~CharsetRecog_sbcs();
74    virtual const char *getName() const = 0;
75    virtual UBool match(InputText *det, CharsetMatch *results) const = 0;
76    virtual int32_t match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t charMap[]) const;
77};
78
79class CharsetRecog_8859_1 : public CharsetRecog_sbcs
80{
81public:
82    virtual ~CharsetRecog_8859_1();
83    const char *getName() const;
84    virtual UBool match(InputText *det, CharsetMatch *results) const;
85};
86
87class CharsetRecog_8859_2 : public CharsetRecog_sbcs
88{
89public:
90    virtual ~CharsetRecog_8859_2();
91    const char *getName() const;
92    virtual UBool match(InputText *det, CharsetMatch *results) const;
93};
94
95class CharsetRecog_8859_5 : public CharsetRecog_sbcs
96{
97public:
98    virtual ~CharsetRecog_8859_5();
99    const char *getName() const;
100};
101
102class CharsetRecog_8859_6 : public CharsetRecog_sbcs
103{
104public:
105    virtual ~CharsetRecog_8859_6();
106
107    const char *getName() const;
108};
109
110class CharsetRecog_8859_7 : public CharsetRecog_sbcs
111{
112public:
113    virtual ~CharsetRecog_8859_7();
114
115    const char *getName() const;
116};
117
118class CharsetRecog_8859_8 : public CharsetRecog_sbcs
119{
120public:
121    virtual ~CharsetRecog_8859_8();
122
123    virtual const char *getName() const;
124};
125
126class CharsetRecog_8859_9 : public CharsetRecog_sbcs
127{
128public:
129    virtual ~CharsetRecog_8859_9();
130
131    const char *getName() const;
132};
133
134
135
136class CharsetRecog_8859_5_ru : public CharsetRecog_8859_5
137{
138public:
139    virtual ~CharsetRecog_8859_5_ru();
140
141    const char *getLanguage() const;
142
143    virtual UBool match(InputText *det, CharsetMatch *results) const;
144};
145
146class CharsetRecog_8859_6_ar : public CharsetRecog_8859_6
147{
148public:
149    virtual ~CharsetRecog_8859_6_ar();
150
151    const char *getLanguage() const;
152
153    virtual UBool match(InputText *det, CharsetMatch *results) const;
154};
155
156class CharsetRecog_8859_7_el : public CharsetRecog_8859_7
157{
158public:
159    virtual ~CharsetRecog_8859_7_el();
160
161    const char *getLanguage() const;
162
163    virtual UBool match(InputText *det, CharsetMatch *results) const;
164};
165
166class CharsetRecog_8859_8_I_he : public CharsetRecog_8859_8
167{
168public:
169    virtual ~CharsetRecog_8859_8_I_he();
170
171    const char *getName() const;
172
173    const char *getLanguage() const;
174
175    virtual UBool match(InputText *det, CharsetMatch *results) const;
176};
177
178class CharsetRecog_8859_8_he : public CharsetRecog_8859_8
179{
180public:
181    virtual ~CharsetRecog_8859_8_he ();
182
183    const char *getLanguage() const;
184
185    virtual UBool match(InputText *det, CharsetMatch *results) const;
186};
187
188class CharsetRecog_8859_9_tr : public CharsetRecog_8859_9
189{
190public:
191    virtual ~CharsetRecog_8859_9_tr ();
192
193    const char *getLanguage() const;
194
195    virtual UBool match(InputText *det, CharsetMatch *results) const;
196};
197
198class CharsetRecog_windows_1256 : public CharsetRecog_sbcs
199{
200public:
201    virtual ~CharsetRecog_windows_1256();
202
203    const char *getName() const;
204
205    const char *getLanguage() const;
206
207    virtual UBool match(InputText *det, CharsetMatch *results) const;
208};
209
210class CharsetRecog_windows_1251 : public CharsetRecog_sbcs
211{
212public:
213    virtual ~CharsetRecog_windows_1251();
214
215    const char *getName() const;
216
217    const char *getLanguage() const;
218
219    virtual UBool match(InputText *det, CharsetMatch *results) const;
220};
221
222
223class CharsetRecog_KOI8_R : public CharsetRecog_sbcs
224{
225public:
226    virtual ~CharsetRecog_KOI8_R();
227
228    const char *getName() const;
229
230    const char *getLanguage() const;
231
232    virtual UBool match(InputText *det, CharsetMatch *results) const;
233};
234
235#if !UCONFIG_ONLY_HTML_CONVERSION
236class CharsetRecog_IBM424_he : public CharsetRecog_sbcs
237{
238public:
239    virtual ~CharsetRecog_IBM424_he();
240
241    const char *getLanguage() const;
242};
243
244class CharsetRecog_IBM424_he_rtl : public CharsetRecog_IBM424_he {
245public:
246    virtual ~CharsetRecog_IBM424_he_rtl();
247
248    const char *getName() const;
249
250    virtual UBool match(InputText *det, CharsetMatch *results) const;
251};
252
253class CharsetRecog_IBM424_he_ltr : public CharsetRecog_IBM424_he {
254    virtual ~CharsetRecog_IBM424_he_ltr();
255
256    const char *getName() const;
257
258    virtual UBool match(InputText *det, CharsetMatch *results) const;
259};
260
261class CharsetRecog_IBM420_ar : public CharsetRecog_sbcs
262{
263public:
264    virtual ~CharsetRecog_IBM420_ar();
265
266    const char *getLanguage() const;
267	int32_t match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t charMap[]) const;
268
269};
270
271class CharsetRecog_IBM420_ar_rtl : public CharsetRecog_IBM420_ar {
272public:
273    virtual ~CharsetRecog_IBM420_ar_rtl();
274
275    const char *getName() const;
276
277    virtual UBool match(InputText *det, CharsetMatch *results) const;
278};
279
280class CharsetRecog_IBM420_ar_ltr : public CharsetRecog_IBM420_ar {
281    virtual ~CharsetRecog_IBM420_ar_ltr();
282
283    const char *getName() const;
284
285    virtual UBool match(InputText *det, CharsetMatch *results) const;
286};
287#endif
288
289U_NAMESPACE_END
290
291#endif /* !UCONFIG_NO_CONVERSION */
292#endif /* __CSRSBCS_H */
293