// csrsbcs.h revision 1b7d32f919554dda9c193b32188251337bc756f1
1/* 2 ********************************************************************** 3 * Copyright (C) 2005-2015, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 */ 7 8#ifndef __CSRSBCS_H 9#define __CSRSBCS_H 10 11#include "unicode/uobject.h" 12 13#if !UCONFIG_NO_CONVERSION 14 15#include "csrecog.h" 16 17U_NAMESPACE_BEGIN 18 19class NGramParser : public UMemory 20{ 21private: 22 int32_t ngram; 23 const int32_t *ngramList; 24 25 int32_t ngramCount; 26 int32_t hitCount; 27 28protected: 29 int32_t byteIndex; 30 const uint8_t *charMap; 31 32 void addByte(int32_t b); 33 34public: 35 NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap); 36 virtual ~NGramParser(); 37 38private: 39 /* 40 * Binary search for value in table, which must have exactly 64 entries. 41 */ 42 int32_t search(const int32_t *table, int32_t value); 43 44 void lookup(int32_t thisNgram); 45 46 virtual int32_t nextByte(InputText *det); 47 virtual void parseCharacters(InputText *det); 48 49public: 50 int32_t parse(InputText *det); 51 52}; 53 54#if !UCONFIG_ONLY_HTML_CONVERSION 55class NGramParser_IBM420 : public NGramParser 56{ 57private: 58 int32_t alef; 59 int32_t isLamAlef(int32_t b); 60 int32_t nextByte(InputText *det); 61 void parseCharacters(InputText *det); 62 63public: 64 NGramParser_IBM420(const int32_t *theNgramList, const uint8_t *theCharMap); 65}; 66#endif 67 68 69class CharsetRecog_sbcs : public CharsetRecognizer 70{ 71public: 72 CharsetRecog_sbcs(); 73 virtual ~CharsetRecog_sbcs(); 74 virtual const char *getName() const = 0; 75 virtual UBool match(InputText *det, CharsetMatch *results) const = 0; 76 virtual int32_t match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t charMap[]) const; 77}; 78 79class CharsetRecog_8859_1 : public CharsetRecog_sbcs 80{ 81public: 82 virtual ~CharsetRecog_8859_1(); 83 const char *getName() const; 84 virtual UBool match(InputText *det, 
CharsetMatch *results) const; 85}; 86 87class CharsetRecog_8859_2 : public CharsetRecog_sbcs 88{ 89public: 90 virtual ~CharsetRecog_8859_2(); 91 const char *getName() const; 92 virtual UBool match(InputText *det, CharsetMatch *results) const; 93}; 94 95class CharsetRecog_8859_5 : public CharsetRecog_sbcs 96{ 97public: 98 virtual ~CharsetRecog_8859_5(); 99 const char *getName() const; 100}; 101 102class CharsetRecog_8859_6 : public CharsetRecog_sbcs 103{ 104public: 105 virtual ~CharsetRecog_8859_6(); 106 107 const char *getName() const; 108}; 109 110class CharsetRecog_8859_7 : public CharsetRecog_sbcs 111{ 112public: 113 virtual ~CharsetRecog_8859_7(); 114 115 const char *getName() const; 116}; 117 118class CharsetRecog_8859_8 : public CharsetRecog_sbcs 119{ 120public: 121 virtual ~CharsetRecog_8859_8(); 122 123 virtual const char *getName() const; 124}; 125 126class CharsetRecog_8859_9 : public CharsetRecog_sbcs 127{ 128public: 129 virtual ~CharsetRecog_8859_9(); 130 131 const char *getName() const; 132}; 133 134 135 136class CharsetRecog_8859_5_ru : public CharsetRecog_8859_5 137{ 138public: 139 virtual ~CharsetRecog_8859_5_ru(); 140 141 const char *getLanguage() const; 142 143 virtual UBool match(InputText *det, CharsetMatch *results) const; 144}; 145 146class CharsetRecog_8859_6_ar : public CharsetRecog_8859_6 147{ 148public: 149 virtual ~CharsetRecog_8859_6_ar(); 150 151 const char *getLanguage() const; 152 153 virtual UBool match(InputText *det, CharsetMatch *results) const; 154}; 155 156class CharsetRecog_8859_7_el : public CharsetRecog_8859_7 157{ 158public: 159 virtual ~CharsetRecog_8859_7_el(); 160 161 const char *getLanguage() const; 162 163 virtual UBool match(InputText *det, CharsetMatch *results) const; 164}; 165 166class CharsetRecog_8859_8_I_he : public CharsetRecog_8859_8 167{ 168public: 169 virtual ~CharsetRecog_8859_8_I_he(); 170 171 const char *getName() const; 172 173 const char *getLanguage() const; 174 175 virtual UBool match(InputText *det, 
CharsetMatch *results) const; 176}; 177 178class CharsetRecog_8859_8_he : public CharsetRecog_8859_8 179{ 180public: 181 virtual ~CharsetRecog_8859_8_he (); 182 183 const char *getLanguage() const; 184 185 virtual UBool match(InputText *det, CharsetMatch *results) const; 186}; 187 188class CharsetRecog_8859_9_tr : public CharsetRecog_8859_9 189{ 190public: 191 virtual ~CharsetRecog_8859_9_tr (); 192 193 const char *getLanguage() const; 194 195 virtual UBool match(InputText *det, CharsetMatch *results) const; 196}; 197 198class CharsetRecog_windows_1256 : public CharsetRecog_sbcs 199{ 200public: 201 virtual ~CharsetRecog_windows_1256(); 202 203 const char *getName() const; 204 205 const char *getLanguage() const; 206 207 virtual UBool match(InputText *det, CharsetMatch *results) const; 208}; 209 210class CharsetRecog_windows_1251 : public CharsetRecog_sbcs 211{ 212public: 213 virtual ~CharsetRecog_windows_1251(); 214 215 const char *getName() const; 216 217 const char *getLanguage() const; 218 219 virtual UBool match(InputText *det, CharsetMatch *results) const; 220}; 221 222 223class CharsetRecog_KOI8_R : public CharsetRecog_sbcs 224{ 225public: 226 virtual ~CharsetRecog_KOI8_R(); 227 228 const char *getName() const; 229 230 const char *getLanguage() const; 231 232 virtual UBool match(InputText *det, CharsetMatch *results) const; 233}; 234 235#if !UCONFIG_ONLY_HTML_CONVERSION 236class CharsetRecog_IBM424_he : public CharsetRecog_sbcs 237{ 238public: 239 virtual ~CharsetRecog_IBM424_he(); 240 241 const char *getLanguage() const; 242}; 243 244class CharsetRecog_IBM424_he_rtl : public CharsetRecog_IBM424_he { 245public: 246 virtual ~CharsetRecog_IBM424_he_rtl(); 247 248 const char *getName() const; 249 250 virtual UBool match(InputText *det, CharsetMatch *results) const; 251}; 252 253class CharsetRecog_IBM424_he_ltr : public CharsetRecog_IBM424_he { 254 virtual ~CharsetRecog_IBM424_he_ltr(); 255 256 const char *getName() const; 257 258 virtual UBool match(InputText 
*det, CharsetMatch *results) const; 259}; 260 261class CharsetRecog_IBM420_ar : public CharsetRecog_sbcs 262{ 263public: 264 virtual ~CharsetRecog_IBM420_ar(); 265 266 const char *getLanguage() const; 267 int32_t match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t charMap[]) const; 268 269}; 270 271class CharsetRecog_IBM420_ar_rtl : public CharsetRecog_IBM420_ar { 272public: 273 virtual ~CharsetRecog_IBM420_ar_rtl(); 274 275 const char *getName() const; 276 277 virtual UBool match(InputText *det, CharsetMatch *results) const; 278}; 279 280class CharsetRecog_IBM420_ar_ltr : public CharsetRecog_IBM420_ar { 281 virtual ~CharsetRecog_IBM420_ar_ltr(); 282 283 const char *getName() const; 284 285 virtual UBool match(InputText *det, CharsetMatch *results) const; 286}; 287#endif 288 289U_NAMESPACE_END 290 291#endif /* !UCONFIG_NO_CONVERSION */ 292#endif /* __CSRSBCS_H */ 293