1/* 2 ********************************************************************** 3 * Copyright (C) 2005-2012, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 */ 7 8#ifndef __CSRSBCS_H 9#define __CSRSBCS_H 10 11#include "unicode/uobject.h" 12 13#if !UCONFIG_NO_CONVERSION 14 15#include "csrecog.h" 16 17U_NAMESPACE_BEGIN 18 19class NGramParser : public UMemory 20{ 21private: 22 int32_t byteIndex; 23 int32_t ngram; 24 25 const int32_t *ngramList; 26 const uint8_t *charMap; 27 28 int32_t ngramCount; 29 int32_t hitCount; 30 31public: 32 NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap); 33 34private: 35 /* 36 * Binary search for value in table, which must have exactly 64 entries. 37 */ 38 int32_t search(const int32_t *table, int32_t value); 39 40 void lookup(int32_t thisNgram); 41 void addByte(int32_t b); 42 int32_t nextByte(InputText *det); 43 44public: 45 int32_t parse(InputText *det); 46 47}; 48 49 50class CharsetRecog_sbcs : public CharsetRecognizer 51{ 52public: 53 CharsetRecog_sbcs(); 54 virtual ~CharsetRecog_sbcs(); 55 virtual const char *getName() const = 0; 56 virtual UBool match(InputText *det, CharsetMatch *results) const = 0; 57 virtual int32_t match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t charMap[]) const; 58}; 59 60class CharsetRecog_8859_1 : public CharsetRecog_sbcs 61{ 62public: 63 virtual ~CharsetRecog_8859_1(); 64 const char *getName() const; 65 virtual UBool match(InputText *det, CharsetMatch *results) const; 66}; 67 68class CharsetRecog_8859_2 : public CharsetRecog_sbcs 69{ 70public: 71 virtual ~CharsetRecog_8859_2(); 72 const char *getName() const; 73 virtual UBool match(InputText *det, CharsetMatch *results) const; 74}; 75 76class CharsetRecog_8859_5 : public CharsetRecog_sbcs 77{ 78public: 79 virtual ~CharsetRecog_8859_5(); 80 const char *getName() const; 81}; 82 83class CharsetRecog_8859_6 : public CharsetRecog_sbcs 84{ 85public: 86 virtual ~CharsetRecog_8859_6(); 87 88 const char *getName() const; 89}; 90 91class CharsetRecog_8859_7 : public CharsetRecog_sbcs 92{ 93public: 94 virtual ~CharsetRecog_8859_7(); 95 96 const char *getName() const; 97}; 98 99class CharsetRecog_8859_8 : public CharsetRecog_sbcs 100{ 101public: 102 virtual ~CharsetRecog_8859_8(); 103 104 virtual const char *getName() const; 105}; 106 107class CharsetRecog_8859_9 : public CharsetRecog_sbcs 108{ 109public: 110 virtual ~CharsetRecog_8859_9(); 111 112 const char *getName() const; 113}; 114 115 116 117class CharsetRecog_8859_5_ru : public CharsetRecog_8859_5 118{ 119public: 120 virtual ~CharsetRecog_8859_5_ru(); 121 122 const char *getLanguage() const; 123 124 virtual UBool match(InputText *det, CharsetMatch *results) const; 125}; 126 127class CharsetRecog_8859_6_ar : public CharsetRecog_8859_6 128{ 129public: 130 virtual ~CharsetRecog_8859_6_ar(); 131 132 const char *getLanguage() const; 133 134 virtual UBool match(InputText *det, CharsetMatch *results) const; 135}; 136 137class CharsetRecog_8859_7_el : public CharsetRecog_8859_7 138{ 139public: 140 virtual ~CharsetRecog_8859_7_el(); 141 142 const char *getLanguage() const; 143 144 virtual UBool match(InputText *det, CharsetMatch *results) const; 145}; 146 147class CharsetRecog_8859_8_I_he : public CharsetRecog_8859_8 148{ 149public: 150 virtual ~CharsetRecog_8859_8_I_he(); 151 152 const char *getName() const; 153 154 const char *getLanguage() const; 155 156 virtual UBool match(InputText *det, CharsetMatch *results) const; 157}; 158 159class CharsetRecog_8859_8_he : public CharsetRecog_8859_8 160{ 161public: 162 virtual ~CharsetRecog_8859_8_he (); 163 164 const char *getLanguage() const; 165 166 virtual UBool match(InputText *det, CharsetMatch *results) const; 167}; 168 169class CharsetRecog_8859_9_tr : public CharsetRecog_8859_9 170{ 171public: 172 virtual ~CharsetRecog_8859_9_tr (); 173 174 const char *getLanguage() const; 175 176 virtual UBool match(InputText *det, CharsetMatch *results) const; 177}; 178 179class CharsetRecog_windows_1256 : public CharsetRecog_sbcs 180{ 181public: 182 virtual ~CharsetRecog_windows_1256(); 183 184 const char *getName() const; 185 186 const char *getLanguage() const; 187 188 virtual UBool match(InputText *det, CharsetMatch *results) const; 189}; 190 191class CharsetRecog_windows_1251 : public CharsetRecog_sbcs 192{ 193public: 194 virtual ~CharsetRecog_windows_1251(); 195 196 const char *getName() const; 197 198 const char *getLanguage() const; 199 200 virtual UBool match(InputText *det, CharsetMatch *results) const; 201}; 202 203 204class CharsetRecog_KOI8_R : public CharsetRecog_sbcs 205{ 206public: 207 virtual ~CharsetRecog_KOI8_R(); 208 209 const char *getName() const; 210 211 const char *getLanguage() const; 212 213 virtual UBool match(InputText *det, CharsetMatch *results) const; 214}; 215 216class CharsetRecog_IBM424_he : public CharsetRecog_sbcs 217{ 218public: 219 virtual ~CharsetRecog_IBM424_he(); 220 221 const char *getLanguage() const; 222}; 223 224class CharsetRecog_IBM424_he_rtl : public CharsetRecog_IBM424_he { 225public: 226 virtual ~CharsetRecog_IBM424_he_rtl(); 227 228 const char *getName() const; 229 230 virtual UBool match(InputText *det, CharsetMatch *results) const; 231}; 232 233class CharsetRecog_IBM424_he_ltr : public CharsetRecog_IBM424_he { 234 virtual ~CharsetRecog_IBM424_he_ltr(); 235 236 const char *getName() const; 237 238 virtual UBool match(InputText *det, CharsetMatch *results) const; 239}; 240 241class CharsetRecog_IBM420_ar : public CharsetRecog_sbcs 242{ 243public: 244 virtual ~CharsetRecog_IBM420_ar(); 245 246 const char *getLanguage() const; 247 248protected: 249 void matchInit(InputText *textIn); 250 void matchFinish(InputText *textIn); 251 252private: 253 uint8_t *prev_fInputBytes; 254 int32_t prev_fInputBytesLength; 255 UBool deleteBuffer; 256 257 UBool isLamAlef(uint8_t b); 258 uint8_t *unshapeLamAlef(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length); 259 uint8_t *unshape(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length); 260}; 261 262class CharsetRecog_IBM420_ar_rtl : public CharsetRecog_IBM420_ar { 263public: 264 virtual ~CharsetRecog_IBM420_ar_rtl(); 265 266 const char *getName() const; 267 268 virtual UBool match(InputText *det, CharsetMatch *results) const; 269}; 270 271class CharsetRecog_IBM420_ar_ltr : public CharsetRecog_IBM420_ar { 272 virtual ~CharsetRecog_IBM420_ar_ltr(); 273 274 const char *getName() const; 275 276 virtual UBool match(InputText *det, CharsetMatch *results) const; 277}; 278 279U_NAMESPACE_END 280 281#endif /* !UCONFIG_NO_CONVERSION */ 282#endif /* __CSRSBCS_H */ 283