1ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 2ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ********************************************************************** 354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * Copyright (C) 2005-2012, International Business Machines 4ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Corporation and others. All Rights Reserved. 5ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ********************************************************************** 6ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 7ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 8ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#ifndef __CSRMBCS_H 9ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define __CSRMBCS_H 10ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 11ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/utypes.h" 12ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 13ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#if !UCONFIG_NO_CONVERSION 14ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 15ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "csrecog.h" 16ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 17ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_BEGIN 18ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 19ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// "Character" iterated character class. 20ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Recognizers for specific mbcs encodings make their "characters" available 21ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// by providing a nextChar() function that fills in an instance of IteratedChar 22ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// with the next char from the input. 23ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// The returned characters are not converted to Unicode, but remain as the raw 24ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// bytes (concatenated into an int) from the codepage data. 25ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 26ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// For Asian charsets, use the raw input rather than the input that has been 27ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// stripped of markup. Detection only considers multi-byte chars, effectively 28ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// stripping markup anyway, and double byte chars do occur in markup too. 29ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 30ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass IteratedChar : public UMemory 31ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 32ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querupublic: 33ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint32_t charValue; // 1-4 bytes from the raw input data 34ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t index; 35ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t nextIndex; 36ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool error; 37ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool done; 38ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 39ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querupublic: 40ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru IteratedChar(); 4185bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho //void reset(); 42ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t nextByte(InputText* det); 43ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}; 44ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 45ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 46ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass CharsetRecog_mbcs : public CharsetRecognizer { 47ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 48ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruprotected: 49ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /** 50ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Test the match of this charset with the input text data 51ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * which is obtained via the CharsetDetector object. 52ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 53ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @param det The CharsetDetector, which contains the input text 54ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * to be checked for being in this charset. 55ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @return Two values packed into one int (Damn java, anyhow) 56ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * <br/> 57ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * bits 0-7: the match confidence, ranging from 0-100 58ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * <br/> 59ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * bits 8-15: The match reason, an enum-like value. 60ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 6154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius int32_t match_mbcs(InputText* det, const uint16_t commonChars[], int32_t commonCharsLen) const; 62ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 63ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querupublic: 64ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 65ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru virtual ~CharsetRecog_mbcs(); 66ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 67ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /** 68ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Get the IANA name of this charset. 69ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @return the charset name. 70ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 71ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 72ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const char *getName() const = 0; 73ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const char *getLanguage() const = 0; 7454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius UBool match(InputText* input, CharsetMatch *results) const = 0; 75ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 76ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /** 77ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Get the next character (however many bytes it is) from the input data 78ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Subclasses for specific charset encodings must implement this function 79ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * to get characters according to the rules of their encoding scheme. 80ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 81ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * This function is not a method of class IteratedChar only because 82ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * that would require a lot of extra derived classes, which is awkward. 83ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @param it The IteratedChar "struct" into which the returned char is placed. 84ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @param det The charset detector, which is needed to get at the input byte data 85ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * being iterated over. 86ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @return True if a character was returned, false at end of input. 87ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 8854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius virtual UBool nextChar(IteratedChar *it, InputText *textIn) const = 0; 89ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 90ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}; 91ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 92ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 93ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/** 94ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Shift-JIS charset recognizer. 95ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 96ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 97ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass CharsetRecog_sjis : public CharsetRecog_mbcs { 98ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querupublic: 99ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru virtual ~CharsetRecog_sjis(); 100ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 10154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius UBool nextChar(IteratedChar *it, InputText *det) const; 102ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 10354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius UBool match(InputText* input, CharsetMatch *results) const; 104ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 105ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const char *getName() const; 106ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const char *getLanguage() const; 107ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 108ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}; 109ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 110ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 111ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/** 112ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * EUC charset recognizers. One abstract class that provides the common function 113ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * for getting the next character according to the EUC encoding scheme, 114ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * and nested derived classes for EUC_KR, EUC_JP, EUC_CN. 115ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 116ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 117ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass CharsetRecog_euc : public CharsetRecog_mbcs 118ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 119ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querupublic: 120ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru virtual ~CharsetRecog_euc(); 121ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 122ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const char *getName() const = 0; 123ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const char *getLanguage() const = 0; 124ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 12554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius UBool match(InputText* input, CharsetMatch *results) const = 0; 126ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* 127ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * (non-Javadoc) 128ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Get the next character value for EUC based encodings. 129ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Character "value" is simply the raw bytes that make up the character 130ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * packed into an int. 131ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 13254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius UBool nextChar(IteratedChar *it, InputText *det) const; 133ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}; 134ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 135ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/** 136ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * The charset recognize for EUC-JP. A singleton instance of this class 137ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * is created and kept by the public CharsetDetector class 138ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 139ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass CharsetRecog_euc_jp : public CharsetRecog_euc 140ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 141ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querupublic: 142ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru virtual ~CharsetRecog_euc_jp(); 143ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 144ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const char *getName() const; 145ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const char *getLanguage() const; 146ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 14754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius UBool match(InputText* input, CharsetMatch *results) const; 148ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}; 149ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 150ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/** 151ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * The charset recognize for EUC-KR. A singleton instance of this class 152ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * is created and kept by the public CharsetDetector class 153ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 154ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass CharsetRecog_euc_kr : public CharsetRecog_euc 155ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 156ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querupublic: 157ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru virtual ~CharsetRecog_euc_kr(); 158ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 159ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const char *getName() const; 160ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const char *getLanguage() const; 161ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 16254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius UBool match(InputText* input, CharsetMatch *results) const; 163ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}; 164ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 165ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/** 166ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 167ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Big5 charset recognizer. 168ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 169ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 170ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass CharsetRecog_big5 : public CharsetRecog_mbcs 171ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 172ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querupublic: 173ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru virtual ~CharsetRecog_big5(); 174ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 17554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius UBool nextChar(IteratedChar* it, InputText* det) const; 176ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 177ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const char *getName() const; 178ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const char *getLanguage() const; 179ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 18054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius UBool match(InputText* input, CharsetMatch *results) const; 181ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}; 182ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 183ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 184ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/** 185ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 186ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * GB-18030 recognizer. Uses simplified Chinese statistics. 187ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 188ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 189ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass CharsetRecog_gb_18030 : public CharsetRecog_mbcs 190ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 191ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querupublic: 192ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru virtual ~CharsetRecog_gb_18030(); 193ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 19454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius UBool nextChar(IteratedChar* it, InputText* det) const; 195ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 196ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const char *getName() const; 197ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const char *getLanguage() const; 198ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 19954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius UBool match(InputText* input, CharsetMatch *results) const; 200ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}; 201ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 202ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_END 203ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 204ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#endif 205ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#endif /* __CSRMBCS_H */ 206