/*
 **********************************************************************
 *   Copyright (C) 2005-2012, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 */
7ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
8ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#ifndef __CSRMBCS_H
9ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define __CSRMBCS_H
10ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
11ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/utypes.h"
12ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
13ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#if !UCONFIG_NO_CONVERSION
14ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
15ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "csrecog.h"
16ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
17ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_BEGIN
18ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
19ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// "Character"  iterated character class.
20ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//    Recognizers for specific mbcs encodings make their "characters" available
21ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//    by providing a nextChar() function that fills in an instance of IteratedChar
22ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//    with the next char from the input.
23ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//    The returned characters are not converted to Unicode, but remain as the raw
24ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//    bytes (concatenated into an int) from the codepage data.
25ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//
26ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//  For Asian charsets, use the raw input rather than the input that has been
27ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//   stripped of markup.  Detection only considers multi-byte chars, effectively
28ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//   stripping markup anyway, and double byte chars do occur in markup too.
29ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//
30ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass IteratedChar : public UMemory
31ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
32ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querupublic:
33ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uint32_t charValue;             // 1-4 bytes from the raw input data
34ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t  index;
35ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t  nextIndex;
36ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UBool    error;
37ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UBool    done;
38ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
39ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querupublic:
40ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    IteratedChar();
4185bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    //void reset();
42ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t nextByte(InputText* det);
43ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru};
44ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
45ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
46ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass CharsetRecog_mbcs : public CharsetRecognizer {
47ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
48ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruprotected:
49ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    /**
50ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * Test the match of this charset with the input text data
51ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     *      which is obtained via the CharsetDetector object.
52ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     *
53ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * @param det  The CharsetDetector, which contains the input text
54ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     *             to be checked for being in this charset.
55ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * @return     Two values packed into one int  (Damn java, anyhow)
56ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     *             <br/>
57ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     *             bits 0-7:  the match confidence, ranging from 0-100
58ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     *             <br/>
59ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     *             bits 8-15: The match reason, an enum-like value.
60ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     */
6154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    int32_t match_mbcs(InputText* det, const uint16_t commonChars[], int32_t commonCharsLen) const;
62ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
63ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querupublic:
64ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
65ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    virtual ~CharsetRecog_mbcs();
66ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
67ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    /**
68ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * Get the IANA name of this charset.
69ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * @return the charset name.
70ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     */
71ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
72ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    const char *getName() const = 0;
73ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    const char *getLanguage() const = 0;
7454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    UBool match(InputText* input, CharsetMatch *results) const = 0;
75ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
76ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    /**
77ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * Get the next character (however many bytes it is) from the input data
78ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     *    Subclasses for specific charset encodings must implement this function
79ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     *    to get characters according to the rules of their encoding scheme.
80ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     *
81ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     *  This function is not a method of class IteratedChar only because
82ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     *   that would require a lot of extra derived classes, which is awkward.
83ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * @param it  The IteratedChar "struct" into which the returned char is placed.
84ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * @param det The charset detector, which is needed to get at the input byte data
85ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     *            being iterated over.
86ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * @return    True if a character was returned, false at end of input.
87ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     */
8854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    virtual UBool nextChar(IteratedChar *it, InputText *textIn) const = 0;
89ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
90ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru};
91ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
92ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
93ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/**
94ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   Shift-JIS charset recognizer.
95ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
96ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
97ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass CharsetRecog_sjis : public CharsetRecog_mbcs {
98ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querupublic:
99ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    virtual ~CharsetRecog_sjis();
100ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
10154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    UBool nextChar(IteratedChar *it, InputText *det) const;
102ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
10354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    UBool match(InputText* input, CharsetMatch *results) const;
104ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
105ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    const char *getName() const;
106ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    const char *getLanguage() const;
107ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
108ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru};
109ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
110ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
111ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/**
112ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   EUC charset recognizers.  One abstract class that provides the common function
113ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *             for getting the next character according to the EUC encoding scheme,
114ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *             and nested derived classes for EUC_KR, EUC_JP, EUC_CN.
115ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
116ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
117ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass CharsetRecog_euc : public CharsetRecog_mbcs
118ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
119ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querupublic:
120ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    virtual ~CharsetRecog_euc();
121ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
122ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    const char *getName() const = 0;
123ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    const char *getLanguage() const = 0;
124ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
12554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    UBool match(InputText* input, CharsetMatch *results) const = 0;
126ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    /*
127ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     *  (non-Javadoc)
128ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     *  Get the next character value for EUC based encodings.
129ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     *  Character "value" is simply the raw bytes that make up the character
130ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     *     packed into an int.
131ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     */
13254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    UBool nextChar(IteratedChar *it, InputText *det) const;
133ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru};
134ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
135ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/**
136ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * The charset recognize for EUC-JP.  A singleton instance of this class
137ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *    is created and kept by the public CharsetDetector class
138ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
139ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass CharsetRecog_euc_jp : public CharsetRecog_euc
140ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
141ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querupublic:
142ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    virtual ~CharsetRecog_euc_jp();
143ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
144ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    const char *getName() const;
145ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    const char *getLanguage() const;
146ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
14754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    UBool match(InputText* input, CharsetMatch *results) const;
148ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru};
149ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
150ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/**
151ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * The charset recognize for EUC-KR.  A singleton instance of this class
152ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *    is created and kept by the public CharsetDetector class
153ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
154ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass CharsetRecog_euc_kr : public CharsetRecog_euc
155ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
156ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querupublic:
157ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    virtual ~CharsetRecog_euc_kr();
158ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
159ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    const char *getName() const;
160ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    const char *getLanguage() const;
161ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
16254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    UBool match(InputText* input, CharsetMatch *results) const;
163ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru};
164ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
165ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/**
166ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
167ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   Big5 charset recognizer.
168ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
169ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
170ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass CharsetRecog_big5 : public CharsetRecog_mbcs
171ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
172ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querupublic:
173ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    virtual ~CharsetRecog_big5();
174ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
17554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    UBool nextChar(IteratedChar* it, InputText* det) const;
176ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
177ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    const char *getName() const;
178ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    const char *getLanguage() const;
179ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
18054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    UBool match(InputText* input, CharsetMatch *results) const;
181ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru};
182ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
183ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
184ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/**
185ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
186ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   GB-18030 recognizer. Uses simplified Chinese statistics.
187ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
188ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
189ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass CharsetRecog_gb_18030 : public CharsetRecog_mbcs
190ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
191ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querupublic:
192ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    virtual ~CharsetRecog_gb_18030();
193ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
19454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    UBool nextChar(IteratedChar* it, InputText* det) const;
195ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
196ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    const char *getName() const;
197ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    const char *getLanguage() const;
198ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
19954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    UBool match(InputText* input, CharsetMatch *results) const;
200ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru};
201ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
202ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_END
203ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
204ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#endif
205ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#endif /* __CSRMBCS_H */
206