// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 **********************************************************************
 *   Copyright (C) 2005-2015, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 */
9ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
10ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#ifndef __CSR2022_H
11ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define __CSR2022_H
12ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
13ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/utypes.h"
14ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
15ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#if !UCONFIG_NO_CONVERSION
16ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
17ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "csrecog.h"
18ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
19ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_BEGIN
20ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
2154dcd9b6a06071f647dac967e9e267abb9410720Craig Corneliusclass CharsetMatch;
2254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
23ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/**
24ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *  class CharsetRecog_2022  part of the ICU charset detection imlementation.
25ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *                           This is a superclass for the individual detectors for
26ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *                           each of the detectable members of the ISO 2022 family
27ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *                           of encodings.
28ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
29ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *                           The separate classes are nested within this class.
30ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
31ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @internal
32ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
33ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass CharsetRecog_2022 : public CharsetRecognizer
34ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
35ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
36ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querupublic:
37ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    virtual ~CharsetRecog_2022() = 0;
38ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
39ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruprotected:
40ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
41ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    /**
42ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * Matching function shared among the 2022 detectors JP, CN and KR
43ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * Counts up the number of legal an unrecognized escape sequences in
44ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * the sample of text, and computes a score based on the total number &
45ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * the proportion that fit the encoding.
46ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     *
47ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     *
48ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * @param text the byte buffer containing text to analyse
49ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * @param textLen  the size of the text in the byte.
50ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * @param escapeSequences the byte escape sequences to test for.
51ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * @return match quality, in the range of 0-100.
52ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     */
5354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    int32_t match_2022(const uint8_t *text,
5454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius                       int32_t textLen,
5554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius                       const uint8_t escapeSequences[][5],
5654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius                       int32_t escapeSequences_length) const;
57ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
58ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru};
59ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
60ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass CharsetRecog_2022JP :public CharsetRecog_2022
61ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
62103e9ffba2cba345d0078eb8b8db33249f81840aCraig Corneliuspublic:
63103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius    virtual ~CharsetRecog_2022JP();
64ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
65ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    const char *getName() const;
66ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
6754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    UBool match(InputText *textIn, CharsetMatch *results) const;
68ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru};
69ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
701b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert#if !UCONFIG_ONLY_HTML_CONVERSION
71ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass CharsetRecog_2022KR :public CharsetRecog_2022 {
72103e9ffba2cba345d0078eb8b8db33249f81840aCraig Corneliuspublic:
73103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius    virtual ~CharsetRecog_2022KR();
74ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
75ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    const char *getName() const;
76ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
7754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    UBool match(InputText *textIn, CharsetMatch *results) const;
78ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
79ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru};
80ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
81ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass CharsetRecog_2022CN :public CharsetRecog_2022
82ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
83103e9ffba2cba345d0078eb8b8db33249f81840aCraig Corneliuspublic:
84103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius    virtual ~CharsetRecog_2022CN();
85ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
86ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    const char* getName() const;
87ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
8854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    UBool match(InputText *textIn, CharsetMatch *results) const;
89ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru};
901b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert#endif
91ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
92ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_END
93ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
94ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#endif
95ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#endif /* __CSR2022_H */
96