10596faeddefbf198de137d5e893708495ab1584cFredrik Roubert// © 2016 and later: Unicode, Inc. and others. 264339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert// License & terms of use: http://www.unicode.org/copyright.html 3ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 4ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ********************************************************************** 51b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert * Copyright (C) 2005-2015, International Business Machines 6ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Corporation and others. All Rights Reserved. 7ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ********************************************************************** 8ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 9ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 10ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#ifndef __CSR2022_H 11ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define __CSR2022_H 12ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 13ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/utypes.h" 14ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 15ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#if !UCONFIG_NO_CONVERSION 16ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 17ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "csrecog.h" 18ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 19ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_BEGIN 20ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 2154dcd9b6a06071f647dac967e9e267abb9410720Craig Corneliusclass CharsetMatch; 2254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 23ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/** 24ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * class CharsetRecog_2022 part of the ICU charset detection imlementation. 25ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * This is a superclass for the individual detectors for 26ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * each of the detectable members of the ISO 2022 family 27ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * of encodings. 28ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 29ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * The separate classes are nested within this class. 30ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 31ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @internal 32ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 33ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass CharsetRecog_2022 : public CharsetRecognizer 34ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 35ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 36ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querupublic: 37ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru virtual ~CharsetRecog_2022() = 0; 38ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 39ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruprotected: 40ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 41ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /** 42ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Matching function shared among the 2022 detectors JP, CN and KR 43ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Counts up the number of legal an unrecognized escape sequences in 44ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * the sample of text, and computes a score based on the total number & 45ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * the proportion that fit the encoding. 46ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 47ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 48ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @param text the byte buffer containing text to analyse 49ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @param textLen the size of the text in the byte. 50ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @param escapeSequences the byte escape sequences to test for. 51ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @return match quality, in the range of 0-100. 52ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 5354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius int32_t match_2022(const uint8_t *text, 5454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius int32_t textLen, 5554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius const uint8_t escapeSequences[][5], 5654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius int32_t escapeSequences_length) const; 57ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 58ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}; 59ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 60ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass CharsetRecog_2022JP :public CharsetRecog_2022 61ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 62103e9ffba2cba345d0078eb8b8db33249f81840aCraig Corneliuspublic: 63103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius virtual ~CharsetRecog_2022JP(); 64ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 65ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const char *getName() const; 66ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 6754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius UBool match(InputText *textIn, CharsetMatch *results) const; 68ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}; 69ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 701b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert#if !UCONFIG_ONLY_HTML_CONVERSION 71ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass CharsetRecog_2022KR :public CharsetRecog_2022 { 72103e9ffba2cba345d0078eb8b8db33249f81840aCraig Corneliuspublic: 73103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius virtual ~CharsetRecog_2022KR(); 74ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 75ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const char *getName() const; 76ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 7754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius UBool match(InputText *textIn, CharsetMatch *results) const; 78ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 79ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}; 80ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 81ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass CharsetRecog_2022CN :public CharsetRecog_2022 82ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 83103e9ffba2cba345d0078eb8b8db33249f81840aCraig Corneliuspublic: 84103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius virtual ~CharsetRecog_2022CN(); 85ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 86ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const char* getName() const; 87ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 8854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius UBool match(InputText *textIn, CharsetMatch *results) const; 89ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}; 901b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert#endif 91ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 92ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_END 93ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 94ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#endif 95ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#endif /* __CSR2022_H */ 96