1b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/* 2b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ********************************************************************** 31b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert * Copyright (C) 2005-2015, International Business Machines 4b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Corporation and others. All Rights Reserved. 5b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ********************************************************************** 6b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 7b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 8b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 9b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/utypes.h" 10b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/ucsdet.h" 11b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/ucnv.h" 12b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/unistr.h" 13b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/putil.h" 14b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#include "unicode/uniset.h" 15b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 16b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "intltest.h" 17b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "csdetest.h" 18b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 19b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "xmlparser.h" 20b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 21b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include <stdlib.h> 22b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include <string.h> 23b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 24b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#ifdef DEBUG_DETECT 25b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include <stdio.h> 26b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif 27b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 28b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) 29b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 30b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#define NEW_ARRAY(type,count) (type *) /*uprv_*/malloc((count) * sizeof(type)) 31b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#define DELETE_ARRAY(array) /*uprv_*/free((void *) (array)) 32b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 33b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#define CH_SPACE 0x0020 34b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#define CH_SLASH 0x002F 35b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius#define TEST_ASSERT(x) {if (!(x)) { \ 3754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius errln("Failure in file %s, line %d", __FILE__, __LINE__);}} 3854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 3954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius#define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \ 4054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));\ 4154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius return;}} 4254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 4354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 44b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//--------------------------------------------------------------------------- 45b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 46b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Test class boilerplate 47b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 48b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//--------------------------------------------------------------------------- 49b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruCharsetDetectionTest::CharsetDetectionTest() 50b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 51b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 52b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 53b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 54b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruCharsetDetectionTest::~CharsetDetectionTest() 55b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 56b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 57b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 58b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 59b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 60b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid CharsetDetectionTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ ) 61b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 62b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (exec) logln("TestSuite CharsetDetectionTest: "); 63b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru switch (index) { 64b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case 0: name = "ConstructionTest"; 65b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (exec) ConstructionTest(); 66b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 67b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 68b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case 1: name = "UTF8Test"; 69b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (exec) UTF8Test(); 70b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 71b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 72b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case 2: name = "UTF16Test"; 73b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (exec) UTF16Test(); 74b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 75b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 76b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case 3: name = "C1BytesTest"; 77b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (exec) C1BytesTest(); 78b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 79b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 80b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case 4: name = "InputFilterTest"; 81b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (exec) InputFilterTest(); 82b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 83b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 84b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case 5: name = "DetectionTest"; 85b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (exec) DetectionTest(); 86b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 8750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#if !UCONFIG_NO_LEGACY_CONVERSION 88b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru case 6: name = "IBM424Test"; 89b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (exec) IBM424Test(); 90b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru break; 91b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 92b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru case 7: name = "IBM420Test"; 93b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (exec) IBM420Test(); 94b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru break; 9550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#else 9650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho case 6: 9750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho case 7: name = "skip"; break; 9850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#endif 99b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru case 8: name = "Ticket6394Test"; 100b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (exec) Ticket6394Test(); 101b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru break; 102b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 10354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius case 9: name = "Ticket6954Test"; 10454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius if (exec) Ticket6954Test(); 10554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius break; 10654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 107b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru default: name = ""; 108b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; //needed to end loop 109b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 110b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 111b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 112b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic UnicodeString *split(const UnicodeString &src, UChar ch, int32_t &splits) 113b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 114b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t offset = -1; 115b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 116b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru splits = 1; 117b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while((offset = src.indexOf(ch, offset + 1)) >= 0) { 118b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru splits += 1; 119b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 120b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 121b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString *result = new UnicodeString[splits]; 122b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 123b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t start = 0; 124b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t split = 0; 125b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t end; 126b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 127b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while((end = src.indexOf(ch, start)) >= 0) { 128b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru src.extractBetween(start, end, result[split++]); 129b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru start = end + 1; 130b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 131b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 132b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru src.extractBetween(start, src.length(), result[split]); 133b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 134b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return result; 135b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 136b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 137b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic char *extractBytes(const UnicodeString &source, const char *codepage, int32_t &length) 138b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 139b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t sLength = source.length(); 140b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru char *bytes = NULL; 141b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 142b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru length = source.extract(0, sLength, NULL, codepage); 143b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 144b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (length > 0) { 145b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru bytes = NEW_ARRAY(char, length + 1); 146b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru source.extract(0, sLength, bytes, codepage); 147b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 148b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 149b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return bytes; 150b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 151b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 152b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic void freeBytes(char *bytes) 153b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 154b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru DELETE_ARRAY(bytes); 155b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 156b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 157b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const UnicodeString &encoding, const UnicodeString &id) 158b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 159b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t splits = 0; 160b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t testLength = testString.length(); 161b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString *eSplit = split(encoding, CH_SLASH, splits); 162b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 163b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t cpLength = eSplit[0].length(); 164b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru char codepage[64]; 165b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 166b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru u_UCharsToChars(eSplit[0].getBuffer(), codepage, cpLength); 167b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru codepage[cpLength] = '\0'; 168b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 16950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho LocalUCharsetDetectorPointer csd(ucsdet_open(&status)); 170b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 171b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t byteLength = 0; 172b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru char *bytes = extractBytes(testString, codepage, byteLength); 173b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 174b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (bytes == NULL) { 175b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#if !UCONFIG_NO_LEGACY_CONVERSION 176b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho dataerrln("Can't open a " + encoding + " converter for " + id); 177b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif 178b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 179b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 180b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 18150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ucsdet_setText(csd.getAlias(), bytes, byteLength, &status); 182b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 183b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t matchCount = 0; 18450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const UCharsetMatch **matches = ucsdet_detectAll(csd.getAlias(), &matchCount, &status); 185b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 186b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 187b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString name(ucsdet_getName(matches[0], &status)); 188b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString lang(ucsdet_getLanguage(matches[0], &status)); 189b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar *decoded = NULL; 190b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t dLength = 0; 191b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 192b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (matchCount == 0) { 193b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got no matches"); 194b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru goto bail; 195b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 196b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 197b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (name.compare(eSplit[0]) != 0) { 198b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got " + name); 199b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 200b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#ifdef DEBUG_DETECT 201b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (int32_t m = 0; m < matchCount; m += 1) { 202b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const char *name = ucsdet_getName(matches[m], &status); 203b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const char *lang = ucsdet_getLanguage(matches[m], &status); 204b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t confidence = ucsdet_getConfidence(matches[m], &status); 205b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 206b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru printf("%s (%s) %d\n", name, lang, confidence); 207b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 208b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif 209b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru goto bail; 210b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 211b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 212b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (splits > 1 && lang.compare(eSplit[1]) != 0) { 213b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("Language detection failure for " + id + ", " + eSplit[0] + ": expected " + eSplit[1] + ", got " + lang); 214b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru goto bail; 215b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 216b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 217b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru decoded = NEW_ARRAY(UChar, testLength); 218b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru dLength = ucsdet_getUChars(matches[0], decoded, testLength, &status); 219b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 220b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (testString.compare(decoded, dLength) != 0) { 221b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("Round-trip error for " + id + ", " + eSplit[0] + ": getUChars() didn't yeild the original string."); 222b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 223b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#ifdef DEBUG_DETECT 224b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for(int32_t i = 0; i < testLength; i += 1) { 225b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(testString[i] != decoded[i]) { 226b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru printf("Strings differ at byte %d\n", i); 227b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 228b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 229b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 230b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif 231b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 232b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 233b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 234b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru DELETE_ARRAY(decoded); 235b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 236b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querubail: 237b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru freeBytes(bytes); 238b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete[] eSplit; 239b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 240b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 241b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruconst char *CharsetDetectionTest::getPath(char buffer[2048], const char *filename) { 242b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 243b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const char *testDataDirectory = IntlTest::getSourceTestData(status); 244b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 245b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(status)) { 246b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("ERROR: getPath() failed - %s", u_errorName(status)); 247b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 248b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 249b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 250b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strcpy(buffer, testDataDirectory); 251b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strcat(buffer, filename); 252b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return buffer; 253b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 254b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 255b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid CharsetDetectionTest::ConstructionTest() 256b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 25750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho IcuTestErrorCode status(*this, "ConstructionTest"); 25850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho LocalUCharsetDetectorPointer csd(ucsdet_open(status)); 25950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho LocalUEnumerationPointer e(ucsdet_getAllDetectableCharsets(csd.getAlias(), status)); 26050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t count = uenum_count(e.getAlias(), status); 261b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 262b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#ifdef DEBUG_DETECT 263b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru printf("There are %d recognizers.\n", count); 264b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif 265b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 266b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for(int32_t i = 0; i < count; i += 1) { 267b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t length; 26850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const char *name = uenum_next(e.getAlias(), &length, status); 269b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 270b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(name == NULL || length <= 0) { 271b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("ucsdet_getAllDetectableCharsets() returned a null or empty name!"); 272b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 273b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 274b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#ifdef DEBUG_DETECT 275b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru printf("%s\n", name); 276b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif 277b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 27859d709d503bab6e2b61931737e662dd293b40578ccornelius 27959d709d503bab6e2b61931737e662dd293b40578ccornelius const char* defDisabled[] = { 28059d709d503bab6e2b61931737e662dd293b40578ccornelius "IBM420_rtl", "IBM420_ltr", 28159d709d503bab6e2b61931737e662dd293b40578ccornelius "IBM424_rtl", "IBM424_ltr", 28259d709d503bab6e2b61931737e662dd293b40578ccornelius 0 28359d709d503bab6e2b61931737e662dd293b40578ccornelius }; 28459d709d503bab6e2b61931737e662dd293b40578ccornelius 28559d709d503bab6e2b61931737e662dd293b40578ccornelius LocalUEnumerationPointer eActive(ucsdet_getDetectableCharsets(csd.getAlias(), status)); 28659d709d503bab6e2b61931737e662dd293b40578ccornelius const char *activeName = NULL; 28759d709d503bab6e2b61931737e662dd293b40578ccornelius 28859d709d503bab6e2b61931737e662dd293b40578ccornelius while ((activeName = uenum_next(eActive.getAlias(), NULL, status))) { 28959d709d503bab6e2b61931737e662dd293b40578ccornelius // the charset must be included in all list 29059d709d503bab6e2b61931737e662dd293b40578ccornelius UBool found = FALSE; 29159d709d503bab6e2b61931737e662dd293b40578ccornelius 29259d709d503bab6e2b61931737e662dd293b40578ccornelius const char *name = NULL; 29359d709d503bab6e2b61931737e662dd293b40578ccornelius uenum_reset(e.getAlias(), status); 29459d709d503bab6e2b61931737e662dd293b40578ccornelius while ((name = uenum_next(e.getAlias(), NULL, status))) { 29559d709d503bab6e2b61931737e662dd293b40578ccornelius if (strcmp(activeName, name) == 0) { 29659d709d503bab6e2b61931737e662dd293b40578ccornelius found = TRUE; 29759d709d503bab6e2b61931737e662dd293b40578ccornelius break; 29859d709d503bab6e2b61931737e662dd293b40578ccornelius } 29959d709d503bab6e2b61931737e662dd293b40578ccornelius } 30059d709d503bab6e2b61931737e662dd293b40578ccornelius 30159d709d503bab6e2b61931737e662dd293b40578ccornelius if (!found) { 30259d709d503bab6e2b61931737e662dd293b40578ccornelius errln(UnicodeString(activeName) + " is not included in the all charset list."); 30359d709d503bab6e2b61931737e662dd293b40578ccornelius } 30459d709d503bab6e2b61931737e662dd293b40578ccornelius 30559d709d503bab6e2b61931737e662dd293b40578ccornelius // some charsets are disabled by default 30659d709d503bab6e2b61931737e662dd293b40578ccornelius found = FALSE; 30759d709d503bab6e2b61931737e662dd293b40578ccornelius for (int32_t i = 0; defDisabled[i] != 0; i++) { 30859d709d503bab6e2b61931737e662dd293b40578ccornelius if (strcmp(activeName, defDisabled[i]) == 0) { 30959d709d503bab6e2b61931737e662dd293b40578ccornelius found = TRUE; 31059d709d503bab6e2b61931737e662dd293b40578ccornelius break; 31159d709d503bab6e2b61931737e662dd293b40578ccornelius } 31259d709d503bab6e2b61931737e662dd293b40578ccornelius } 31359d709d503bab6e2b61931737e662dd293b40578ccornelius if (found) { 31459d709d503bab6e2b61931737e662dd293b40578ccornelius errln(UnicodeString(activeName) + " should not be included in the default charset list."); 31559d709d503bab6e2b61931737e662dd293b40578ccornelius } 31659d709d503bab6e2b61931737e662dd293b40578ccornelius } 317b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 318b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 319b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid CharsetDetectionTest::UTF8Test() 320b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 321b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 322b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString ss = "This is a string with some non-ascii characters that will " 323b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "be converted to UTF-8, then shoved through the detection process. " 324b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u0391\\u0392\\u0393\\u0394\\u0395" 325b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "Sure would be nice if our source could contain Unicode directly!"; 326b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString s = ss.unescape(); 327b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t byteLength = 0, sLength = s.length(); 328b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru char *bytes = extractBytes(s, "UTF-8", byteLength); 329b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCharsetDetector *csd = ucsdet_open(&status); 330b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UCharsetMatch *match; 331b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar *detected = NEW_ARRAY(UChar, sLength); 332b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 333b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucsdet_setText(csd, bytes, byteLength, &status); 334b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru match = ucsdet_detect(csd, &status); 335b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 336b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (match == NULL) { 337b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("Detection failure for UTF-8: got no matches."); 338b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru goto bail; 339b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 340b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 341b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucsdet_getUChars(match, detected, sLength, &status); 342b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 343b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (s.compare(detected, sLength) != 0) { 344b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("Round-trip test failed!"); 345b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 346b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 347b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */ 348b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 349b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querubail: 350b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru DELETE_ARRAY(detected); 351b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru freeBytes(bytes); 352b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucsdet_close(csd); 353b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 354b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 355b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid CharsetDetectionTest::UTF16Test() 356b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 357b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 358b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru /* Notice the BOM on the start of this string */ 359b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar chars[] = { 360b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C, 361b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a, 362b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628, 363b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646, 364b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 0x064a, 0x062a, 0x0000}; 365b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString s(chars); 366b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t beLength = 0, leLength = 0; 367b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru char *beBytes = extractBytes(s, "UTF-16BE", beLength); 368b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru char *leBytes = extractBytes(s, "UTF-16LE", leLength); 369b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCharsetDetector *csd = ucsdet_open(&status); 370b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UCharsetMatch *match; 371b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const char *name; 372b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t conf; 373b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 374b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucsdet_setText(csd, beBytes, beLength, &status); 375b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru match = ucsdet_detect(csd, &status); 376b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 377b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (match == NULL) { 378b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("Encoding detection failure for UTF-16BE: got no matches."); 379b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru goto try_le; 380b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 381b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 382b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru name = ucsdet_getName(match, &status); 383b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru conf = ucsdet_getConfidence(match, &status); 384b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 385b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strcmp(name, "UTF-16BE") != 0) { 386b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("Encoding detection failure for UTF-16BE: got %s", name); 387b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru goto try_le; // no point in looking at confidence if we got the wrong character set. 388b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 389b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 390b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (conf != 100) { 391b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("Did not get 100%% confidence for UTF-16BE: got %d", conf); 392b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 393b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 394b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querutry_le: 395b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucsdet_setText(csd, leBytes, leLength, &status); 396b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru match = ucsdet_detect(csd, &status); 397b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 398b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (match == NULL) { 399b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("Encoding detection failure for UTF-16LE: got no matches."); 400b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru goto bail; 401b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 402b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 403b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru name = ucsdet_getName(match, &status); 404b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru conf = ucsdet_getConfidence(match, &status); 405b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 406b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 407b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strcmp(name, "UTF-16LE") != 0) { 408b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("Enconding detection failure for UTF-16LE: got %s", name); 409b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru goto bail; // no point in looking at confidence if we got the wrong character set. 410b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 411b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 412b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (conf != 100) { 413b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("Did not get 100%% confidence for UTF-16LE: got %d", conf); 414b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 415b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 416b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querubail: 417b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru freeBytes(leBytes); 418b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru freeBytes(beBytes); 419b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucsdet_close(csd); 420b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 421b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 422b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid CharsetDetectionTest::InputFilterTest() 423b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 424b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 425b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString ss = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>"; 426b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString s = ss.unescape(); 427b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t byteLength = 0; 428b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru char *bytes = extractBytes(s, "ISO-8859-1", byteLength); 429b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCharsetDetector *csd = ucsdet_open(&status); 430b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UCharsetMatch *match; 431b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const char *lang, *name; 432b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 433b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucsdet_enableInputFilter(csd, TRUE); 434b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 435b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!ucsdet_isInputFilterEnabled(csd)) { 436b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!"); 437b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 438b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 439b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 440b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucsdet_setText(csd, bytes, byteLength, &status); 441b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru match = ucsdet_detect(csd, &status); 442b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 443b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (match == NULL) { 444b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("Turning on the input filter resulted in no matches."); 445b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru goto turn_off; 446b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 447b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 448b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru name = ucsdet_getName(match, &status); 449b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 450b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (name == NULL || strcmp(name, "ISO-8859-1") != 0) { 451b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("Turning on the input filter resulted in %s rather than ISO-8859-1.", name); 452b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 453b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru lang = ucsdet_getLanguage(match, &status); 454b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 455b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (lang == NULL || strcmp(lang, "fr") != 0) { 456b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("Input filter did not strip markup!"); 457b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 458b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 459b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 460b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruturn_off: 461b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucsdet_enableInputFilter(csd, FALSE); 462b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucsdet_setText(csd, bytes, byteLength, &status); 463b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru match = ucsdet_detect(csd, &status); 464b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 465b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (match == NULL) { 466b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("Turning off the input filter resulted in no matches."); 467b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru goto bail; 468b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 469b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 470b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru name = ucsdet_getName(match, &status); 471b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 472b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (name == NULL || strcmp(name, "ISO-8859-1") != 0) { 473b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("Turning off the input filter resulted in %s rather than ISO-8859-1.", name); 474b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 475b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru lang = ucsdet_getLanguage(match, &status); 476b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 477b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (lang == NULL || strcmp(lang, "en") != 0) { 478b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("Unfiltered input did not detect as English!"); 479b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 480b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 481b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 482b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querubail: 483b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru freeBytes(bytes); 484b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucsdet_close(csd); 485b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 486b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 487b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid CharsetDetectionTest::C1BytesTest() 488b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 489b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#if !UCONFIG_NO_LEGACY_CONVERSION 490b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 491b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly."; 492c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.", -1, US_INV); 493b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString sWindows = ssWindows.unescape(); 494b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t lISO = 0, lWindows = 0; 495b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru char *bISO = extractBytes(sISO, "ISO-8859-1", lISO); 496b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru char *bWindows = extractBytes(sWindows, "windows-1252", lWindows); 497b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCharsetDetector *csd = ucsdet_open(&status); 498b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UCharsetMatch *match; 499b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const char *name; 500b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 501b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucsdet_setText(csd, bWindows, lWindows, &status); 502b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru match = ucsdet_detect(csd, &status); 503b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 504b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (match == NULL) { 5056d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru errcheckln(status, "English test with C1 bytes got no matches. - %s", u_errorName(status)); 506b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru goto bail; 507b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 508b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 509b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru name = ucsdet_getName(match, &status); 510b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 511b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strcmp(name, "windows-1252") != 0) { 512b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("English text with C1 bytes does not detect as windows-1252, but as %s", name); 513b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 514b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 515b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucsdet_setText(csd, bISO, lISO, &status); 516b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru match = ucsdet_detect(csd, &status); 517b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 518b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (match == NULL) { 519b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("English text without C1 bytes got no matches."); 520b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru goto bail; 521b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 522b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 523b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru name = ucsdet_getName(match, &status); 524b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 525b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (strcmp(name, "ISO-8859-1") != 0) { 526b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("English text without C1 bytes does not detect as ISO-8859-1, but as %s", name); 527b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 528b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 529b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querubail: 530b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru freeBytes(bWindows); 531b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru freeBytes(bISO); 532b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 533b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucsdet_close(csd); 534b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif 535b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 536b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 537b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid CharsetDetectionTest::DetectionTest() 538b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 539b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#if !UCONFIG_NO_REGULAR_EXPRESSIONS 540b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 541b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru char path[2048]; 542b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const char *testFilePath = getPath(path, "csdetest.xml"); 543b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 544b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (testFilePath == NULL) { 545b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; /* Couldn't get path: error message already output. */ 546b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 547b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 548b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UXMLParser *parser = UXMLParser::createParser(status); 5496d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru if (U_FAILURE(status)) { 5506d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru dataerrln("FAIL: UXMLParser::createParser (%s)", u_errorName(status)); 5516d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru return; 5526d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru } 5536d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru 554b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UXMLElement *root = parser->parseFile(testFilePath, status); 555b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!assertSuccess( "parseFile",status)) return; 556b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 557b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString test_case = UNICODE_STRING_SIMPLE("test-case"); 558b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString id_attr = UNICODE_STRING_SIMPLE("id"); 559b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString enc_attr = UNICODE_STRING_SIMPLE("encodings"); 560b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 561b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UXMLElement *testCase; 562b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t tc = 0; 563b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 564b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while((testCase = root->nextChildElement(tc)) != NULL) { 565b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (testCase->getTagName().compare(test_case) == 0) { 566b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UnicodeString *id = testCase->getAttribute(id_attr); 567b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UnicodeString *encodings = testCase->getAttribute(enc_attr); 568b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UnicodeString text = testCase->getText(TRUE); 569b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t encodingCount; 570b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString *encodingList = split(*encodings, CH_SPACE, encodingCount); 571b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 572b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for(int32_t e = 0; e < encodingCount; e += 1) { 573b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru checkEncoding(text, encodingList[e], *id); 574b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 575b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 576b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete[] encodingList; 577b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 578b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 579b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 580b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete root; 581b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete parser; 582b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif 583b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 584b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 585b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruvoid CharsetDetectionTest::IBM424Test() 586b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru{ 5871b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert#if !UCONFIG_ONLY_HTML_CONVERSION 588b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 589b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 590b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru static const UChar chars[] = { 591b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8, 592b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9, 593b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8, 594b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA, 595b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5, 596b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE, 597b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 598b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC, 599b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3, 600b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020, 601b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC, 602b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 603b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 604b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 605b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC, 606b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1, 607b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000 608b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru }; 609b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 610b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru static const UChar chars_reverse[] = { 611b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA, 612b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8, 613b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 614b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 615b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9, 616b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4, 617b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9, 618b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5, 619b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3, 620b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020, 621b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 622b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9, 623b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020, 624b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4, 625b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7, 626b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0, 627b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4, 628b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x0000 629b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru }; 630b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 631b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t bLength = 0, brLength = 0; 632b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 633b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UnicodeString s1(chars); 634b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UnicodeString s2(chars_reverse); 635b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 636b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru char *bytes = extractBytes(s1, "IBM424", bLength); 637b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru char *bytes_r = extractBytes(s2, "IBM424", brLength); 638b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 639b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UCharsetDetector *csd = ucsdet_open(&status); 64059d709d503bab6e2b61931737e662dd293b40578ccornelius ucsdet_setDetectableCharset(csd, "IBM424_rtl", TRUE, &status); 64159d709d503bab6e2b61931737e662dd293b40578ccornelius ucsdet_setDetectableCharset(csd, "IBM424_ltr", TRUE, &status); 64259d709d503bab6e2b61931737e662dd293b40578ccornelius ucsdet_setDetectableCharset(csd, "IBM420_rtl", TRUE, &status); 64359d709d503bab6e2b61931737e662dd293b40578ccornelius ucsdet_setDetectableCharset(csd, "IBM420_ltr", TRUE, &status); 6446d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru if (U_FAILURE(status)) { 6456d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru errln("Error opening charset detector. - %s", u_errorName(status)); 6466d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru } 647b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru const UCharsetMatch *match; 648b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru const char *name; 649b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 650b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru ucsdet_setText(csd, bytes, bLength, &status); 651b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru match = ucsdet_detect(csd, &status); 652b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 653b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (match == NULL) { 6546d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru errcheckln(status, "Encoding detection failure for IBM424_rtl: got no matches. - %s", u_errorName(status)); 655b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru goto bail; 656b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 657b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 658b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru name = ucsdet_getName(match, &status); 659b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (strcmp(name, "IBM424_rtl") != 0) { 6606d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru errln("Encoding detection failure for IBM424_rtl: got %s", name); 661b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 662b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 663b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru ucsdet_setText(csd, bytes_r, brLength, &status); 664b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru match = ucsdet_detect(csd, &status); 665b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 666b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (match == NULL) { 6676d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru errln("Encoding detection failure for IBM424_ltr: got no matches."); 668b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru goto bail; 669b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 670b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 671b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru name = ucsdet_getName(match, &status); 672b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (strcmp(name, "IBM424_ltr") != 0) { 6736d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru errln("Encoding detection failure for IBM424_ltr: got %s", name); 674b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 675b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 676b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Querubail: 677b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru freeBytes(bytes); 678b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru freeBytes(bytes_r); 679b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru ucsdet_close(csd); 6801b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert#endif 681b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru} 682b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 683b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruvoid CharsetDetectionTest::IBM420Test() 684b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru{ 6851b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert#if !UCONFIG_ONLY_HTML_CONVERSION 686b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 687b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 688b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru static const UChar chars[] = { 689b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627, 690b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641, 691b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 692b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645, 693b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A, 694b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644, 695b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020, 696b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 697b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634, 698b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F, 699b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647, 700b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627, 701b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E, 702b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x0000 703b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru }; 704b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru static const UChar chars_reverse[] = { 705b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F, 706b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020, 707b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648, 708b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628, 709b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 710b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A, 711b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644, 712b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A, 713b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A, 714b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627, 715b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A, 716b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645, 717b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648, 718b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 0x0000, 719b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru }; 720b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 721b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t bLength = 0, brLength = 0; 722b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 723b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UnicodeString s1(chars); 724b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UnicodeString s2(chars_reverse); 725b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 726b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru char *bytes = extractBytes(s1, "IBM420", bLength); 727b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru char *bytes_r = extractBytes(s2, "IBM420", brLength); 728b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 729b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UCharsetDetector *csd = ucsdet_open(&status); 7306d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru if (U_FAILURE(status)) { 7316d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru errln("Error opening charset detector. - %s", u_errorName(status)); 7326d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru } 73359d709d503bab6e2b61931737e662dd293b40578ccornelius ucsdet_setDetectableCharset(csd, "IBM424_rtl", TRUE, &status); 73459d709d503bab6e2b61931737e662dd293b40578ccornelius ucsdet_setDetectableCharset(csd, "IBM424_ltr", TRUE, &status); 73559d709d503bab6e2b61931737e662dd293b40578ccornelius ucsdet_setDetectableCharset(csd, "IBM420_rtl", TRUE, &status); 73659d709d503bab6e2b61931737e662dd293b40578ccornelius ucsdet_setDetectableCharset(csd, "IBM420_ltr", TRUE, &status); 737b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru const UCharsetMatch *match; 738b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru const char *name; 739b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 740b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru ucsdet_setText(csd, bytes, bLength, &status); 741b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru match = ucsdet_detect(csd, &status); 742b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 743b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (match == NULL) { 7446d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru errcheckln(status, "Encoding detection failure for IBM420_rtl: got no matches. - %s", u_errorName(status)); 745b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru goto bail; 746b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 747b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 748b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru name = ucsdet_getName(match, &status); 749b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (strcmp(name, "IBM420_rtl") != 0) { 750b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru errln("Encoding detection failure for IBM420_rtl: got %s\n", name); 751b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 752b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 753b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru ucsdet_setText(csd, bytes_r, brLength, &status); 754b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru match = ucsdet_detect(csd, &status); 755b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 756b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (match == NULL) { 757b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru errln("Encoding detection failure for IBM420_ltr: got no matches.\n"); 758b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru goto bail; 759b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 760b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 761b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru name = ucsdet_getName(match, &status); 762b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (strcmp(name, "IBM420_ltr") != 0) { 763b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru errln("Encoding detection failure for IBM420_ltr: got %s\n", name); 764b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 765b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 766b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Querubail: 767b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru freeBytes(bytes); 768b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru freeBytes(bytes_r); 769b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru ucsdet_close(csd); 7701b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert#endif 771b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru} 772b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 773b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 774b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruvoid CharsetDetectionTest::Ticket6394Test() { 775b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#if !UCONFIG_NO_CONVERSION 776b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru const char charText[] = "Here is some random English text that should be detected as ISO-8859-1." 777b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru "Ticket 6394 claims that ISO-8859-1 will appear in the array of detected " 778b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru "encodings more than once. The hop through UnicodeString is for platforms " 779b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru "where this char * string is be EBCDIC and needs conversion to Latin1."; 780b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru char latin1Text[sizeof(charText)]; 781b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UnicodeString(charText).extract(0, sizeof(charText)-2, latin1Text, sizeof(latin1Text), "ISO-8859-1"); 782b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 783b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 784b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UCharsetDetector *csd = ucsdet_open(&status); 785b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru ucsdet_setText(csd, latin1Text, -1, &status); 786b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (U_FAILURE(status)) { 787b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru errln("Fail at file %s, line %d. status = %s", __FILE__, __LINE__, u_errorName(status)); 788b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru return; 789b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 790b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 791b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t matchCount = 0; 792b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status); 793b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (U_FAILURE(status)) { 794b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru errln("Fail at file %s, line %d. status = %s", __FILE__, __LINE__, u_errorName(status)); 795b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru return; 796b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 797b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 798b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UnicodeSet setOfCharsetNames; // UnicodSets can hold strings. 799b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t i; 800b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru for (i=0; i<matchCount; i++) { 801b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UnicodeString charSetName(ucsdet_getName(matches[i], &status)); 802b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (U_FAILURE(status)) { 803b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru errln("Fail at file %s, line %d. status = %s; i=%d", __FILE__, __LINE__, u_errorName(status), i); 804b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru status = U_ZERO_ERROR; 805b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 806b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (setOfCharsetNames.contains(charSetName)) { 807b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru errln("Fail at file %s, line %d ", __FILE__, __LINE__); 808b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru errln(UnicodeString(" Duplicate charset name = ") + charSetName); 809b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 810b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru setOfCharsetNames.add(charSetName); 811b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 812b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru ucsdet_close(csd); 813b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#endif 814b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru} 815b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 81654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 81754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius// Ticket 6954 - trouble with the haveC1Bytes flag that is used to distinguish between 81854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius// similar Windows and non-Windows SBCS encodings. State was kept in the shared 81954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius// Charset Recognizer objects, and could be overwritten. 82054dcd9b6a06071f647dac967e9e267abb9410720Craig Corneliusvoid CharsetDetectionTest::Ticket6954Test() { 82159d709d503bab6e2b61931737e662dd293b40578ccornelius#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_FORMATTING 82254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius UErrorCode status = U_ZERO_ERROR; 82354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly."; 82454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly." 82554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius "It also includes some \\u201CC1\\u201D bytes.", -1, US_INV); 82654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius UnicodeString sWindows = ssWindows.unescape(); 82754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius int32_t lISO = 0, lWindows = 0; 82854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius char *bISO = extractBytes(sISO, "ISO-8859-1", lISO); 82954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius char *bWindows = extractBytes(sWindows, "windows-1252", lWindows); 83054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 83154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // First do a plain vanilla detect of 1252 text 83254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 83354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius UCharsetDetector *csd1 = ucsdet_open(&status); 83454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius ucsdet_setText(csd1, bWindows, lWindows, &status); 83554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius const UCharsetMatch *match1 = ucsdet_detect(csd1, &status); 83654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius const char *name1 = ucsdet_getName(match1, &status); 83754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius TEST_ASSERT_SUCCESS(status); 83854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius TEST_ASSERT(strcmp(name1, "windows-1252")==0); 83954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 84054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // Next, using a completely separate detector, detect some 8859-1 text 84154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 84254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius UCharsetDetector *csd2 = ucsdet_open(&status); 84354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius ucsdet_setText(csd2, bISO, lISO, &status); 84454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius const UCharsetMatch *match2 = ucsdet_detect(csd2, &status); 84554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius const char *name2 = ucsdet_getName(match2, &status); 84654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius TEST_ASSERT_SUCCESS(status); 84754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius TEST_ASSERT(strcmp(name2, "ISO-8859-1")==0); 84854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 84954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // Recheck the 1252 results from the first detector, which should not have been 85054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // altered by the use of a different detector. 85154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 85254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius name1 = ucsdet_getName(match1, &status); 85354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius TEST_ASSERT_SUCCESS(status); 85454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius TEST_ASSERT(strcmp(name1, "windows-1252")==0); 85554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 85654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius ucsdet_close(csd1); 85754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius ucsdet_close(csd2); 85854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius freeBytes(bISO); 85954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius freeBytes(bWindows); 86054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius#endif 86154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius} 862