1// Copyright (C) 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html 3/* 4 ********************************************************************** 5 * Copyright (C) 2005-2013, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ********************************************************************** 8 */ 9 10#include "unicode/utypes.h" 11 12#if !UCONFIG_NO_CONVERSION 13 14#include "csrucode.h" 15#include "csmatch.h" 16 17U_NAMESPACE_BEGIN 18 19CharsetRecog_Unicode::~CharsetRecog_Unicode() 20{ 21 // nothing to do 22} 23 24CharsetRecog_UTF_16_BE::~CharsetRecog_UTF_16_BE() 25{ 26 // nothing to do 27} 28 29const char *CharsetRecog_UTF_16_BE::getName() const 30{ 31 return "UTF-16BE"; 32} 33 34// UTF-16 confidence calculation. Very simple minded, but better than nothing. 35// Any 8 bit non-control characters bump the confidence up. These have a zero high byte, 36// and are very likely to be UTF-16, although they could also be part of a UTF-32 code. 37// NULs are a contra-indication, they will appear commonly if the actual encoding is UTF-32. 38// NULs should be rare in actual text. 39 40static int32_t adjustConfidence(UChar codeUnit, int32_t confidence) { 41 if (codeUnit == 0) { 42 confidence -= 10; 43 } else if ((codeUnit >= 0x20 && codeUnit <= 0xff) || codeUnit == 0x0a) { 44 confidence += 10; 45 } 46 if (confidence < 0) { 47 confidence = 0; 48 } else if (confidence > 100) { 49 confidence = 100; 50 } 51 return confidence; 52} 53 54 55UBool CharsetRecog_UTF_16_BE::match(InputText* textIn, CharsetMatch *results) const 56{ 57 const uint8_t *input = textIn->fRawInput; 58 int32_t confidence = 10; 59 int32_t length = textIn->fRawLength; 60 61 int32_t bytesToCheck = (length > 30) ? 30 : length; 62 for (int32_t charIndex=0; charIndex<bytesToCheck-1; charIndex+=2) { 63 UChar codeUnit = (input[charIndex] << 8) | input[charIndex + 1]; 64 if (charIndex == 0 && codeUnit == 0xFEFF) { 65 confidence = 100; 66 break; 67 } 68 confidence = adjustConfidence(codeUnit, confidence); 69 if (confidence == 0 || confidence == 100) { 70 break; 71 } 72 } 73 if (bytesToCheck < 4 && confidence < 100) { 74 confidence = 0; 75 } 76 results->set(textIn, this, confidence); 77 return (confidence > 0); 78} 79 80CharsetRecog_UTF_16_LE::~CharsetRecog_UTF_16_LE() 81{ 82 // nothing to do 83} 84 85const char *CharsetRecog_UTF_16_LE::getName() const 86{ 87 return "UTF-16LE"; 88} 89 90UBool CharsetRecog_UTF_16_LE::match(InputText* textIn, CharsetMatch *results) const 91{ 92 const uint8_t *input = textIn->fRawInput; 93 int32_t confidence = 10; 94 int32_t length = textIn->fRawLength; 95 96 int32_t bytesToCheck = (length > 30) ? 30 : length; 97 for (int32_t charIndex=0; charIndex<bytesToCheck-1; charIndex+=2) { 98 UChar codeUnit = input[charIndex] | (input[charIndex + 1] << 8); 99 if (charIndex == 0 && codeUnit == 0xFEFF) { 100 confidence = 100; // UTF-16 BOM 101 if (length >= 4 && input[2] == 0 && input[3] == 0) { 102 confidence = 0; // UTF-32 BOM 103 } 104 break; 105 } 106 confidence = adjustConfidence(codeUnit, confidence); 107 if (confidence == 0 || confidence == 100) { 108 break; 109 } 110 } 111 if (bytesToCheck < 4 && confidence < 100) { 112 confidence = 0; 113 } 114 results->set(textIn, this, confidence); 115 return (confidence > 0); 116} 117 118CharsetRecog_UTF_32::~CharsetRecog_UTF_32() 119{ 120 // nothing to do 121} 122 123UBool CharsetRecog_UTF_32::match(InputText* textIn, CharsetMatch *results) const 124{ 125 const uint8_t *input = textIn->fRawInput; 126 int32_t limit = (textIn->fRawLength / 4) * 4; 127 int32_t numValid = 0; 128 int32_t numInvalid = 0; 129 bool hasBOM = FALSE; 130 int32_t confidence = 0; 131 132 if (limit > 0 && getChar(input, 0) == 0x0000FEFFUL) { 133 hasBOM = TRUE; 134 } 135 136 for(int32_t i = 0; i < limit; i += 4) { 137 int32_t ch = getChar(input, i); 138 139 if (ch < 0 || ch >= 0x10FFFF || (ch >= 0xD800 && ch <= 0xDFFF)) { 140 numInvalid += 1; 141 } else { 142 numValid += 1; 143 } 144 } 145 146 147 // Cook up some sort of confidence score, based on presense of a BOM 148 // and the existence of valid and/or invalid multi-byte sequences. 149 if (hasBOM && numInvalid==0) { 150 confidence = 100; 151 } else if (hasBOM && numValid > numInvalid*10) { 152 confidence = 80; 153 } else if (numValid > 3 && numInvalid == 0) { 154 confidence = 100; 155 } else if (numValid > 0 && numInvalid == 0) { 156 confidence = 80; 157 } else if (numValid > numInvalid*10) { 158 // Probably corruput UTF-32BE data. Valid sequences aren't likely by chance. 159 confidence = 25; 160 } 161 162 results->set(textIn, this, confidence); 163 return (confidence > 0); 164} 165 166CharsetRecog_UTF_32_BE::~CharsetRecog_UTF_32_BE() 167{ 168 // nothing to do 169} 170 171const char *CharsetRecog_UTF_32_BE::getName() const 172{ 173 return "UTF-32BE"; 174} 175 176int32_t CharsetRecog_UTF_32_BE::getChar(const uint8_t *input, int32_t index) const 177{ 178 return input[index + 0] << 24 | input[index + 1] << 16 | 179 input[index + 2] << 8 | input[index + 3]; 180} 181 182CharsetRecog_UTF_32_LE::~CharsetRecog_UTF_32_LE() 183{ 184 // nothing to do 185} 186 187const char *CharsetRecog_UTF_32_LE::getName() const 188{ 189 return "UTF-32LE"; 190} 191 192int32_t CharsetRecog_UTF_32_LE::getChar(const uint8_t *input, int32_t index) const 193{ 194 return input[index + 3] << 24 | input[index + 2] << 16 | 195 input[index + 1] << 8 | input[index + 0]; 196} 197 198U_NAMESPACE_END 199#endif 200 201