// CharacterEncodingDetector.cpp revision 544ad2be674423238c47650d2c8588ba7dfc9ed2
1544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen/* 2544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen * Copyright (C) 2013 The Android Open Source Project 3544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen * 4544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen * Licensed under the Apache License, Version 2.0 (the "License"); 5544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen * you may not use this file except in compliance with the License. 6544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen * You may obtain a copy of the License at 7544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen * 8544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen * http://www.apache.org/licenses/LICENSE-2.0 9544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen * 10544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen * Unless required by applicable law or agreed to in writing, software 11544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen * distributed under the License is distributed on an "AS IS" BASIS, 12544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen * See the License for the specific language governing permissions and 14544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen * limitations under the License. 
15544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen */ 16544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 17544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen//#define LOG_NDEBUG 0 18544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen#define LOG_TAG "CharacterEncodingDector" 19544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen#include <utils/Log.h> 20544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 21544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen#include "CharacterEncodingDetector.h" 22544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen#include "CharacterEncodingDetectorTables.h" 23544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 24544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen#include "utils/Vector.h" 25544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen#include "StringArray.h" 26544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 27544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen#include "unicode/ucnv.h" 28544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen#include "unicode/ucsdet.h" 29544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen#include "unicode/ustring.h" 30544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 31544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissennamespace android { 32544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 33544ad2be674423238c47650d2c8588ba7dfc9ed2Marco NelissenCharacterEncodingDetector::CharacterEncodingDetector() { 34544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 35544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen UErrorCode status = U_ZERO_ERROR; 36544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen mUtf8Conv = ucnv_open("UTF-8", &status); 37544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (U_FAILURE(status)) { 38544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ALOGE("could not create UConverter for UTF-8"); 39544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen mUtf8Conv = NULL; 
40544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 41544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen} 42544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 43544ad2be674423238c47650d2c8588ba7dfc9ed2Marco NelissenCharacterEncodingDetector::~CharacterEncodingDetector() { 44544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ucnv_close(mUtf8Conv); 45544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen} 46544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 47544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissenvoid CharacterEncodingDetector::addTag(const char *name, const char *value) { 48544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen mNames.push_back(name); 49544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen mValues.push_back(value); 50544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen} 51544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 52544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissensize_t CharacterEncodingDetector::size() { 53544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen return mNames.size(); 54544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen} 55544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 56544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissenstatus_t CharacterEncodingDetector::getTag(int index, const char **name, const char**value) { 57544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (index >= mNames.size()) { 58544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen return BAD_VALUE; 59544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 60544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 61544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen *name = mNames.getEntry(index); 62544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen *value = mValues.getEntry(index); 63544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen return OK; 64544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen} 65544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 
66544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissenstatic bool isPrintableAscii(const char *value, size_t len) { 67544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen for (size_t i = 0; i < len; i++) { 68544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if ((value[i] & 0x80) || value[i] < 0x20 || value[i] == 0x7f) { 69544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen return false; 70544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 71544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 72544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen return true; 73544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen} 74544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 75544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissenvoid CharacterEncodingDetector::detectAndConvert() { 76544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 77544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen int size = mNames.size(); 78544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ALOGV("%d tags before conversion", size); 79544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen for (int i = 0; i < size; i++) { 80544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ALOGV("%s: %s", mNames.getEntry(i), mValues.getEntry(i)); 81544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 82544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 83544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (size && mUtf8Conv) { 84544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 85544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen UErrorCode status = U_ZERO_ERROR; 86544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen UCharsetDetector *csd = ucsdet_open(&status); 87544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen const UCharsetMatch *ucm; 88544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 89544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen // try combined detection of artist/album/title etc. 
90544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen char buf[1024]; 91544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen buf[0] = 0; 92544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen int idx; 93544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen for (int i = 0; i < size; i++) { 94544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen const char *name = mNames.getEntry(i); 95544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen const char *value = mValues.getEntry(i); 96544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (!isPrintableAscii(value, strlen(value)) && ( 97544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen !strcmp(name, "artist") || 98544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen !strcmp(name, "albumartist") || 99544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen !strcmp(name, "composer") || 100544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen !strcmp(name, "genre") || 101544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen !strcmp(name, "album") || 102544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen !strcmp(name, "title"))) { 103544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen strlcat(buf, value, sizeof(buf)); 104544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen // separate tags by space so ICU's ngram detector can do its job 105544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen strlcat(buf, " ", sizeof(buf)); 106544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 107544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 108544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ucsdet_setText(csd, buf, strlen(buf), &status); 109544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 110544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen int32_t matches; 111544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen const UCharsetMatch** ucma = ucsdet_detectAll(csd, &matches, &status); 112544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen const char *combinedenc = "???"; 
113544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 114544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen const UCharsetMatch* bestCombinedMatch = getPreferred(buf, strlen(buf), ucma, matches); 115544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 116544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (bestCombinedMatch != NULL) { 117544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen combinedenc = ucsdet_getName(bestCombinedMatch, &status); 118544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 119544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 120544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen for (int i = 0; i < size; i++) { 121544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen const char *name = mNames.getEntry(i); 122544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen uint8_t* src = (uint8_t *)mValues.getEntry(i); 123544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen int len = strlen((char *)src); 124544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen uint8_t* dest = src; 125544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 126544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ALOGV("@@@ checking %s", name); 127544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen const char *s = mValues.getEntry(i); 128544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen int32_t inputLength = strlen(s); 129544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen const char *enc; 130544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 131544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (!strcmp(name, "artist") || 132544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen !strcmp(name, "albumartist") || 133544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen !strcmp(name, "composer") || 134544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen !strcmp(name, "genre") || 135544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen !strcmp(name, "album") || 136544ad2be674423238c47650d2c8588ba7dfc9ed2Marco 
Nelissen !strcmp(name, "title")) { 137544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen // use encoding determined from the combination of artist/album/title etc. 138544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen enc = combinedenc; 139544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } else { 140544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ucsdet_setText(csd, s, inputLength, &status); 141544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ucm = ucsdet_detect(csd, &status); 142544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (!ucm) { 143544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen mValues.setEntry(i, "???"); 144544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen continue; 145544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 146544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen enc = ucsdet_getName(ucm, &status); 147544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ALOGV("@@@@ recognized charset: %s for %s confidence %d", 148544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen enc, mNames.getEntry(i), ucsdet_getConfidence(ucm, &status)); 149544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 150544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 151544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (strcmp(enc,"UTF-8") != 0) { 152544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen // only convert if the source encoding isn't already UTF-8 153544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ALOGV("@@@ using converter %s for %s", enc, mNames.getEntry(i)); 154544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen UConverter *conv = ucnv_open(enc, &status); 155544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (U_FAILURE(status)) { 156544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ALOGE("could not create UConverter for %s", enc); 157544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen continue; 158544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 
159544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 160544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen // convert from native encoding to UTF-8 161544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen const char* source = mValues.getEntry(i); 162544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen int targetLength = len * 3 + 1; 163544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen char* buffer = new char[targetLength]; 164544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen // don't normally check for NULL, but in this case targetLength may be large 165544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (!buffer) 166544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen break; 167544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen char* target = buffer; 168544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 169544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ucnv_convertEx(mUtf8Conv, conv, &target, target + targetLength, 170544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen &source, source + strlen(source), 171544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen NULL, NULL, NULL, NULL, TRUE, TRUE, &status); 172544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 173544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (U_FAILURE(status)) { 174544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ALOGE("ucnv_convertEx failed: %d", status); 175544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen mValues.setEntry(i, "???"); 176544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } else { 177544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen // zero terminate 178544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen *target = 0; 179544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen mValues.setEntry(i, buffer); 180544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 181544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 182544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen delete[] buffer; 
183544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 184544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ucnv_close(conv); 185544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 186544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 187544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 188544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen for (int i = size - 1; i >= 0; --i) { 189544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (strlen(mValues.getEntry(i)) == 0) { 190544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ALOGV("erasing %s because entry is empty", mNames.getEntry(i)); 191544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen mNames.erase(i); 192544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen mValues.erase(i); 193544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 194544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 195544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 196544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ucsdet_close(csd); 197544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 198544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen} 199544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 200544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen/* 201544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen * When ICU detects multiple encoding matches, apply additional heuristics to determine 202544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen * which one is the best match, since ICU can't always be trusted to make the right choice. 
203544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen * 204544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen * What this method does is: 205544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen * - decode the input using each of the matches found 206544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen * - recalculate the starting confidence level for multibyte encodings using a different 207544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen * algorithm and larger frequent character lists than ICU 208544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen * - devalue encoding where the conversion contains unlikely characters (symbols, reserved, etc) 209544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen * - pick the highest match 210544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen */ 211544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissenconst UCharsetMatch *CharacterEncodingDetector::getPreferred( 212544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen const char *input, size_t len, const UCharsetMatch** ucma, size_t nummatches) { 213544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 214544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen Vector<const UCharsetMatch*> matches; 215544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen UErrorCode status = U_ZERO_ERROR; 216544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 217544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ALOGV("%d matches", nummatches); 218544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen for (size_t i = 0; i < nummatches; i++) { 219544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen const char *encname = ucsdet_getName(ucma[i], &status); 220544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen int confidence = ucsdet_getConfidence(ucma[i], &status); 221544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ALOGV("%d: %s %d", i, encname, confidence); 222544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen matches.push_back(ucma[i]); 
223544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 224544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 225544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen size_t num = matches.size(); 226544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (num == 0) { 227544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen return NULL; 228544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 229544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (num == 1) { 230544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen return matches[0]; 231544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 232544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 233544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ALOGV("considering %d matches", num); 234544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 235544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen // keep track of how many "special" characters result when converting the input using each 236544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen // encoding 237544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen Vector<int> newconfidence; 238544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen for (size_t i = 0; i < num; i++) { 239544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen const uint16_t *freqdata = NULL; 240544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen float freqcoverage = 0; 241544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen status = U_ZERO_ERROR; 242544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen const char *encname = ucsdet_getName(matches[i], &status); 243544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen int confidence = ucsdet_getConfidence(matches[i], &status); 244544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (!strcmp("GB18030", encname)) { 245544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen freqdata = frequent_zhCN; 246544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen freqcoverage = frequent_zhCN_coverage; 
247544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } else if (!strcmp("Big5", encname)) { 248544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen freqdata = frequent_zhTW; 249544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen freqcoverage = frequent_zhTW_coverage; 250544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } else if (!strcmp("EUC-KR", encname)) { 251544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen freqdata = frequent_ko; 252544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen freqcoverage = frequent_ko_coverage; 253544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } else if (!strcmp("EUC-JP", encname)) { 254544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen freqdata = frequent_ja; 255544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen freqcoverage = frequent_ja_coverage; 256544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } else if (!strcmp("Shift_JIS", encname)) { 257544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen freqdata = frequent_ja; 258544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen freqcoverage = frequent_ja_coverage; 259544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 260544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 261544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ALOGV("%d: %s %d", i, encname, confidence); 262544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen UConverter *conv = ucnv_open(encname, &status); 263544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen const char *source = input; 264544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen const char *sourceLimit = input + len; 265544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen status = U_ZERO_ERROR; 266544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen int demerit = 0; 267544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen int frequentchars = 0; 268544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen int totalchars = 0; 269544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen while 
(true) { 270544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen // demerit the current encoding for each "special" character found after conversion. 271544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen // The amount of demerit is somewhat arbitrarily chosen. 272544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen int inchar; 273544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (source != sourceLimit) { 274544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen inchar = (source[0] << 8) + source[1]; 275544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 276544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen UChar32 c = ucnv_getNextUChar(conv, &source, sourceLimit, &status); 277544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (!U_SUCCESS(status)) { 278544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen break; 279544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 280544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (c < 0x20 || (c >= 0x7f && c <= 0x009f)) { 281544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ALOGV("control character %x", c); 282544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen demerit += 100; 283544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } else if ((c >= 0xa0 && c <= 0xbe) // symbols, superscripts 284544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen || (c == 0xd7) || (c == 0xf7) // multiplication and division signs 285544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen || (c >= 0x2000 && c <= 0x209f)) { // punctuation, superscripts 286544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ALOGV("unlikely character %x", c); 287544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen demerit += 10; 288544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } else if (c >= 0xe000 && c <= 0xf8ff) { 289544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ALOGV("private use character %x", c); 290544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen demerit += 30; 
291544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } else if (c >= 0x2190 && c <= 0x2bff) { 292544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen // this range comprises various symbol ranges that are unlikely to appear in 293544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen // music file metadata. 294544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ALOGV("symbol %x", c); 295544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen demerit += 10; 296544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } else if (c == 0xfffd) { 297544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ALOGV("replacement character"); 298544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen demerit += 50; 299544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } else if (c >= 0xfff0 && c <= 0xfffc) { 300544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ALOGV("unicode special %x", c); 301544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen demerit += 50; 302544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } else if (freqdata != NULL) { 303544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen totalchars++; 304544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (isFrequent(freqdata, c)) { 305544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen frequentchars++; 306544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 307544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 308544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 309544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (freqdata != NULL && totalchars != 0) { 310544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen int myconfidence = 10 + float((100 * frequentchars) / totalchars) / freqcoverage; 311544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ALOGV("ICU confidence: %d, my confidence: %d (%d %d)", confidence, myconfidence, 312544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen totalchars, frequentchars); 313544ad2be674423238c47650d2c8588ba7dfc9ed2Marco 
Nelissen if (myconfidence > 100) myconfidence = 100; 314544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (myconfidence < 0) myconfidence = 0; 315544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen confidence = myconfidence; 316544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 317544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ALOGV("%d-%d=%d", confidence, demerit, confidence - demerit); 318544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen newconfidence.push_back(confidence - demerit); 319544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ucnv_close(conv); 320544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (i == 0 && (confidence - demerit) == 100) { 321544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen // no need to check any further, we'll end up using this match anyway 322544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen break; 323544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 324544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 325544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 326544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen // find match with highest confidence after adjusting for unlikely characters 327544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen int highest = newconfidence[0]; 328544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen size_t highestidx = 0; 329544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen num = newconfidence.size(); 330544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen for (size_t i = 1; i < num; i++) { 331544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (newconfidence[i] > highest) { 332544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen highest = newconfidence[i]; 333544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen highestidx = i; 334544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 335544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 336544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen status = 
U_ZERO_ERROR; 337544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ALOGV("selecting '%s' w/ %d confidence", ucsdet_getName(matches[highestidx], &status), highest); 338544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen return matches[highestidx]; 339544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen} 340544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 341544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 342544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissenbool CharacterEncodingDetector::isFrequent(const uint16_t *values, uint32_t c) { 343544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 344544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen int start = 0; 345544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen int end = 511; // All the tables have 512 entries 346544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen int mid = (start+end)/2; 347544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 348544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen while(start <= end) { 349544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if(c == values[mid]) { 350544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen return true; 351544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } else if (c > values[mid]) { 352544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen start = mid + 1; 353544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } else { 354544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen end = mid - 1; 355544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 356544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 357544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen mid = (start + end) / 2; 358544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 359544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 360544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen return false; 361544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen} 362544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 
363544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 364544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen} // namespace android 365