/*
 * Copyright (C) 2013 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
15544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen */ 16544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 17544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen//#define LOG_NDEBUG 0 18544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen#define LOG_TAG "CharacterEncodingDector" 19544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen#include <utils/Log.h> 20544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 2134581f44cde67960fbac3ba1f191a2c063ea5145Marco Nelissen#include <CharacterEncodingDetector.h> 22544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen#include "CharacterEncodingDetectorTables.h" 23544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 24544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen#include "utils/Vector.h" 25544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen#include "StringArray.h" 26544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 27544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen#include "unicode/ucnv.h" 28544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen#include "unicode/ucsdet.h" 29544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen#include "unicode/ustring.h" 30544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 31544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissennamespace android { 32544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 33544ad2be674423238c47650d2c8588ba7dfc9ed2Marco NelissenCharacterEncodingDetector::CharacterEncodingDetector() { 34544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 35544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen UErrorCode status = U_ZERO_ERROR; 36544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen mUtf8Conv = ucnv_open("UTF-8", &status); 37544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (U_FAILURE(status)) { 38544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ALOGE("could not create UConverter for UTF-8"); 39544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen mUtf8Conv = NULL; 
40544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 41544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen} 42544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 43544ad2be674423238c47650d2c8588ba7dfc9ed2Marco NelissenCharacterEncodingDetector::~CharacterEncodingDetector() { 44544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ucnv_close(mUtf8Conv); 45544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen} 46544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 47544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissenvoid CharacterEncodingDetector::addTag(const char *name, const char *value) { 48544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen mNames.push_back(name); 49544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen mValues.push_back(value); 50544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen} 51544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 52544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissensize_t CharacterEncodingDetector::size() { 53544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen return mNames.size(); 54544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen} 55544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 56544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissenstatus_t CharacterEncodingDetector::getTag(int index, const char **name, const char**value) { 57544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (index >= mNames.size()) { 58544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen return BAD_VALUE; 59544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 60544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 61544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen *name = mNames.getEntry(index); 62544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen *value = mValues.getEntry(index); 63544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen return OK; 64544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen} 65544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 
66544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissenstatic bool isPrintableAscii(const char *value, size_t len) { 67544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen for (size_t i = 0; i < len; i++) { 68544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if ((value[i] & 0x80) || value[i] < 0x20 || value[i] == 0x7f) { 69544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen return false; 70544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 71544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 72544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen return true; 73544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen} 74544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 75544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissenvoid CharacterEncodingDetector::detectAndConvert() { 76544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 77544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen int size = mNames.size(); 78544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ALOGV("%d tags before conversion", size); 79544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen for (int i = 0; i < size; i++) { 80544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ALOGV("%s: %s", mNames.getEntry(i), mValues.getEntry(i)); 81544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 82544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 83544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (size && mUtf8Conv) { 84544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 85544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen UErrorCode status = U_ZERO_ERROR; 86544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen UCharsetDetector *csd = ucsdet_open(&status); 87544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen const UCharsetMatch *ucm; 88544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 89544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen // try combined detection of artist/album/title etc. 
90544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen char buf[1024]; 91544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen buf[0] = 0; 92544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen int idx; 93bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen bool allprintable = true; 94544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen for (int i = 0; i < size; i++) { 95544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen const char *name = mNames.getEntry(i); 96544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen const char *value = mValues.getEntry(i); 97544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (!isPrintableAscii(value, strlen(value)) && ( 98544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen !strcmp(name, "artist") || 99544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen !strcmp(name, "albumartist") || 100544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen !strcmp(name, "composer") || 101544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen !strcmp(name, "genre") || 102544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen !strcmp(name, "album") || 103544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen !strcmp(name, "title"))) { 104544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen strlcat(buf, value, sizeof(buf)); 105544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen // separate tags by space so ICU's ngram detector can do its job 106544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen strlcat(buf, " ", sizeof(buf)); 107bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen allprintable = false; 108544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 109544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 110544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 111bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen const char *combinedenc = "UTF-8"; 112bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen if (allprintable) { 113bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen // since 
'buf' is empty, ICU would return a UTF-8 matcher with low confidence, so 114bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen // no need to even call it 11534fb29696b0f3abf61b10f8d053b1f33d501de0aMark Salyzyn ALOGV("all tags are printable, assuming ascii (%zu)", strlen(buf)); 116bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen } else { 117bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen ucsdet_setText(csd, buf, strlen(buf), &status); 118bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen int32_t matches; 119bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen const UCharsetMatch** ucma = ucsdet_detectAll(csd, &matches, &status); 120bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen bool goodmatch = true; 12134581f44cde67960fbac3ba1f191a2c063ea5145Marco Nelissen int highest = 0; 122bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen const UCharsetMatch* bestCombinedMatch = getPreferred(buf, strlen(buf), 12334581f44cde67960fbac3ba1f191a2c063ea5145Marco Nelissen ucma, matches, &goodmatch, &highest); 124bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen 12534581f44cde67960fbac3ba1f191a2c063ea5145Marco Nelissen ALOGV("goodmatch: %s, highest: %d", goodmatch ? "true" : "false", highest); 12634581f44cde67960fbac3ba1f191a2c063ea5145Marco Nelissen if (!goodmatch && (highest < 15 || strlen(buf) < 20)) { 127bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen ALOGV("not a good match, trying with more data"); 128bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen // This string might be too short for ICU to do anything useful with. 
129bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen // (real world example: "Björk" in ISO-8859-1 might be detected as GB18030, because 130bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen // the ISO detector reports a confidence of 0, while the GB18030 detector reports 131bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen // a confidence of 10 with no invalid characters) 132bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen // Append artist, album and title if they were previously omitted because they 133bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen // were printable ascii. 134bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen bool added = false; 135bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen for (int i = 0; i < size; i++) { 136bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen const char *name = mNames.getEntry(i); 137bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen const char *value = mValues.getEntry(i); 138bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen if (isPrintableAscii(value, strlen(value)) && ( 139bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen !strcmp(name, "artist") || 140bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen !strcmp(name, "album") || 141bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen !strcmp(name, "title"))) { 142bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen strlcat(buf, value, sizeof(buf)); 143bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen strlcat(buf, " ", sizeof(buf)); 144bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen added = true; 145bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen } 146bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen } 147bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen if (added) { 148bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen ucsdet_setText(csd, buf, strlen(buf), &status); 149bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen ucma = ucsdet_detectAll(csd, &matches, &status); 
150bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen bestCombinedMatch = getPreferred(buf, strlen(buf), 15134581f44cde67960fbac3ba1f191a2c063ea5145Marco Nelissen ucma, matches, &goodmatch, &highest); 15234581f44cde67960fbac3ba1f191a2c063ea5145Marco Nelissen if (!goodmatch && highest <= 15) { 153bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen ALOGV("still not a good match after adding printable tags"); 15434581f44cde67960fbac3ba1f191a2c063ea5145Marco Nelissen bestCombinedMatch = NULL; 155bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen } 156bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen } else { 157bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen ALOGV("no printable tags to add"); 158bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen } 159bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen } 160544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 161bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen if (bestCombinedMatch != NULL) { 162bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen combinedenc = ucsdet_getName(bestCombinedMatch, &status); 16334581f44cde67960fbac3ba1f191a2c063ea5145Marco Nelissen } else { 16434581f44cde67960fbac3ba1f191a2c063ea5145Marco Nelissen combinedenc = "ISO-8859-1"; 165bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen } 166544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 167544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 168544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen for (int i = 0; i < size; i++) { 169544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen const char *name = mNames.getEntry(i); 170544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen uint8_t* src = (uint8_t *)mValues.getEntry(i); 171544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen int len = strlen((char *)src); 172544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen uint8_t* dest = src; 173544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 
174544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ALOGV("@@@ checking %s", name); 175544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen const char *s = mValues.getEntry(i); 176544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen int32_t inputLength = strlen(s); 177544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen const char *enc; 178544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 1791392eb3d1802e9f894f87d7a7387207d1b6faca1Glenn Kasten if (!allprintable && (!strcmp(name, "artist") || 180544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen !strcmp(name, "albumartist") || 181544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen !strcmp(name, "composer") || 182544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen !strcmp(name, "genre") || 183544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen !strcmp(name, "album") || 1841392eb3d1802e9f894f87d7a7387207d1b6faca1Glenn Kasten !strcmp(name, "title"))) { 185544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen // use encoding determined from the combination of artist/album/title etc. 
186544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen enc = combinedenc; 187544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } else { 188bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen if (isPrintableAscii(s, inputLength)) { 189bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen enc = "UTF-8"; 190bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen ALOGV("@@@@ %s is ascii", mNames.getEntry(i)); 191bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen } else { 192bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen ucsdet_setText(csd, s, inputLength, &status); 193bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen ucm = ucsdet_detect(csd, &status); 194bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen if (!ucm) { 195bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen mValues.setEntry(i, "???"); 196bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen continue; 197bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen } 198bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen enc = ucsdet_getName(ucm, &status); 199bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen ALOGV("@@@@ recognized charset: %s for %s confidence %d", 200bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen enc, mNames.getEntry(i), ucsdet_getConfidence(ucm, &status)); 201544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 202544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 203544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 204544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (strcmp(enc,"UTF-8") != 0) { 205544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen // only convert if the source encoding isn't already UTF-8 206544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ALOGV("@@@ using converter %s for %s", enc, mNames.getEntry(i)); 20734581f44cde67960fbac3ba1f191a2c063ea5145Marco Nelissen status = U_ZERO_ERROR; 208544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen UConverter *conv = ucnv_open(enc, &status); 
209544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (U_FAILURE(status)) { 21034581f44cde67960fbac3ba1f191a2c063ea5145Marco Nelissen ALOGW("could not create UConverter for %s (%d), falling back to ISO-8859-1", 21134581f44cde67960fbac3ba1f191a2c063ea5145Marco Nelissen enc, status); 21234581f44cde67960fbac3ba1f191a2c063ea5145Marco Nelissen status = U_ZERO_ERROR; 21334581f44cde67960fbac3ba1f191a2c063ea5145Marco Nelissen conv = ucnv_open("ISO-8859-1", &status); 21434581f44cde67960fbac3ba1f191a2c063ea5145Marco Nelissen if (U_FAILURE(status)) { 21534581f44cde67960fbac3ba1f191a2c063ea5145Marco Nelissen ALOGW("could not create UConverter for ISO-8859-1 either"); 21634581f44cde67960fbac3ba1f191a2c063ea5145Marco Nelissen continue; 21734581f44cde67960fbac3ba1f191a2c063ea5145Marco Nelissen } 218544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 219544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 220544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen // convert from native encoding to UTF-8 221544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen const char* source = mValues.getEntry(i); 222544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen int targetLength = len * 3 + 1; 223544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen char* buffer = new char[targetLength]; 224544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen // don't normally check for NULL, but in this case targetLength may be large 225544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (!buffer) 226544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen break; 227544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen char* target = buffer; 228544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 229544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ucnv_convertEx(mUtf8Conv, conv, &target, target + targetLength, 230544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen &source, source + strlen(source), 231544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen NULL, NULL, 
NULL, NULL, TRUE, TRUE, &status); 232544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 233544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (U_FAILURE(status)) { 234544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ALOGE("ucnv_convertEx failed: %d", status); 235544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen mValues.setEntry(i, "???"); 236544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } else { 237544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen // zero terminate 238544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen *target = 0; 23934581f44cde67960fbac3ba1f191a2c063ea5145Marco Nelissen // strip trailing spaces 24034581f44cde67960fbac3ba1f191a2c063ea5145Marco Nelissen while (--target > buffer && *target == ' ') { 24134581f44cde67960fbac3ba1f191a2c063ea5145Marco Nelissen *target = 0; 24234581f44cde67960fbac3ba1f191a2c063ea5145Marco Nelissen } 24334581f44cde67960fbac3ba1f191a2c063ea5145Marco Nelissen // skip leading spaces 24434581f44cde67960fbac3ba1f191a2c063ea5145Marco Nelissen char *start = buffer; 24534581f44cde67960fbac3ba1f191a2c063ea5145Marco Nelissen while (*start == ' ') { 24634581f44cde67960fbac3ba1f191a2c063ea5145Marco Nelissen start++; 24734581f44cde67960fbac3ba1f191a2c063ea5145Marco Nelissen } 24834581f44cde67960fbac3ba1f191a2c063ea5145Marco Nelissen mValues.setEntry(i, start); 249544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 250544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 251544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen delete[] buffer; 252544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 253544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ucnv_close(conv); 254544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 255544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 256544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 257544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen for (int i = size - 1; i >= 0; --i) { 
258544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (strlen(mValues.getEntry(i)) == 0) { 259544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ALOGV("erasing %s because entry is empty", mNames.getEntry(i)); 260544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen mNames.erase(i); 261544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen mValues.erase(i); 262544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 263544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 264544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 265544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ucsdet_close(csd); 266544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 267544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen} 268544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 269544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen/* 270544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen * When ICU detects multiple encoding matches, apply additional heuristics to determine 271544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen * which one is the best match, since ICU can't always be trusted to make the right choice. 
272544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen * 273544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen * What this method does is: 274544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen * - decode the input using each of the matches found 275544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen * - recalculate the starting confidence level for multibyte encodings using a different 276544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen * algorithm and larger frequent character lists than ICU 277544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen * - devalue encoding where the conversion contains unlikely characters (symbols, reserved, etc) 278544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen * - pick the highest match 279bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen * - signal to the caller whether this match is considered good: confidence > 15, and confidence 280bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen * delta with the next runner up > 15 281544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen */ 282544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissenconst UCharsetMatch *CharacterEncodingDetector::getPreferred( 283bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen const char *input, size_t len, 284bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen const UCharsetMatch** ucma, size_t nummatches, 28534581f44cde67960fbac3ba1f191a2c063ea5145Marco Nelissen bool *goodmatch, int *highestmatch) { 286544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 287bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen *goodmatch = false; 288544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen Vector<const UCharsetMatch*> matches; 289544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen UErrorCode status = U_ZERO_ERROR; 290544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 29134fb29696b0f3abf61b10f8d053b1f33d501de0aMark Salyzyn ALOGV("%zu matches", nummatches); 292544ad2be674423238c47650d2c8588ba7dfc9ed2Marco 
Nelissen for (size_t i = 0; i < nummatches; i++) { 293544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen const char *encname = ucsdet_getName(ucma[i], &status); 294544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen int confidence = ucsdet_getConfidence(ucma[i], &status); 29534fb29696b0f3abf61b10f8d053b1f33d501de0aMark Salyzyn ALOGV("%zu: %s %d", i, encname, confidence); 296544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen matches.push_back(ucma[i]); 297544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 298544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 299544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen size_t num = matches.size(); 300544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (num == 0) { 301544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen return NULL; 302544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 303544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (num == 1) { 304bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen int confidence = ucsdet_getConfidence(matches[0], &status); 305bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen if (confidence > 15) { 306bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen *goodmatch = true; 307bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen } 308544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen return matches[0]; 309544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 310544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 31134fb29696b0f3abf61b10f8d053b1f33d501de0aMark Salyzyn ALOGV("considering %zu matches", num); 312544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 313544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen // keep track of how many "special" characters result when converting the input using each 314544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen // encoding 315544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen Vector<int> newconfidence; 316544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 
for (size_t i = 0; i < num; i++) { 317544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen const uint16_t *freqdata = NULL; 318544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen float freqcoverage = 0; 319544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen status = U_ZERO_ERROR; 320544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen const char *encname = ucsdet_getName(matches[i], &status); 321544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen int confidence = ucsdet_getConfidence(matches[i], &status); 322544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (!strcmp("GB18030", encname)) { 323544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen freqdata = frequent_zhCN; 324544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen freqcoverage = frequent_zhCN_coverage; 325544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } else if (!strcmp("Big5", encname)) { 326544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen freqdata = frequent_zhTW; 327544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen freqcoverage = frequent_zhTW_coverage; 328544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } else if (!strcmp("EUC-KR", encname)) { 329544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen freqdata = frequent_ko; 330544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen freqcoverage = frequent_ko_coverage; 331544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } else if (!strcmp("EUC-JP", encname)) { 332544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen freqdata = frequent_ja; 333544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen freqcoverage = frequent_ja_coverage; 334544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } else if (!strcmp("Shift_JIS", encname)) { 335544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen freqdata = frequent_ja; 336544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen freqcoverage = frequent_ja_coverage; 337544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 
338544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 33934fb29696b0f3abf61b10f8d053b1f33d501de0aMark Salyzyn ALOGV("%zu: %s %d", i, encname, confidence); 34034581f44cde67960fbac3ba1f191a2c063ea5145Marco Nelissen status = U_ZERO_ERROR; 341544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen UConverter *conv = ucnv_open(encname, &status); 34234581f44cde67960fbac3ba1f191a2c063ea5145Marco Nelissen int demerit = 0; 34334581f44cde67960fbac3ba1f191a2c063ea5145Marco Nelissen if (U_FAILURE(status)) { 34434581f44cde67960fbac3ba1f191a2c063ea5145Marco Nelissen ALOGV("failed to open %s: %d", encname, status); 34534581f44cde67960fbac3ba1f191a2c063ea5145Marco Nelissen confidence = 0; 34634581f44cde67960fbac3ba1f191a2c063ea5145Marco Nelissen demerit += 1000; 34734581f44cde67960fbac3ba1f191a2c063ea5145Marco Nelissen } 348544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen const char *source = input; 349544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen const char *sourceLimit = input + len; 350544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen status = U_ZERO_ERROR; 351544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen int frequentchars = 0; 352544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen int totalchars = 0; 353544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen while (true) { 354544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen // demerit the current encoding for each "special" character found after conversion. 355544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen // The amount of demerit is somewhat arbitrarily chosen. 
356544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen int inchar; 357544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (source != sourceLimit) { 358544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen inchar = (source[0] << 8) + source[1]; 359544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 360544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen UChar32 c = ucnv_getNextUChar(conv, &source, sourceLimit, &status); 361544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (!U_SUCCESS(status)) { 362544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen break; 363544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 364544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (c < 0x20 || (c >= 0x7f && c <= 0x009f)) { 365544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ALOGV("control character %x", c); 366544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen demerit += 100; 36734581f44cde67960fbac3ba1f191a2c063ea5145Marco Nelissen } else if ((c == 0xa0) // no-break space 36834581f44cde67960fbac3ba1f191a2c063ea5145Marco Nelissen || (c >= 0xa2 && c <= 0xbe) // symbols, superscripts 369544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen || (c == 0xd7) || (c == 0xf7) // multiplication and division signs 370544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen || (c >= 0x2000 && c <= 0x209f)) { // punctuation, superscripts 371544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ALOGV("unlikely character %x", c); 372544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen demerit += 10; 373544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } else if (c >= 0xe000 && c <= 0xf8ff) { 374544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ALOGV("private use character %x", c); 375544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen demerit += 30; 376544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } else if (c >= 0x2190 && c <= 0x2bff) { 377544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen // this range comprises various 
symbol ranges that are unlikely to appear in 378544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen // music file metadata. 379544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ALOGV("symbol %x", c); 380544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen demerit += 10; 381544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } else if (c == 0xfffd) { 382544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ALOGV("replacement character"); 383544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen demerit += 50; 384544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } else if (c >= 0xfff0 && c <= 0xfffc) { 385544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ALOGV("unicode special %x", c); 386544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen demerit += 50; 387544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } else if (freqdata != NULL) { 388544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen totalchars++; 389544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (isFrequent(freqdata, c)) { 390544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen frequentchars++; 391544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 392544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 393544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 394544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (freqdata != NULL && totalchars != 0) { 395544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen int myconfidence = 10 + float((100 * frequentchars) / totalchars) / freqcoverage; 396544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ALOGV("ICU confidence: %d, my confidence: %d (%d %d)", confidence, myconfidence, 397544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen totalchars, frequentchars); 398544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (myconfidence > 100) myconfidence = 100; 399544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (myconfidence < 0) myconfidence = 0; 
400544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen confidence = myconfidence; 401544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 402544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ALOGV("%d-%d=%d", confidence, demerit, confidence - demerit); 403544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen newconfidence.push_back(confidence - demerit); 404544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ucnv_close(conv); 405544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (i == 0 && (confidence - demerit) == 100) { 406544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen // no need to check any further, we'll end up using this match anyway 407544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen break; 408544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 409544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 410544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 411544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen // find match with highest confidence after adjusting for unlikely characters 412544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen int highest = newconfidence[0]; 413544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen size_t highestidx = 0; 414bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen int runnerup = -10000; 415bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen int runnerupidx = -10000; 416544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen num = newconfidence.size(); 417544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen for (size_t i = 1; i < num; i++) { 418544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (newconfidence[i] > highest) { 419bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen runnerup = highest; 420bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen runnerupidx = highestidx; 421544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen highest = newconfidence[i]; 422544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen highestidx = i; 
423bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen } else if (newconfidence[i] > runnerup){ 424bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen runnerup = newconfidence[i]; 425bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen runnerupidx = i; 426544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 427544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 428544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen status = U_ZERO_ERROR; 429bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen ALOGV("selecting: '%s' w/ %d confidence", 430bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen ucsdet_getName(matches[highestidx], &status), highest); 431bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen if (runnerupidx < 0) { 432bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen ALOGV("no runner up"); 433bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen if (highest > 15) { 434bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen *goodmatch = true; 435bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen } 436bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen } else { 437bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen ALOGV("runner up: '%s' w/ %d confidence", 438bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen ucsdet_getName(matches[runnerupidx], &status), runnerup); 43934581f44cde67960fbac3ba1f191a2c063ea5145Marco Nelissen if (runnerup < 0) { 44034581f44cde67960fbac3ba1f191a2c063ea5145Marco Nelissen runnerup = 0; 44134581f44cde67960fbac3ba1f191a2c063ea5145Marco Nelissen } 442bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen if ((highest - runnerup) > 15) { 443bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen *goodmatch = true; 444bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen } 445bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen } 44634581f44cde67960fbac3ba1f191a2c063ea5145Marco Nelissen *highestmatch = highest; 447544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen return 
matches[highestidx]; 448544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen} 449544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 450544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 451544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissenbool CharacterEncodingDetector::isFrequent(const uint16_t *values, uint32_t c) { 452544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 453544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen int start = 0; 454544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen int end = 511; // All the tables have 512 entries 455544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen int mid = (start+end)/2; 456544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 457544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen while(start <= end) { 458544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if(c == values[mid]) { 459544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen return true; 460544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } else if (c > values[mid]) { 461544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen start = mid + 1; 462544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } else { 463544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen end = mid - 1; 464544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 465544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 466544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen mid = (start + end) / 2; 467544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 468544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 469544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen return false; 470544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen} 471544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 472544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 473544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen} // namespace android 474