CharacterEncodingDetector.cpp revision 34fb29696b0f3abf61b10f8d053b1f33d501de0a
1544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen/* 2544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen * Copyright (C) 2013 The Android Open Source Project 3544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen * 4544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen * Licensed under the Apache License, Version 2.0 (the "License"); 5544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen * you may not use this file except in compliance with the License. 6544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen * You may obtain a copy of the License at 7544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen * 8544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen * http://www.apache.org/licenses/LICENSE-2.0 9544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen * 10544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen * Unless required by applicable law or agreed to in writing, software 11544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen * distributed under the License is distributed on an "AS IS" BASIS, 12544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen * See the License for the specific language governing permissions and 14544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen * limitations under the License. 15544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen */ 16544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 17544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen//#define LOG_NDEBUG 0 18544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen#define LOG_TAG "CharacterEncodingDector" 19544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen#include <utils/Log.h> 20544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 21544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen#include "CharacterEncodingDetector.h" 22544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen#include "CharacterEncodingDetectorTables.h" 23544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 24544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen#include "utils/Vector.h" 25544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen#include "StringArray.h" 26544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 27544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen#include "unicode/ucnv.h" 28544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen#include "unicode/ucsdet.h" 29544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen#include "unicode/ustring.h" 30544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 31544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissennamespace android { 32544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 33544ad2be674423238c47650d2c8588ba7dfc9ed2Marco NelissenCharacterEncodingDetector::CharacterEncodingDetector() { 34544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 35544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen UErrorCode status = U_ZERO_ERROR; 36544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen mUtf8Conv = ucnv_open("UTF-8", &status); 37544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (U_FAILURE(status)) { 38544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ALOGE("could not create UConverter for UTF-8"); 39544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen mUtf8Conv = NULL; 40544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 41544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen} 42544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 43544ad2be674423238c47650d2c8588ba7dfc9ed2Marco NelissenCharacterEncodingDetector::~CharacterEncodingDetector() { 44544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ucnv_close(mUtf8Conv); 45544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen} 46544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 47544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissenvoid CharacterEncodingDetector::addTag(const char *name, const char *value) { 48544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen mNames.push_back(name); 49544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen mValues.push_back(value); 50544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen} 51544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 52544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissensize_t CharacterEncodingDetector::size() { 53544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen return mNames.size(); 54544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen} 55544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 56544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissenstatus_t CharacterEncodingDetector::getTag(int index, const char **name, const char**value) { 57544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (index >= mNames.size()) { 58544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen return BAD_VALUE; 59544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 60544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 61544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen *name = mNames.getEntry(index); 62544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen *value = mValues.getEntry(index); 63544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen return OK; 64544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen} 65544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 66544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissenstatic bool isPrintableAscii(const char *value, size_t len) { 67544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen for (size_t i = 0; i < len; i++) { 68544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if ((value[i] & 0x80) || value[i] < 0x20 || value[i] == 0x7f) { 69544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen return false; 70544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 71544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 72544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen return true; 73544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen} 74544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 75544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissenvoid CharacterEncodingDetector::detectAndConvert() { 76544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 77544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen int size = mNames.size(); 78544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ALOGV("%d tags before conversion", size); 79544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen for (int i = 0; i < size; i++) { 80544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ALOGV("%s: %s", mNames.getEntry(i), mValues.getEntry(i)); 81544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 82544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 83544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (size && mUtf8Conv) { 84544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 85544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen UErrorCode status = U_ZERO_ERROR; 86544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen UCharsetDetector *csd = ucsdet_open(&status); 87544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen const UCharsetMatch *ucm; 88544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 89544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen // try combined detection of artist/album/title etc. 90544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen char buf[1024]; 91544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen buf[0] = 0; 92544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen int idx; 93bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen bool allprintable = true; 94544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen for (int i = 0; i < size; i++) { 95544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen const char *name = mNames.getEntry(i); 96544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen const char *value = mValues.getEntry(i); 97544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (!isPrintableAscii(value, strlen(value)) && ( 98544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen !strcmp(name, "artist") || 99544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen !strcmp(name, "albumartist") || 100544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen !strcmp(name, "composer") || 101544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen !strcmp(name, "genre") || 102544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen !strcmp(name, "album") || 103544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen !strcmp(name, "title"))) { 104544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen strlcat(buf, value, sizeof(buf)); 105544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen // separate tags by space so ICU's ngram detector can do its job 106544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen strlcat(buf, " ", sizeof(buf)); 107bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen allprintable = false; 108544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 109544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 110544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 111bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen const char *combinedenc = "UTF-8"; 112bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen if (allprintable) { 113bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen // since 'buf' is empty, ICU would return a UTF-8 matcher with low confidence, so 114bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen // no need to even call it 11534fb29696b0f3abf61b10f8d053b1f33d501de0aMark Salyzyn ALOGV("all tags are printable, assuming ascii (%zu)", strlen(buf)); 116bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen } else { 117bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen ucsdet_setText(csd, buf, strlen(buf), &status); 118bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen int32_t matches; 119bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen const UCharsetMatch** ucma = ucsdet_detectAll(csd, &matches, &status); 120bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen bool goodmatch = true; 121bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen const UCharsetMatch* bestCombinedMatch = getPreferred(buf, strlen(buf), 122bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen ucma, matches, &goodmatch); 123bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen 124bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen if (!goodmatch && strlen(buf) < 20) { 125bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen ALOGV("not a good match, trying with more data"); 126bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen // This string might be too short for ICU to do anything useful with. 127bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen // (real world example: "Björk" in ISO-8859-1 might be detected as GB18030, because 128bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen // the ISO detector reports a confidence of 0, while the GB18030 detector reports 129bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen // a confidence of 10 with no invalid characters) 130bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen // Append artist, album and title if they were previously omitted because they 131bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen // were printable ascii. 132bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen bool added = false; 133bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen for (int i = 0; i < size; i++) { 134bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen const char *name = mNames.getEntry(i); 135bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen const char *value = mValues.getEntry(i); 136bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen if (isPrintableAscii(value, strlen(value)) && ( 137bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen !strcmp(name, "artist") || 138bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen !strcmp(name, "album") || 139bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen !strcmp(name, "title"))) { 140bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen strlcat(buf, value, sizeof(buf)); 141bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen strlcat(buf, " ", sizeof(buf)); 142bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen added = true; 143bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen } 144bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen } 145bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen if (added) { 146bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen ucsdet_setText(csd, buf, strlen(buf), &status); 147bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen ucma = ucsdet_detectAll(csd, &matches, &status); 148bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen bestCombinedMatch = getPreferred(buf, strlen(buf), 149bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen ucma, matches, &goodmatch); 150bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen if (!goodmatch) { 151bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen ALOGV("still not a good match after adding printable tags"); 152bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen } 153bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen } else { 154bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen ALOGV("no printable tags to add"); 155bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen } 156bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen } 157544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 158bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen if (bestCombinedMatch != NULL) { 159bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen combinedenc = ucsdet_getName(bestCombinedMatch, &status); 160bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen } 161544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 162544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 163544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen for (int i = 0; i < size; i++) { 164544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen const char *name = mNames.getEntry(i); 165544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen uint8_t* src = (uint8_t *)mValues.getEntry(i); 166544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen int len = strlen((char *)src); 167544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen uint8_t* dest = src; 168544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 169544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ALOGV("@@@ checking %s", name); 170544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen const char *s = mValues.getEntry(i); 171544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen int32_t inputLength = strlen(s); 172544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen const char *enc; 173544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 1741392eb3d1802e9f894f87d7a7387207d1b6faca1Glenn Kasten if (!allprintable && (!strcmp(name, "artist") || 175544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen !strcmp(name, "albumartist") || 176544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen !strcmp(name, "composer") || 177544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen !strcmp(name, "genre") || 178544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen !strcmp(name, "album") || 1791392eb3d1802e9f894f87d7a7387207d1b6faca1Glenn Kasten !strcmp(name, "title"))) { 180544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen // use encoding determined from the combination of artist/album/title etc. 181544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen enc = combinedenc; 182544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } else { 183bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen if (isPrintableAscii(s, inputLength)) { 184bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen enc = "UTF-8"; 185bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen ALOGV("@@@@ %s is ascii", mNames.getEntry(i)); 186bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen } else { 187bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen ucsdet_setText(csd, s, inputLength, &status); 188bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen ucm = ucsdet_detect(csd, &status); 189bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen if (!ucm) { 190bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen mValues.setEntry(i, "???"); 191bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen continue; 192bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen } 193bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen enc = ucsdet_getName(ucm, &status); 194bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen ALOGV("@@@@ recognized charset: %s for %s confidence %d", 195bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen enc, mNames.getEntry(i), ucsdet_getConfidence(ucm, &status)); 196544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 197544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 198544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 199544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (strcmp(enc,"UTF-8") != 0) { 200544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen // only convert if the source encoding isn't already UTF-8 201544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ALOGV("@@@ using converter %s for %s", enc, mNames.getEntry(i)); 202544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen UConverter *conv = ucnv_open(enc, &status); 203544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (U_FAILURE(status)) { 204544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ALOGE("could not create UConverter for %s", enc); 205544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen continue; 206544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 207544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 208544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen // convert from native encoding to UTF-8 209544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen const char* source = mValues.getEntry(i); 210544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen int targetLength = len * 3 + 1; 211544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen char* buffer = new char[targetLength]; 212544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen // don't normally check for NULL, but in this case targetLength may be large 213544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (!buffer) 214544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen break; 215544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen char* target = buffer; 216544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 217544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ucnv_convertEx(mUtf8Conv, conv, &target, target + targetLength, 218544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen &source, source + strlen(source), 219544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen NULL, NULL, NULL, NULL, TRUE, TRUE, &status); 220544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 221544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (U_FAILURE(status)) { 222544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ALOGE("ucnv_convertEx failed: %d", status); 223544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen mValues.setEntry(i, "???"); 224544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } else { 225544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen // zero terminate 226544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen *target = 0; 227544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen mValues.setEntry(i, buffer); 228544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 229544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 230544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen delete[] buffer; 231544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 232544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ucnv_close(conv); 233544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 234544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 235544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 236544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen for (int i = size - 1; i >= 0; --i) { 237544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (strlen(mValues.getEntry(i)) == 0) { 238544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ALOGV("erasing %s because entry is empty", mNames.getEntry(i)); 239544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen mNames.erase(i); 240544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen mValues.erase(i); 241544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 242544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 243544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 244544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ucsdet_close(csd); 245544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 246544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen} 247544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 248544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen/* 249544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen * When ICU detects multiple encoding matches, apply additional heuristics to determine 250544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen * which one is the best match, since ICU can't always be trusted to make the right choice. 251544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen * 252544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen * What this method does is: 253544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen * - decode the input using each of the matches found 254544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen * - recalculate the starting confidence level for multibyte encodings using a different 255544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen * algorithm and larger frequent character lists than ICU 256544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen * - devalue encoding where the conversion contains unlikely characters (symbols, reserved, etc) 257544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen * - pick the highest match 258bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen * - signal to the caller whether this match is considered good: confidence > 15, and confidence 259bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen * delta with the next runner up > 15 260544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen */ 261544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissenconst UCharsetMatch *CharacterEncodingDetector::getPreferred( 262bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen const char *input, size_t len, 263bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen const UCharsetMatch** ucma, size_t nummatches, 264bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen bool *goodmatch) { 265544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 266bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen *goodmatch = false; 267544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen Vector<const UCharsetMatch*> matches; 268544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen UErrorCode status = U_ZERO_ERROR; 269544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 27034fb29696b0f3abf61b10f8d053b1f33d501de0aMark Salyzyn ALOGV("%zu matches", nummatches); 271544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen for (size_t i = 0; i < nummatches; i++) { 272544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen const char *encname = ucsdet_getName(ucma[i], &status); 273544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen int confidence = ucsdet_getConfidence(ucma[i], &status); 27434fb29696b0f3abf61b10f8d053b1f33d501de0aMark Salyzyn ALOGV("%zu: %s %d", i, encname, confidence); 275544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen matches.push_back(ucma[i]); 276544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 277544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 278544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen size_t num = matches.size(); 279544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (num == 0) { 280544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen return NULL; 281544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 282544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (num == 1) { 283bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen int confidence = ucsdet_getConfidence(matches[0], &status); 284bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen if (confidence > 15) { 285bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen *goodmatch = true; 286bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen } 287544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen return matches[0]; 288544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 289544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 29034fb29696b0f3abf61b10f8d053b1f33d501de0aMark Salyzyn ALOGV("considering %zu matches", num); 291544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 292544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen // keep track of how many "special" characters result when converting the input using each 293544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen // encoding 294544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen Vector<int> newconfidence; 295544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen for (size_t i = 0; i < num; i++) { 296544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen const uint16_t *freqdata = NULL; 297544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen float freqcoverage = 0; 298544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen status = U_ZERO_ERROR; 299544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen const char *encname = ucsdet_getName(matches[i], &status); 300544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen int confidence = ucsdet_getConfidence(matches[i], &status); 301544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (!strcmp("GB18030", encname)) { 302544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen freqdata = frequent_zhCN; 303544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen freqcoverage = frequent_zhCN_coverage; 304544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } else if (!strcmp("Big5", encname)) { 305544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen freqdata = frequent_zhTW; 306544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen freqcoverage = frequent_zhTW_coverage; 307544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } else if (!strcmp("EUC-KR", encname)) { 308544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen freqdata = frequent_ko; 309544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen freqcoverage = frequent_ko_coverage; 310544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } else if (!strcmp("EUC-JP", encname)) { 311544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen freqdata = frequent_ja; 312544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen freqcoverage = frequent_ja_coverage; 313544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } else if (!strcmp("Shift_JIS", encname)) { 314544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen freqdata = frequent_ja; 315544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen freqcoverage = frequent_ja_coverage; 316544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 317544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 31834fb29696b0f3abf61b10f8d053b1f33d501de0aMark Salyzyn ALOGV("%zu: %s %d", i, encname, confidence); 319544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen UConverter *conv = ucnv_open(encname, &status); 320544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen const char *source = input; 321544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen const char *sourceLimit = input + len; 322544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen status = U_ZERO_ERROR; 323544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen int demerit = 0; 324544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen int frequentchars = 0; 325544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen int totalchars = 0; 326544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen while (true) { 327544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen // demerit the current encoding for each "special" character found after conversion. 328544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen // The amount of demerit is somewhat arbitrarily chosen. 329544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen int inchar; 330544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (source != sourceLimit) { 331544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen inchar = (source[0] << 8) + source[1]; 332544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 333544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen UChar32 c = ucnv_getNextUChar(conv, &source, sourceLimit, &status); 334544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (!U_SUCCESS(status)) { 335544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen break; 336544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 337544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (c < 0x20 || (c >= 0x7f && c <= 0x009f)) { 338544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ALOGV("control character %x", c); 339544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen demerit += 100; 340544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } else if ((c >= 0xa0 && c <= 0xbe) // symbols, superscripts 341544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen || (c == 0xd7) || (c == 0xf7) // multiplication and division signs 342544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen || (c >= 0x2000 && c <= 0x209f)) { // punctuation, superscripts 343544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ALOGV("unlikely character %x", c); 344544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen demerit += 10; 345544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } else if (c >= 0xe000 && c <= 0xf8ff) { 346544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ALOGV("private use character %x", c); 347544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen demerit += 30; 348544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } else if (c >= 0x2190 && c <= 0x2bff) { 349544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen // this range comprises various symbol ranges that are unlikely to appear in 350544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen // music file metadata. 351544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ALOGV("symbol %x", c); 352544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen demerit += 10; 353544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } else if (c == 0xfffd) { 354544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ALOGV("replacement character"); 355544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen demerit += 50; 356544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } else if (c >= 0xfff0 && c <= 0xfffc) { 357544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ALOGV("unicode special %x", c); 358544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen demerit += 50; 359544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } else if (freqdata != NULL) { 360544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen totalchars++; 361544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (isFrequent(freqdata, c)) { 362544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen frequentchars++; 363544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 364544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 365544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 366544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (freqdata != NULL && totalchars != 0) { 367544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen int myconfidence = 10 + float((100 * frequentchars) / totalchars) / freqcoverage; 368544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ALOGV("ICU confidence: %d, my confidence: %d (%d %d)", confidence, myconfidence, 369544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen totalchars, frequentchars); 370544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (myconfidence > 100) myconfidence = 100; 371544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (myconfidence < 0) myconfidence = 0; 372544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen confidence = myconfidence; 373544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 374544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ALOGV("%d-%d=%d", confidence, demerit, confidence - demerit); 375544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen newconfidence.push_back(confidence - demerit); 376544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen ucnv_close(conv); 377544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (i == 0 && (confidence - demerit) == 100) { 378544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen // no need to check any further, we'll end up using this match anyway 379544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen break; 380544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 381544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 382544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 383544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen // find match with highest confidence after adjusting for unlikely characters 384544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen int highest = newconfidence[0]; 385544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen size_t highestidx = 0; 386bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen int runnerup = -10000; 387bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen int runnerupidx = -10000; 388544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen num = newconfidence.size(); 389544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen for (size_t i = 1; i < num; i++) { 390544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if (newconfidence[i] > highest) { 391bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen runnerup = highest; 392bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen runnerupidx = highestidx; 393544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen highest = newconfidence[i]; 394544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen highestidx = i; 395bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen } else if (newconfidence[i] > runnerup){ 396bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen runnerup = newconfidence[i]; 397bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen runnerupidx = i; 398544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 399544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 400544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen status = U_ZERO_ERROR; 401bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen ALOGV("selecting: '%s' w/ %d confidence", 402bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen ucsdet_getName(matches[highestidx], &status), highest); 403bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen if (runnerupidx < 0) { 404bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen ALOGV("no runner up"); 405bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen if (highest > 15) { 406bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen *goodmatch = true; 407bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen } 408bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen } else { 409bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen ALOGV("runner up: '%s' w/ %d confidence", 410bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen ucsdet_getName(matches[runnerupidx], &status), runnerup); 411bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen if ((highest - runnerup) > 15) { 412bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen *goodmatch = true; 413bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen } 414bfd55f243feb3f04e26ad07aae035475768ada8aMarco Nelissen } 415544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen return matches[highestidx]; 416544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen} 417544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 418544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 419544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissenbool CharacterEncodingDetector::isFrequent(const uint16_t *values, uint32_t c) { 420544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 421544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen int start = 0; 422544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen int end = 511; // All the tables have 512 entries 423544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen int mid = (start+end)/2; 424544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 425544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen while(start <= end) { 426544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen if(c == values[mid]) { 427544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen return true; 428544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } else if (c > values[mid]) { 429544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen start = mid + 1; 430544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } else { 431544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen end = mid - 1; 432544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 433544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 434544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen mid = (start + end) / 2; 435544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen } 436544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 437544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen return false; 438544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen} 439544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 440544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen 441544ad2be674423238c47650d2c8588ba7dfc9ed2Marco Nelissen} // namespace android 442