CharacterEncodingDetector.cpp revision 34581f44cde67960fbac3ba1f191a2c063ea5145
/*
 * Copyright (C) 2013 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
166ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette
176ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette//#define LOG_NDEBUG 0
186ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette#define LOG_TAG "CharacterEncodingDector"
196ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette#include <utils/Log.h>
20f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette
216ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette#include <CharacterEncodingDetector.h>
226ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette#include "CharacterEncodingDetectorTables.h"
236ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette
246ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette#include "utils/Vector.h"
256ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette#include "StringArray.h"
266ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette
27f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette#include "unicode/ucnv.h"
28f6829a0a618b4523619ec53c996b04d67e3186b9Chris Craik#include "unicode/ucsdet.h"
296ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette#include "unicode/ustring.h"
306ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette
316ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverettenamespace android {
326ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette
33f872ee0057ed247aa93589347f1b53afc99517f8Alan ViveretteCharacterEncodingDetector::CharacterEncodingDetector() {
346ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette
35f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette    UErrorCode status = U_ZERO_ERROR;
360671f05fa94a234652c1cf3c6e0c2e123566f76fAlan Viverette    mUtf8Conv = ucnv_open("UTF-8", &status);
376ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette    if (U_FAILURE(status)) {
386ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette        ALOGE("could not create UConverter for UTF-8");
39f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette        mUtf8Conv = NULL;
40f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette    }
41f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette}
426ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette
436ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan ViveretteCharacterEncodingDetector::~CharacterEncodingDetector() {
44f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette    ucnv_close(mUtf8Conv);
45f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette}
46f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette
47f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverettevoid CharacterEncodingDetector::addTag(const char *name, const char *value) {
486ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette    mNames.push_back(name);
496ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette    mValues.push_back(value);
50f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette}
516dfa60f33ca6018959ebff1efde82db7d2aed1e3Alan Viverette
520671f05fa94a234652c1cf3c6e0c2e123566f76fAlan Viverettesize_t CharacterEncodingDetector::size() {
530671f05fa94a234652c1cf3c6e0c2e123566f76fAlan Viverette    return mNames.size();
540671f05fa94a234652c1cf3c6e0c2e123566f76fAlan Viverette}
550671f05fa94a234652c1cf3c6e0c2e123566f76fAlan Viverette
560671f05fa94a234652c1cf3c6e0c2e123566f76fAlan Viverettestatus_t CharacterEncodingDetector::getTag(int index, const char **name, const char**value) {
57b7303a36baf8d0ac3efdeeee3310ef5974ba9ceaJorim Jaggi    if (index >= mNames.size()) {
580671f05fa94a234652c1cf3c6e0c2e123566f76fAlan Viverette        return BAD_VALUE;
590671f05fa94a234652c1cf3c6e0c2e123566f76fAlan Viverette    }
606ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette
616ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette    *name = mNames.getEntry(index);
62f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette    *value = mValues.getEntry(index);
63f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette    return OK;
646ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette}
656ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette
/*
 * Reports whether the first 'len' bytes of 'value' are all printable ASCII, i.e. every
 * byte lies in the range 0x20..0x7e. A byte with the high bit set, a control character
 * below 0x20, or DEL (0x7f) makes the result false. An empty range (len == 0) is
 * considered printable.
 */
static bool isPrintableAscii(const char *value, size_t len) {
    const char *cursor = value;
    const char *const limit = value + len;
    while (cursor != limit) {
        // inspect the byte as unsigned so the check does not depend on char signedness
        const unsigned char byte = static_cast<unsigned char>(*cursor);
        ++cursor;
        if (byte < 0x20 || byte > 0x7e) {
            return false;
        }
    }
    return true;
}
74f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette
75f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverettevoid CharacterEncodingDetector::detectAndConvert() {
766ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette
776ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette    int size = mNames.size();
786ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette    ALOGV("%d tags before conversion", size);
796ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette    for (int i = 0; i < size; i++) {
806ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette        ALOGV("%s: %s", mNames.getEntry(i), mValues.getEntry(i));
816ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette    }
82f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette
83f6829a0a618b4523619ec53c996b04d67e3186b9Chris Craik    if (size && mUtf8Conv) {
84f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette
85f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette        UErrorCode status = U_ZERO_ERROR;
866ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette        UCharsetDetector *csd = ucsdet_open(&status);
876ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette        const UCharsetMatch *ucm;
88f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette
89f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette        // try combined detection of artist/album/title etc.
90f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette        char buf[1024];
91f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette        buf[0] = 0;
92f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette        int idx;
93fdbb98e56d4668c7bfa8de59c3c438c0cb69a535Alan Viverette        bool allprintable = true;
94f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette        for (int i = 0; i < size; i++) {
95f92f26fef215897bd302c1c06adbe5d853881b3fAlan Viverette            const char *name = mNames.getEntry(i);
96f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette            const char *value = mValues.getEntry(i);
97f92f26fef215897bd302c1c06adbe5d853881b3fAlan Viverette            if (!isPrintableAscii(value, strlen(value)) && (
986ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette                        !strcmp(name, "artist") ||
99f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette                        !strcmp(name, "albumartist") ||
1006dfa60f33ca6018959ebff1efde82db7d2aed1e3Alan Viverette                        !strcmp(name, "composer") ||
1016dfa60f33ca6018959ebff1efde82db7d2aed1e3Alan Viverette                        !strcmp(name, "genre") ||
102f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette                        !strcmp(name, "album") ||
103f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette                        !strcmp(name, "title"))) {
104f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette                strlcat(buf, value, sizeof(buf));
1056dfa60f33ca6018959ebff1efde82db7d2aed1e3Alan Viverette                // separate tags by space so ICU's ngram detector can do its job
106f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette                strlcat(buf, " ", sizeof(buf));
107f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette                allprintable = false;
108f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette            }
109f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette        }
110f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette
1116ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette        const char *combinedenc = "UTF-8";
112f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette        if (allprintable) {
1136ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette            // since 'buf' is empty, ICU would return a UTF-8 matcher with low confidence, so
114f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette            // no need to even call it
1150671f05fa94a234652c1cf3c6e0c2e123566f76fAlan Viverette            ALOGV("all tags are printable, assuming ascii (%zu)", strlen(buf));
1160671f05fa94a234652c1cf3c6e0c2e123566f76fAlan Viverette        } else {
117f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette            ucsdet_setText(csd, buf, strlen(buf), &status);
118f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette            int32_t matches;
119f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette            const UCharsetMatch** ucma = ucsdet_detectAll(csd, &matches, &status);
120f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette            bool goodmatch = true;
121f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette            int highest = 0;
1226ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette            const UCharsetMatch* bestCombinedMatch = getPreferred(buf, strlen(buf),
123f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette                    ucma, matches, &goodmatch, &highest);
1246ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette
1256ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette            ALOGV("goodmatch: %s, highest: %d", goodmatch ? "true" : "false", highest);
126f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette            if (!goodmatch && (highest < 15 || strlen(buf) < 20)) {
1276ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette                ALOGV("not a good match, trying with more data");
1286ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette                // This string might be too short for ICU to do anything useful with.
129f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette                // (real world example: "Björk" in ISO-8859-1 might be detected as GB18030, because
130f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette                //  the ISO detector reports a confidence of 0, while the GB18030 detector reports
131f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette                //  a confidence of 10 with no invalid characters)
1326ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette                // Append artist, album and title if they were previously omitted because they
133f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette                // were printable ascii.
134f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette                bool added = false;
135f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette                for (int i = 0; i < size; i++) {
1366ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette                    const char *name = mNames.getEntry(i);
137f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette                    const char *value = mValues.getEntry(i);
138f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette                    if (isPrintableAscii(value, strlen(value)) && (
139f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette                                !strcmp(name, "artist") ||
140f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette                                !strcmp(name, "album") ||
1416ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette                                !strcmp(name, "title"))) {
1420671f05fa94a234652c1cf3c6e0c2e123566f76fAlan Viverette                        strlcat(buf, value, sizeof(buf));
1430671f05fa94a234652c1cf3c6e0c2e123566f76fAlan Viverette                        strlcat(buf, " ", sizeof(buf));
1446ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette                        added = true;
145f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette                    }
146f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette                }
147f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette                if (added) {
148f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette                    ucsdet_setText(csd, buf, strlen(buf), &status);
149f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette                    ucma = ucsdet_detectAll(csd, &matches, &status);
1500671f05fa94a234652c1cf3c6e0c2e123566f76fAlan Viverette                    bestCombinedMatch = getPreferred(buf, strlen(buf),
1510671f05fa94a234652c1cf3c6e0c2e123566f76fAlan Viverette                            ucma, matches, &goodmatch, &highest);
1520671f05fa94a234652c1cf3c6e0c2e123566f76fAlan Viverette                    if (!goodmatch && highest <= 15) {
1530671f05fa94a234652c1cf3c6e0c2e123566f76fAlan Viverette                        ALOGV("still not a good match after adding printable tags");
154f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette                        bestCombinedMatch = NULL;
1556ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette                    }
156191ac0a72b0a7ba883d9cebee6eff71fbea4ef17Alan Viverette                } else {
157191ac0a72b0a7ba883d9cebee6eff71fbea4ef17Alan Viverette                    ALOGV("no printable tags to add");
158191ac0a72b0a7ba883d9cebee6eff71fbea4ef17Alan Viverette                }
159191ac0a72b0a7ba883d9cebee6eff71fbea4ef17Alan Viverette            }
160191ac0a72b0a7ba883d9cebee6eff71fbea4ef17Alan Viverette
161191ac0a72b0a7ba883d9cebee6eff71fbea4ef17Alan Viverette            if (bestCombinedMatch != NULL) {
162191ac0a72b0a7ba883d9cebee6eff71fbea4ef17Alan Viverette                combinedenc = ucsdet_getName(bestCombinedMatch, &status);
163191ac0a72b0a7ba883d9cebee6eff71fbea4ef17Alan Viverette            } else {
164191ac0a72b0a7ba883d9cebee6eff71fbea4ef17Alan Viverette                combinedenc = "ISO-8859-1";
165f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette            }
1666ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette        }
1676ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette
168f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette        for (int i = 0; i < size; i++) {
169f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette            const char *name = mNames.getEntry(i);
170f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette            uint8_t* src = (uint8_t *)mValues.getEntry(i);
1716ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette            int len = strlen((char *)src);
1726ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette            uint8_t* dest = src;
173f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette
174f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette            ALOGV("@@@ checking %s", name);
175f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette            const char *s = mValues.getEntry(i);
1766ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette            int32_t inputLength = strlen(s);
1776ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette            const char *enc;
1786ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette
179f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette            if (!allprintable && (!strcmp(name, "artist") ||
180f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette                    !strcmp(name, "albumartist") ||
181f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette                    !strcmp(name, "composer") ||
182f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette                    !strcmp(name, "genre") ||
183f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette                    !strcmp(name, "album") ||
1849bc11ac168d63900589158074028e6c480579421Alan Viverette                    !strcmp(name, "title"))) {
1859bc11ac168d63900589158074028e6c480579421Alan Viverette                // use encoding determined from the combination of artist/album/title etc.
1866ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette                enc = combinedenc;
187f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette            } else {
188f872ee0057ed247aa93589347f1b53afc99517f8Alan Viverette                if (isPrintableAscii(s, inputLength)) {
1896ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette                    enc = "UTF-8";
1906ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette                    ALOGV("@@@@ %s is ascii", mNames.getEntry(i));
1916ce6d70f9c78f0197f1369246bf55a5f6b8d7ba4Alan Viverette                } else {
192                    ucsdet_setText(csd, s, inputLength, &status);
193                    ucm = ucsdet_detect(csd, &status);
194                    if (!ucm) {
195                        mValues.setEntry(i, "???");
196                        continue;
197                    }
198                    enc = ucsdet_getName(ucm, &status);
199                    ALOGV("@@@@ recognized charset: %s for %s confidence %d",
200                            enc, mNames.getEntry(i), ucsdet_getConfidence(ucm, &status));
201                }
202            }
203
204            if (strcmp(enc,"UTF-8") != 0) {
205                // only convert if the source encoding isn't already UTF-8
206                ALOGV("@@@ using converter %s for %s", enc, mNames.getEntry(i));
207                status = U_ZERO_ERROR;
208                UConverter *conv = ucnv_open(enc, &status);
209                if (U_FAILURE(status)) {
210                    ALOGW("could not create UConverter for %s (%d), falling back to ISO-8859-1",
211                            enc, status);
212                    status = U_ZERO_ERROR;
213                    conv = ucnv_open("ISO-8859-1", &status);
214                    if (U_FAILURE(status)) {
215                        ALOGW("could not create UConverter for ISO-8859-1 either");
216                        continue;
217                    }
218                }
219
220                // convert from native encoding to UTF-8
221                const char* source = mValues.getEntry(i);
222                int targetLength = len * 3 + 1;
223                char* buffer = new char[targetLength];
224                // don't normally check for NULL, but in this case targetLength may be large
225                if (!buffer)
226                    break;
227                char* target = buffer;
228
229                ucnv_convertEx(mUtf8Conv, conv, &target, target + targetLength,
230                        &source, source + strlen(source),
231                        NULL, NULL, NULL, NULL, TRUE, TRUE, &status);
232
233                if (U_FAILURE(status)) {
234                    ALOGE("ucnv_convertEx failed: %d", status);
235                    mValues.setEntry(i, "???");
236                } else {
237                    // zero terminate
238                    *target = 0;
239                    // strip trailing spaces
240                    while (--target > buffer && *target == ' ') {
241                        *target = 0;
242                    }
243                    // skip leading spaces
244                    char *start = buffer;
245                    while (*start == ' ') {
246                        start++;
247                    }
248                    mValues.setEntry(i, start);
249                }
250
251                delete[] buffer;
252
253                ucnv_close(conv);
254            }
255        }
256
257        for (int i = size - 1; i >= 0; --i) {
258            if (strlen(mValues.getEntry(i)) == 0) {
259                ALOGV("erasing %s because entry is empty", mNames.getEntry(i));
260                mNames.erase(i);
261                mValues.erase(i);
262            }
263        }
264
265        ucsdet_close(csd);
266    }
267}
268
269/*
270 * When ICU detects multiple encoding matches, apply additional heuristics to determine
271 * which one is the best match, since ICU can't always be trusted to make the right choice.
272 *
273 * What this method does is:
274 * - decode the input using each of the matches found
275 * - recalculate the starting confidence level for multibyte encodings using a different
276 *   algorithm and larger frequent character lists than ICU
277 * - devalue encoding where the conversion contains unlikely characters (symbols, reserved, etc)
278 * - pick the highest match
279 * - signal to the caller whether this match is considered good: confidence > 15, and confidence
280 *   delta with the next runner up > 15
281 */
282const UCharsetMatch *CharacterEncodingDetector::getPreferred(
283        const char *input, size_t len,
284        const UCharsetMatch** ucma, size_t nummatches,
285        bool *goodmatch, int *highestmatch) {
286
287    *goodmatch = false;
288    Vector<const UCharsetMatch*> matches;
289    UErrorCode status = U_ZERO_ERROR;
290
291    ALOGV("%zu matches", nummatches);
292    for (size_t i = 0; i < nummatches; i++) {
293        const char *encname = ucsdet_getName(ucma[i], &status);
294        int confidence = ucsdet_getConfidence(ucma[i], &status);
295        ALOGV("%zu: %s %d", i, encname, confidence);
296        matches.push_back(ucma[i]);
297    }
298
299    size_t num = matches.size();
300    if (num == 0) {
301        return NULL;
302    }
303    if (num == 1) {
304        int confidence = ucsdet_getConfidence(matches[0], &status);
305        if (confidence > 15) {
306            *goodmatch = true;
307        }
308        return matches[0];
309    }
310
311    ALOGV("considering %zu matches", num);
312
313    // keep track of how many "special" characters result when converting the input using each
314    // encoding
315    Vector<int> newconfidence;
316    for (size_t i = 0; i < num; i++) {
317        const uint16_t *freqdata = NULL;
318        float freqcoverage = 0;
319        status = U_ZERO_ERROR;
320        const char *encname = ucsdet_getName(matches[i], &status);
321        int confidence = ucsdet_getConfidence(matches[i], &status);
322        if (!strcmp("GB18030", encname)) {
323            freqdata = frequent_zhCN;
324            freqcoverage = frequent_zhCN_coverage;
325        } else if (!strcmp("Big5", encname)) {
326            freqdata = frequent_zhTW;
327            freqcoverage = frequent_zhTW_coverage;
328        } else if (!strcmp("EUC-KR", encname)) {
329            freqdata = frequent_ko;
330            freqcoverage = frequent_ko_coverage;
331        } else if (!strcmp("EUC-JP", encname)) {
332            freqdata = frequent_ja;
333            freqcoverage = frequent_ja_coverage;
334        } else if (!strcmp("Shift_JIS", encname)) {
335            freqdata = frequent_ja;
336            freqcoverage = frequent_ja_coverage;
337        }
338
339        ALOGV("%zu: %s %d", i, encname, confidence);
340        status = U_ZERO_ERROR;
341        UConverter *conv = ucnv_open(encname, &status);
342        int demerit = 0;
343        if (U_FAILURE(status)) {
344            ALOGV("failed to open %s: %d", encname, status);
345            confidence = 0;
346            demerit += 1000;
347        }
348        const char *source = input;
349        const char *sourceLimit = input + len;
350        status = U_ZERO_ERROR;
351        int frequentchars = 0;
352        int totalchars = 0;
353        while (true) {
354            // demerit the current encoding for each "special" character found after conversion.
355            // The amount of demerit is somewhat arbitrarily chosen.
356            int inchar;
357            if (source != sourceLimit) {
358                inchar = (source[0] << 8) + source[1];
359            }
360            UChar32 c = ucnv_getNextUChar(conv, &source, sourceLimit, &status);
361            if (!U_SUCCESS(status)) {
362                break;
363            }
364            if (c < 0x20 || (c >= 0x7f && c <= 0x009f)) {
365                ALOGV("control character %x", c);
366                demerit += 100;
367            } else if ((c == 0xa0)                      // no-break space
368                    || (c >= 0xa2 && c <= 0xbe)         // symbols, superscripts
369                    || (c == 0xd7) || (c == 0xf7)       // multiplication and division signs
370                    || (c >= 0x2000 && c <= 0x209f)) {  // punctuation, superscripts
371                ALOGV("unlikely character %x", c);
372                demerit += 10;
373            } else if (c >= 0xe000 && c <= 0xf8ff) {
374                ALOGV("private use character %x", c);
375                demerit += 30;
376            } else if (c >= 0x2190 && c <= 0x2bff) {
377                // this range comprises various symbol ranges that are unlikely to appear in
378                // music file metadata.
379                ALOGV("symbol %x", c);
380                demerit += 10;
381            } else if (c == 0xfffd) {
382                ALOGV("replacement character");
383                demerit += 50;
384            } else if (c >= 0xfff0 && c <= 0xfffc) {
385                ALOGV("unicode special %x", c);
386                demerit += 50;
387            } else if (freqdata != NULL) {
388                totalchars++;
389                if (isFrequent(freqdata, c)) {
390                    frequentchars++;
391                }
392            }
393        }
394        if (freqdata != NULL && totalchars != 0) {
395            int myconfidence = 10 + float((100 * frequentchars) / totalchars) / freqcoverage;
396            ALOGV("ICU confidence: %d, my confidence: %d (%d %d)", confidence, myconfidence,
397                    totalchars, frequentchars);
398            if (myconfidence > 100) myconfidence = 100;
399            if (myconfidence < 0) myconfidence = 0;
400            confidence = myconfidence;
401        }
402        ALOGV("%d-%d=%d", confidence, demerit, confidence - demerit);
403        newconfidence.push_back(confidence - demerit);
404        ucnv_close(conv);
405        if (i == 0 && (confidence - demerit) == 100) {
406            // no need to check any further, we'll end up using this match anyway
407            break;
408        }
409    }
410
411    // find match with highest confidence after adjusting for unlikely characters
412    int highest = newconfidence[0];
413    size_t highestidx = 0;
414    int runnerup = -10000;
415    int runnerupidx = -10000;
416    num = newconfidence.size();
417    for (size_t i = 1; i < num; i++) {
418        if (newconfidence[i] > highest) {
419            runnerup = highest;
420            runnerupidx = highestidx;
421            highest = newconfidence[i];
422            highestidx = i;
423        } else if (newconfidence[i] > runnerup){
424            runnerup = newconfidence[i];
425            runnerupidx = i;
426        }
427    }
428    status = U_ZERO_ERROR;
429    ALOGV("selecting: '%s' w/ %d confidence",
430            ucsdet_getName(matches[highestidx], &status), highest);
431    if (runnerupidx < 0) {
432        ALOGV("no runner up");
433        if (highest > 15) {
434            *goodmatch = true;
435        }
436    } else {
437        ALOGV("runner up: '%s' w/ %d confidence",
438                ucsdet_getName(matches[runnerupidx], &status), runnerup);
439        if (runnerup < 0) {
440            runnerup = 0;
441        }
442        if ((highest - runnerup) > 15) {
443            *goodmatch = true;
444        }
445    }
446    *highestmatch = highest;
447    return matches[highestidx];
448}
449
450
451bool CharacterEncodingDetector::isFrequent(const uint16_t *values, uint32_t c) {
452
453    int start = 0;
454    int end = 511; // All the tables have 512 entries
455    int mid = (start+end)/2;
456
457    while(start <= end) {
458        if(c == values[mid]) {
459            return true;
460        } else if (c > values[mid]) {
461            start = mid + 1;
462        } else {
463            end = mid - 1;
464        }
465
466        mid = (start + end) / 2;
467    }
468
469    return false;
470}
471
472
473}  // namespace android
474