1/* 2 * Copyright (C) 2017 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17#include "lang_id/relevant-script-feature.h" 18 19#include <string> 20 21#include "common/feature-extractor.h" 22#include "common/feature-types.h" 23#include "common/task-context.h" 24#include "common/workspace.h" 25#include "lang_id/script-detector.h" 26#include "util/base/logging.h" 27#include "util/strings/utf8.h" 28 29namespace libtextclassifier { 30namespace nlp_core { 31namespace lang_id { 32 33bool RelevantScriptFeature::Setup(TaskContext *context) { return true; } 34 35bool RelevantScriptFeature::Init(TaskContext *context) { 36 set_feature_type(new NumericFeatureType(name(), kNumRelevantScripts)); 37 return true; 38} 39 40void RelevantScriptFeature::Evaluate(const WorkspaceSet &workspaces, 41 const LightSentence &sentence, 42 FeatureVector *result) const { 43 // We expect kNumRelevantScripts to be small, so we stack-allocate the array 44 // of counts. Still, if that changes, we want to find out. 45 static_assert( 46 kNumRelevantScripts < 25, 47 "switch counts to vector<int>: too big for stack-allocated int[]"); 48 49 // counts[s] is the number of characters with script s. 50 // Note: {} "value-initializes" the array to zero. 51 int counts[kNumRelevantScripts]{}; 52 int total_count = 0; 53 for (int i = 0; i < sentence.num_words(); ++i) { 54 const std::string &word = sentence.word(i); 55 const char *const word_end = word.data() + word.size(); 56 const char *curr = word.data(); 57 58 // Skip over token start '^'. 59 TC_DCHECK_EQ(*curr, '^'); 60 curr += GetNumBytesForNonZeroUTF8Char(curr); 61 while (true) { 62 const int num_bytes = GetNumBytesForNonZeroUTF8Char(curr); 63 Script script = GetScript(curr, num_bytes); 64 65 // We do this update and the if (...) break below *before* incrementing 66 // counts[script] in order to skip the token end '$'. 67 curr += num_bytes; 68 if (curr >= word_end) { 69 TC_DCHECK_EQ(*(curr - num_bytes), '$'); 70 break; 71 } 72 TC_DCHECK_GE(script, 0); 73 TC_DCHECK_LT(script, kNumRelevantScripts); 74 counts[script]++; 75 total_count++; 76 } 77 } 78 79 for (int script_id = 0; script_id < kNumRelevantScripts; ++script_id) { 80 int count = counts[script_id]; 81 if (count > 0) { 82 const float weight = static_cast<float>(count) / total_count; 83 FloatFeatureValue value(script_id, weight); 84 result->add(feature_type(), value.discrete_value); 85 } 86 } 87} 88 89} // namespace lang_id 90} // namespace nlp_core 91} // namespace libtextclassifier 92