1bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi/* 2bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi * Copyright (C) 2017 The Android Open Source Project 3bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi * 4bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi * Licensed under the Apache License, Version 2.0 (the "License"); 5bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi * you may not use this file except in compliance with the License. 6bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi * You may obtain a copy of the License at 7bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi * 8bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi * http://www.apache.org/licenses/LICENSE-2.0 9bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi * 10bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi * Unless required by applicable law or agreed to in writing, software 11bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi * distributed under the License is distributed on an "AS IS" BASIS, 12bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi * See the License for the specific language governing permissions and 14bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi * limitations under the License. 15bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi */ 16bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi 17bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi#include "lang_id/relevant-script-feature.h" 18bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi 19bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi#include <string> 20bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi 21bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi#include "common/feature-extractor.h" 22bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi#include "common/feature-types.h" 23bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi#include "common/task-context.h" 24bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi#include "common/workspace.h" 25bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi#include "lang_id/script-detector.h" 26bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi#include "util/base/logging.h" 27bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi#include "util/strings/utf8.h" 28bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi 29bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifinamespace libtextclassifier { 30bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifinamespace nlp_core { 31bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifinamespace lang_id { 32bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi 33bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifibool RelevantScriptFeature::Setup(TaskContext *context) { return true; } 34bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi 35bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifibool RelevantScriptFeature::Init(TaskContext *context) { 36bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi set_feature_type(new NumericFeatureType(name(), kNumRelevantScripts)); 37bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi return true; 38bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi} 39bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi 40bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifivoid RelevantScriptFeature::Evaluate(const WorkspaceSet &workspaces, 41bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi const LightSentence &sentence, 42bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi FeatureVector *result) const { 43bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi // We expect kNumRelevantScripts to be small, so we stack-allocate the array 44bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi // of counts. Still, if that changes, we want to find out. 45bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi static_assert( 46bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi kNumRelevantScripts < 25, 47bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi "switch counts to vector<int>: too big for stack-allocated int[]"); 48bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi 49bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi // counts[s] is the number of characters with script s. 50bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi // Note: {} "value-initializes" the array to zero. 51bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi int counts[kNumRelevantScripts]{}; 52bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi int total_count = 0; 53bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi for (int i = 0; i < sentence.num_words(); ++i) { 54bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi const std::string &word = sentence.word(i); 55bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi const char *const word_end = word.data() + word.size(); 56bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi const char *curr = word.data(); 57bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi 58bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi // Skip over token start '^'. 59bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi TC_DCHECK_EQ(*curr, '^'); 60bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi curr += GetNumBytesForNonZeroUTF8Char(curr); 61bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi while (true) { 62bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi const int num_bytes = GetNumBytesForNonZeroUTF8Char(curr); 63bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi Script script = GetScript(curr, num_bytes); 64bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi 65bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi // We do this update and the if (...) break below *before* incrementing 66bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi // counts[script] in order to skip the token end '$'. 67bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi curr += num_bytes; 68bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi if (curr >= word_end) { 69bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi TC_DCHECK_EQ(*(curr - num_bytes), '$'); 70bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi break; 71bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi } 72bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi TC_DCHECK_GE(script, 0); 73bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi TC_DCHECK_LT(script, kNumRelevantScripts); 74bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi counts[script]++; 75bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi total_count++; 76bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi } 77bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi } 78bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi 79bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi for (int script_id = 0; script_id < kNumRelevantScripts; ++script_id) { 80bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi int count = counts[script_id]; 81bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi if (count > 0) { 82bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi const float weight = static_cast<float>(count) / total_count; 83bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi FloatFeatureValue value(script_id, weight); 84bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi result->add(feature_type(), value.discrete_value); 85bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi } 86bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi } 87bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi} 88bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi 89bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi} // namespace lang_id 90bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi} // namespace nlp_core 91bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi} // namespace libtextclassifier 92