1bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi/*
2bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi * Copyright (C) 2017 The Android Open Source Project
3bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi *
4bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi * Licensed under the Apache License, Version 2.0 (the "License");
5bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi * you may not use this file except in compliance with the License.
6bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi * You may obtain a copy of the License at
7bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi *
8bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi *      http://www.apache.org/licenses/LICENSE-2.0
9bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi *
10bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi * Unless required by applicable law or agreed to in writing, software
11bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi * distributed under the License is distributed on an "AS IS" BASIS,
12bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi * See the License for the specific language governing permissions and
14bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi * limitations under the License.
15bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi */
16bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi
17bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi#include "lang_id/relevant-script-feature.h"
18bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi
19bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi#include <string>
20bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi
21bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi#include "common/feature-extractor.h"
22bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi#include "common/feature-types.h"
23bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi#include "common/task-context.h"
24bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi#include "common/workspace.h"
25bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi#include "lang_id/script-detector.h"
26bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi#include "util/base/logging.h"
27bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi#include "util/strings/utf8.h"
28bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi
29bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifinamespace libtextclassifier {
30bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifinamespace nlp_core {
31bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifinamespace lang_id {
32bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi
33bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifibool RelevantScriptFeature::Setup(TaskContext *context) { return true; }
34bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi
35bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifibool RelevantScriptFeature::Init(TaskContext *context) {
36bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi  set_feature_type(new NumericFeatureType(name(), kNumRelevantScripts));
37bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi  return true;
38bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi}
39bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi
40bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifivoid RelevantScriptFeature::Evaluate(const WorkspaceSet &workspaces,
41bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi                                     const LightSentence &sentence,
42bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi                                     FeatureVector *result) const {
43bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi  // We expect kNumRelevantScripts to be small, so we stack-allocate the array
44bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi  // of counts.  Still, if that changes, we want to find out.
45bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi  static_assert(
46bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi      kNumRelevantScripts < 25,
47bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi      "switch counts to vector<int>: too big for stack-allocated int[]");
48bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi
49bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi  // counts[s] is the number of characters with script s.
50bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi  // Note: {} "value-initializes" the array to zero.
51bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi  int counts[kNumRelevantScripts]{};
52bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi  int total_count = 0;
53bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi  for (int i = 0; i < sentence.num_words(); ++i) {
54bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi    const std::string &word = sentence.word(i);
55bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi    const char *const word_end = word.data() + word.size();
56bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi    const char *curr = word.data();
57bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi
58bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi    // Skip over token start '^'.
59bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi    TC_DCHECK_EQ(*curr, '^');
60bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi    curr += GetNumBytesForNonZeroUTF8Char(curr);
61bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi    while (true) {
62bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi      const int num_bytes = GetNumBytesForNonZeroUTF8Char(curr);
63bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi      Script script = GetScript(curr, num_bytes);
64bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi
65bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi      // We do this update and the if (...) break below *before* incrementing
66bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi      // counts[script] in order to skip the token end '$'.
67bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi      curr += num_bytes;
68bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi      if (curr >= word_end) {
69bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi        TC_DCHECK_EQ(*(curr - num_bytes), '$');
70bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi        break;
71bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi      }
72bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi      TC_DCHECK_GE(script, 0);
73bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi      TC_DCHECK_LT(script, kNumRelevantScripts);
74bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi      counts[script]++;
75bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi      total_count++;
76bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi    }
77bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi  }
78bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi
79bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi  for (int script_id = 0; script_id < kNumRelevantScripts; ++script_id) {
80bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi    int count = counts[script_id];
81bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi    if (count > 0) {
82bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi      const float weight = static_cast<float>(count) / total_count;
83bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi      FloatFeatureValue value(script_id, weight);
84bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi      result->add(feature_type(), value.discrete_value);
85bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi    }
86bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi  }
87bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi}
88bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi
89bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi}  // namespace lang_id
90bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi}  // namespace nlp_core
91bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi}  // namespace libtextclassifier
92