1/*
2 * Copyright (C) 2017 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include "lang_id/relevant-script-feature.h"
18
19#include <string>
20
21#include "common/feature-extractor.h"
22#include "common/feature-types.h"
23#include "common/task-context.h"
24#include "common/workspace.h"
25#include "lang_id/script-detector.h"
26#include "util/base/logging.h"
27#include "util/strings/utf8.h"
28
29namespace libtextclassifier {
30namespace nlp_core {
31namespace lang_id {
32
33bool RelevantScriptFeature::Setup(TaskContext *context) { return true; }
34
35bool RelevantScriptFeature::Init(TaskContext *context) {
36  set_feature_type(new NumericFeatureType(name(), kNumRelevantScripts));
37  return true;
38}
39
40void RelevantScriptFeature::Evaluate(const WorkspaceSet &workspaces,
41                                     const LightSentence &sentence,
42                                     FeatureVector *result) const {
43  // We expect kNumRelevantScripts to be small, so we stack-allocate the array
44  // of counts.  Still, if that changes, we want to find out.
45  static_assert(
46      kNumRelevantScripts < 25,
47      "switch counts to vector<int>: too big for stack-allocated int[]");
48
49  // counts[s] is the number of characters with script s.
50  // Note: {} "value-initializes" the array to zero.
51  int counts[kNumRelevantScripts]{};
52  int total_count = 0;
53  for (int i = 0; i < sentence.num_words(); ++i) {
54    const std::string &word = sentence.word(i);
55    const char *const word_end = word.data() + word.size();
56    const char *curr = word.data();
57
58    // Skip over token start '^'.
59    TC_DCHECK_EQ(*curr, '^');
60    curr += GetNumBytesForNonZeroUTF8Char(curr);
61    while (true) {
62      const int num_bytes = GetNumBytesForNonZeroUTF8Char(curr);
63      Script script = GetScript(curr, num_bytes);
64
65      // We do this update and the if (...) break below *before* incrementing
66      // counts[script] in order to skip the token end '$'.
67      curr += num_bytes;
68      if (curr >= word_end) {
69        TC_DCHECK_EQ(*(curr - num_bytes), '$');
70        break;
71      }
72      TC_DCHECK_GE(script, 0);
73      TC_DCHECK_LT(script, kNumRelevantScripts);
74      counts[script]++;
75      total_count++;
76    }
77  }
78
79  for (int script_id = 0; script_id < kNumRelevantScripts; ++script_id) {
80    int count = counts[script_id];
81    if (count > 0) {
82      const float weight = static_cast<float>(count) / total_count;
83      FloatFeatureValue value(script_id, weight);
84      result->add(feature_type(), value.discrete_value);
85    }
86  }
87}
88
89}  // namespace lang_id
90}  // namespace nlp_core
91}  // namespace libtextclassifier
92