1bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi/*
2bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi * Copyright (C) 2017 The Android Open Source Project
3bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi *
4bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi * Licensed under the Apache License, Version 2.0 (the "License");
5bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi * you may not use this file except in compliance with the License.
6bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi * You may obtain a copy of the License at
7bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi *
8bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi *      http://www.apache.org/licenses/LICENSE-2.0
9bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi *
10bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi * Unless required by applicable law or agreed to in writing, software
11bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi * distributed under the License is distributed on an "AS IS" BASIS,
12bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi * See the License for the specific language governing permissions and
14bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi * limitations under the License.
15bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi */
16bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi
17bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi#include "smartselect/tokenizer.h"
18bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi
19bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi#include "util/strings/utf8.h"
20bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi#include "util/utf8/unicodetext.h"
21bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi
22bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifinamespace libtextclassifier {
23bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi
24bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifivoid Tokenizer::PrepareTokenizationCodepointRanges(
2526e8c2edb55f8c4efc47bc586edd5f9506e281a7Lukas Zilka    const std::vector<TokenizationCodepointRange>& codepoint_range_configs) {
26bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi  codepoint_ranges_.clear();
27bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi  codepoint_ranges_.reserve(codepoint_range_configs.size());
28bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi  for (const TokenizationCodepointRange& range : codepoint_range_configs) {
29bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi    codepoint_ranges_.push_back(
30bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi        CodepointRange(range.start(), range.end(), range.role()));
31bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi  }
32bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi
33bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi  std::sort(codepoint_ranges_.begin(), codepoint_ranges_.end(),
34bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi            [](const CodepointRange& a, const CodepointRange& b) {
35bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi              return a.start < b.start;
36bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi            });
37bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi}
38bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi
39bda09f1da39ce38a5ece4757b82a64776e53214cMatt SharifiTokenizationCodepointRange::Role Tokenizer::FindTokenizationRole(
40bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi    int codepoint) const {
41bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi  auto it = std::lower_bound(codepoint_ranges_.begin(), codepoint_ranges_.end(),
42bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi                             codepoint,
43bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi                             [](const CodepointRange& range, int codepoint) {
44bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi                               // This function compares range with the
45bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi                               // codepoint for the purpose of finding the first
46bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi                               // greater or equal range. Because of the use of
47bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi                               // std::lower_bound it needs to return true when
48bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi                               // range < codepoint; the first time it will
49bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi                               // return false the lower bound is found and
50bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi                               // returned.
51bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi                               //
52bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi                               // It might seem weird that the condition is
5391f068673f5915dd4c12fac026e4a043e4a063a0Matt Sharifi                               // range.end <= codepoint here but when codepoint
5491f068673f5915dd4c12fac026e4a043e4a063a0Matt Sharifi                               // == range.end it means it's actually just
5591f068673f5915dd4c12fac026e4a043e4a063a0Matt Sharifi                               // outside of the range, thus the range is less
5691f068673f5915dd4c12fac026e4a043e4a063a0Matt Sharifi                               // than the codepoint.
57bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi                               return range.end <= codepoint;
58bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi                             });
59bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi  if (it != codepoint_ranges_.end() && it->start <= codepoint &&
60bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi      it->end > codepoint) {
61bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi    return it->role;
62bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi  } else {
63bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi    return TokenizationCodepointRange::DEFAULT_ROLE;
64bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi  }
65bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi}
66bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi
67bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifistd::vector<Token> Tokenizer::Tokenize(const std::string& utf8_text) const {
68bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi  UnicodeText context_unicode = UTF8ToUnicodeText(utf8_text, /*do_copy=*/false);
69bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi
70bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi  std::vector<Token> result;
71bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi  Token new_token("", 0, 0);
72bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi  int codepoint_index = 0;
73bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi  for (auto it = context_unicode.begin(); it != context_unicode.end();
74bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi       ++it, ++codepoint_index) {
75bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi    TokenizationCodepointRange::Role role = FindTokenizationRole(*it);
76bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi    if (role & TokenizationCodepointRange::SPLIT_BEFORE) {
77bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi      if (!new_token.value.empty()) {
78bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi        result.push_back(new_token);
79bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi      }
80bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi      new_token = Token("", codepoint_index, codepoint_index);
81bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi    }
82bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi    if (!(role & TokenizationCodepointRange::DISCARD_CODEPOINT)) {
83bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi      new_token.value += std::string(
84bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi          it.utf8_data(),
85bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi          it.utf8_data() + GetNumBytesForNonZeroUTF8Char(it.utf8_data()));
86bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi      ++new_token.end;
87bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi    }
88bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi    if (role & TokenizationCodepointRange::SPLIT_AFTER) {
89bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi      if (!new_token.value.empty()) {
90bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi        result.push_back(new_token);
91bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi      }
92bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi      new_token = Token("", codepoint_index + 1, codepoint_index + 1);
93bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi    }
94bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi  }
95bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi  if (!new_token.value.empty()) {
96bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi    result.push_back(new_token);
97bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi  }
98bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi
99bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi  return result;
100bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi}
101bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi
102bda09f1da39ce38a5ece4757b82a64776e53214cMatt Sharifi}  // namespace libtextclassifier
103