1/* 2 * Copyright (C) 2017 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17#include "smartselect/tokenizer.h" 18 19#include "util/strings/utf8.h" 20#include "util/utf8/unicodetext.h" 21 22namespace libtextclassifier { 23 24void Tokenizer::PrepareTokenizationCodepointRanges( 25 const std::vector<TokenizationCodepointRange>& codepoint_range_configs) { 26 codepoint_ranges_.clear(); 27 codepoint_ranges_.reserve(codepoint_range_configs.size()); 28 for (const TokenizationCodepointRange& range : codepoint_range_configs) { 29 codepoint_ranges_.push_back( 30 CodepointRange(range.start(), range.end(), range.role())); 31 } 32 33 std::sort(codepoint_ranges_.begin(), codepoint_ranges_.end(), 34 [](const CodepointRange& a, const CodepointRange& b) { 35 return a.start < b.start; 36 }); 37} 38 39TokenizationCodepointRange::Role Tokenizer::FindTokenizationRole( 40 int codepoint) const { 41 auto it = std::lower_bound(codepoint_ranges_.begin(), codepoint_ranges_.end(), 42 codepoint, 43 [](const CodepointRange& range, int codepoint) { 44 // This function compares range with the 45 // codepoint for the purpose of finding the first 46 // greater or equal range. Because of the use of 47 // std::lower_bound it needs to return true when 48 // range < codepoint; the first time it will 49 // return false the lower bound is found and 50 // returned. 51 // 52 // It might seem weird that the condition is 53 // range.end <= codepoint here but when codepoint 54 // == range.end it means it's actually just 55 // outside of the range, thus the range is less 56 // than the codepoint. 57 return range.end <= codepoint; 58 }); 59 if (it != codepoint_ranges_.end() && it->start <= codepoint && 60 it->end > codepoint) { 61 return it->role; 62 } else { 63 return TokenizationCodepointRange::DEFAULT_ROLE; 64 } 65} 66 67std::vector<Token> Tokenizer::Tokenize(const std::string& utf8_text) const { 68 UnicodeText context_unicode = UTF8ToUnicodeText(utf8_text, /*do_copy=*/false); 69 70 std::vector<Token> result; 71 Token new_token("", 0, 0); 72 int codepoint_index = 0; 73 for (auto it = context_unicode.begin(); it != context_unicode.end(); 74 ++it, ++codepoint_index) { 75 TokenizationCodepointRange::Role role = FindTokenizationRole(*it); 76 if (role & TokenizationCodepointRange::SPLIT_BEFORE) { 77 if (!new_token.value.empty()) { 78 result.push_back(new_token); 79 } 80 new_token = Token("", codepoint_index, codepoint_index); 81 } 82 if (!(role & TokenizationCodepointRange::DISCARD_CODEPOINT)) { 83 new_token.value += std::string( 84 it.utf8_data(), 85 it.utf8_data() + GetNumBytesForNonZeroUTF8Char(it.utf8_data())); 86 ++new_token.end; 87 } 88 if (role & TokenizationCodepointRange::SPLIT_AFTER) { 89 if (!new_token.value.empty()) { 90 result.push_back(new_token); 91 } 92 new_token = Token("", codepoint_index + 1, codepoint_index + 1); 93 } 94 } 95 if (!new_token.value.empty()) { 96 result.push_back(new_token); 97 } 98 99 return result; 100} 101 102} // namespace libtextclassifier 103