1/*
2 * Copyright (C) 2017 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#ifndef LIBTEXTCLASSIFIER_SMARTSELECT_TOKENIZER_H_
18#define LIBTEXTCLASSIFIER_SMARTSELECT_TOKENIZER_H_
19
20#include <string>
21#include <vector>
22
23#include "smartselect/tokenizer.pb.h"
24#include "smartselect/types.h"
25#include "util/base/integral_types.h"
26
27namespace libtextclassifier {
28
29// Tokenizer splits the input string into a sequence of tokens, according to the
30// configuration.
31class Tokenizer {
32 public:
33  explicit Tokenizer(
34      const std::vector<TokenizationCodepointRange>& codepoint_range_configs) {
35    PrepareTokenizationCodepointRanges(codepoint_range_configs);
36  }
37
38  // Tokenizes the input string using the selected tokenization method.
39  std::vector<Token> Tokenize(const std::string& utf8_text) const;
40
41 protected:
42  // Represents a codepoint range [start, end) with its role for tokenization.
43  struct CodepointRange {
44    int32 start;
45    int32 end;
46    TokenizationCodepointRange::Role role;
47
48    CodepointRange(int32 arg_start, int32 arg_end,
49                   TokenizationCodepointRange::Role arg_role)
50        : start(arg_start), end(arg_end), role(arg_role) {}
51  };
52
53  // Prepares tokenization codepoint ranges for use in tokenization.
54  void PrepareTokenizationCodepointRanges(
55      const std::vector<TokenizationCodepointRange>& codepoint_range_configs);
56
57  // Finds the tokenization role for given codepoint.
58  // If the character is not found returns DEFAULT_ROLE.
59  // Internally uses binary search so should be O(log(# of codepoint_ranges)).
60  TokenizationCodepointRange::Role FindTokenizationRole(int codepoint) const;
61
62 private:
63  // Codepoint ranges that determine how different codepoints are tokenized.
64  // The ranges must not overlap.
65  std::vector<CodepointRange> codepoint_ranges_;
66};
67
68}  // namespace libtextclassifier
69
70#endif  // LIBTEXTCLASSIFIER_SMARTSELECT_TOKENIZER_H_
71