1f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi/* 2f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi * Copyright (C) 2014 The Android Open Source Project 3f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi * 4f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi * Licensed under the Apache License, Version 2.0 (the "License"); 5f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi * you may not use this file except in compliance with the License. 6f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi * You may obtain a copy of the License at 7f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi * 8f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi * http://www.apache.org/licenses/LICENSE-2.0 9f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi * 10f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi * Unless required by applicable law or agreed to in writing, software 11f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi * distributed under the License is distributed on an "AS IS" BASIS, 12f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi * See the License for the specific language governing permissions and 14f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi * limitations under the License. 15f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi */ 16f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi 17f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi#ifndef LATINIME_DICT_TOOLKIT_UTF8_UTILS_H 18f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi#define LATINIME_DICT_TOOLKIT_UTF8_UTILS_H 19f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi 20f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi#include <cstdint> 21f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi#include <string> 22f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi#include <vector> 23f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi 24f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi#include "dict_toolkit_defines.h" 25f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi#include "utils/int_array_view.h" 26f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi 27f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanaginamespace latinime { 28f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanaginamespace dicttoolkit { 29f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi 30f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagiclass Utf8Utils { 31f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagipublic: 32f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi static std::vector<int> getCodePoints(const std::string &utf8Str); 33f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi static std::string getUtf8String(const CodePointArrayView codePoints); 34f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi 35f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagiprivate: 36f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8Utils); 37f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi 38f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi // Values indexed by sequence size. 39f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi static const size_t MAX_SEQUENCE_SIZE_FOR_A_CODE_POINT; 40f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi static const uint8_t FIRST_BYTE_MARKER_MASKS[]; 41f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi static const uint8_t FIRST_BYTE_MARKERS[]; 42f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi static const uint8_t FIRST_BYTE_CODE_POINT_BITS_MASKS[]; 43f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi static const int MAX_ENCODED_CODE_POINT_VALUES[]; 44f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi 45f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi static const uint8_t TRAILING_BYTE_CODE_POINT_BITS_MASK; 46f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi static const uint8_t TRAILING_BYTE_MARKER; 47f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi static const size_t CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE; 48f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi 49f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi static int getSequenceSizeByCheckingFirstByte(const uint8_t firstByte); 50f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi static int maskFirstByte(const uint8_t firstByte, const int encodeSize); 51f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi static int maskTrailingByte(const uint8_t secondOrLaterByte); 52f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi static int getSequenceSizeToEncodeCodePoint(const int codePoint); 53f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi}; 54f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi} // namespace dicttoolkit 55f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi} // namespace latinime 56f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi#endif // LATINIME_DICT_TOOLKIT_UTF8_UTILS_H 57