char_utils.h revision 79ba633402ceeebe216055cbd99a9e9701460f4a
1/* 2 * Copyright (C) 2010 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17#ifndef LATINIME_CHAR_UTILS_H 18#define LATINIME_CHAR_UTILS_H 19 20#include <cctype> 21#include <vector> 22 23#include "defines.h" 24 25namespace latinime { 26 27class CharUtils { 28 public: 29 static AK_FORCE_INLINE bool isAsciiUpper(int c) { 30 // Note: isupper(...) reports false positives for some Cyrillic characters, causing them to 31 // be incorrectly lower-cased using toAsciiLower(...) rather than latin_tolower(...). 32 return (c >= 'A' && c <= 'Z'); 33 } 34 35 static AK_FORCE_INLINE int toAsciiLower(int c) { 36 return c - 'A' + 'a'; 37 } 38 39 static AK_FORCE_INLINE bool isAscii(int c) { 40 return isascii(c) != 0; 41 } 42 43 static AK_FORCE_INLINE int toLowerCase(const int c) { 44 if (isAsciiUpper(c)) { 45 return toAsciiLower(c); 46 } 47 if (isAscii(c)) { 48 return c; 49 } 50 return static_cast<int>(latin_tolower(static_cast<unsigned short>(c))); 51 } 52 53 static AK_FORCE_INLINE int toBaseLowerCase(const int c) { 54 return toLowerCase(toBaseCodePoint(c)); 55 } 56 57 static AK_FORCE_INLINE bool isIntentionalOmissionCodePoint(const int codePoint) { 58 // TODO: Do not hardcode here 59 return codePoint == KEYCODE_SINGLE_QUOTE || codePoint == KEYCODE_HYPHEN_MINUS; 60 } 61 62 static AK_FORCE_INLINE int getCodePointCount(const int arraySize, const int *const codePoints) { 63 int size = 0; 64 for (; size < arraySize; ++size) { 65 if (codePoints[size] == '\0') { 66 break; 67 } 68 } 69 return size; 70 } 71 72 static AK_FORCE_INLINE int toBaseCodePoint(int c) { 73 if (c < BASE_CHARS_SIZE) { 74 return static_cast<int>(BASE_CHARS[c]); 75 } 76 return c; 77 } 78 79 static AK_FORCE_INLINE int getSpaceCount(const int *const codePointBuffer, const int length) { 80 int spaceCount = 0; 81 for (int i = 0; i < length; ++i) { 82 if (codePointBuffer[i] == KEYCODE_SPACE) { 83 ++spaceCount; 84 } 85 } 86 return spaceCount; 87 } 88 89 static AK_FORCE_INLINE int isInUnicodeSpace(const int codePoint) { 90 return codePoint >= MIN_UNICODE_CODE_POINT && codePoint <= MAX_UNICODE_CODE_POINT; 91 } 92 93 static unsigned short latin_tolower(const unsigned short c); 94 static const std::vector<int> EMPTY_STRING; 95 96 private: 97 DISALLOW_IMPLICIT_CONSTRUCTORS(CharUtils); 98 99 static const int MIN_UNICODE_CODE_POINT; 100 static const int MAX_UNICODE_CODE_POINT; 101 102 /** 103 * Table mapping most combined Latin, Greek, and Cyrillic characters 104 * to their base characters. If c is in range, BASE_CHARS[c] == c 105 * if c is not a combined character, or the base character if it 106 * is combined. 107 */ 108 static const int BASE_CHARS_SIZE = 0x0500; 109 static const unsigned short BASE_CHARS[BASE_CHARS_SIZE]; 110}; 111} // namespace latinime 112#endif // LATINIME_CHAR_UTILS_H 113