1/**
2 * Copyright 2010 Google Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17// Routines to do manipulation of Unicode characters or text
18//
19// The StructurallyValid routines accept buffers of arbitrary bytes.
20// For CoerceToStructurallyValid(), the input buffer and output buffers may
21// point to exactly the same memory.
22//
23// In all other cases, the UTF-8 string must be structurally valid and
24// have all codepoints in the range  U+0000 to U+D7FF or U+E000 to U+10FFFF.
25// Debug builds take a fatal error for invalid UTF-8 input.
26// The input and output buffers may not overlap at all.
27//
28// The char32 routines are here only for convenience; they convert to UTF-8
29// internally and use the UTF-8 routines.
30
31#ifndef UTIL_UTF8_UNILIB_H__
32#define UTIL_UTF8_UNILIB_H__
33
34#include <string>
35#include "phonenumbers/base/basictypes.h"
36
37namespace i18n {
38namespace phonenumbers {
39namespace UniLib {
40
41// Returns true unless a surrogate code point
42inline bool IsValidCodepoint(char32 c) {
43  // In the range [0, 0xD800) or [0xE000, 0x10FFFF]
44  return (static_cast<uint32>(c) < 0xD800)
45    || (c >= 0xE000 && c <= 0x10FFFF);
46}
47
// Maps the first byte of a UTF-8 character to that character's length in
// bytes.  Indexed by the byte value 0..255; every entry is in 1..4.
//   0x00..0xBF -> 1
//   0xC0..0xDF -> 2
//   0xE0..0xEF -> 3
//   0xF0..0xFF -> 4
static const unsigned char kUTF8LenTbl[256] = {
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0x00..0x0F
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0x10..0x1F
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0x20..0x2F
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0x30..0x3F
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0x40..0x4F
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0x50..0x5F
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0x60..0x6F
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0x70..0x7F

  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0x80..0x8F
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0x90..0x9F
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0xA0..0xAF
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0xB0..0xBF

  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,  // 0xC0..0xCF
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,  // 0xD0..0xDF
  3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3,  // 0xE0..0xEF
  4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4   // 0xF0..0xFF
};
60
61// Return length of a single UTF-8 source character
62inline int OneCharLen(const char* src) {
63  return kUTF8LenTbl[*reinterpret_cast<const uint8*>(src)];
64}
65
66// Return length of a single UTF-8 source character
67inline int OneCharLen(const uint8* src) {
68  return kUTF8LenTbl[*src];
69}
70
// Tells whether x is a UTF-8 continuation byte, i.e. has the bit pattern
// 10xx xxxx (0x80..0xBF).
inline bool IsTrailByte(char x) {
  // Equivalent to (x & 0xC0) == 0x80.  Viewed as a signed char, the bytes
  // 0x80..0xBF are exactly the values -128..-65, so one comparison against
  // -0x40 (-64) is enough.
  const signed char as_signed = static_cast<signed char>(x);
  return as_signed < -0x40;
}
77
// Measures the leading portion of src that is entirely interchange valid
// UTF-8 and returns its length in bytes.
int SpanInterchangeValid(const char* src, int byte_length);
// std::string convenience overload: forwards the string's data pointer and
// its size (converted to int) to the overload above.
inline int SpanInterchangeValid(const std::string& src) {
  const char* const bytes = src.data();
  const int byte_count = static_cast<int>(src.size());
  return SpanInterchangeValid(bytes, byte_count);
}
84
85// Returns true if the source is all interchange valid UTF-8
86// "Interchange valid" is a stronger than structurally valid --
87// no C0 or C1 control codes (other than CR LF HT FF) and no non-characters.
88inline bool IsInterchangeValid(const char* src, int byte_length) {
89  return (byte_length == SpanInterchangeValid(src, byte_length));
90}
91inline bool IsInterchangeValid(const std::string& src) {
92  return IsInterchangeValid(src.data(), src.size());
93}
94
95}  // namespace UniLib
96}  // namespace phonenumbers
97}  // namespace i18n
98
#endif  // UTIL_UTF8_UNILIB_H__
100