icu.h revision dfd8b8327b93660601d016cdc6f29f433b45a8d8
1// icu.h
2
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14//
15// Copyright 2005-2010 Google, Inc.
16// Author: sorenj@google.com (Jeffrey Sorensen)
17//         roubert@google.com (Fredrik Roubert)
18//
19// This library implements an unrestricted Thompson/Pike UTF-8 parser and
20// serializer.  UTF-8 is a restricted subset of this byte stream encoding.  See
21// http://en.wikipedia.org/wiki/UTF-8 for a good description of the encoding
22// details.
23
24#ifndef FST_LIB_ICU_H_
25#define FST_LIB_ICU_H_
26
27#include <iostream>
28#include <fstream>
29#include <sstream>
30
31namespace fst {
32
// Decodes the UTF-8 byte sequence in `str` and appends one label per decoded
// character to `*labels`.
//
// The decoder is deliberately unrestricted: it accepts Thompson/Pike style
// sequences of up to six bytes (up to 31 payload bits) and does not reject
// overlong encodings or any particular code point value.
//
// Returns true when all of `str` was decoded.  Returns false, after logging
// an error, when it meets a continuation byte in lead position, a lead byte
// of 0xfe or 0xff, a truncated sequence, or a byte that is not a continuation
// byte where one is required.  `*labels` is not cleared on entry, and labels
// decoded before a failure stay appended.
template <class Label>
bool UTF8StringToLabels(const string &str, vector<Label> *labels) {
  const char *data = str.data();
  const size_t length = str.size();
  // The index has the same unsigned type as the length, so the comparison is
  // not mixed-sign and cannot overflow on very long inputs.
  size_t i = 0;
  while (i < length) {
    // Mask to 0..255 so the value does not depend on the signedness of char.
    const int c = data[i++] & 0xff;
    if ((c & 0x80) == 0) {
      // Single-byte (7-bit) character.
      labels->push_back(c);
      continue;
    }
    if ((c & 0xc0) == 0x80) {
      LOG(ERROR) << "UTF8StringToLabels: continuation byte as lead byte";
      return false;
    }
    if (c >= 0xfe) {
      // 0xfe and 0xff do not introduce a sequence of any length.  Without
      // this check they would be treated as six-byte leads with payload bit
      // (c & 1) and alias the sequences that start with 0xfc and 0xfd.
      LOG(ERROR) << "UTF8StringToLabels: invalid lead byte";
      return false;
    }
    // Number of continuation bytes announced by the lead byte (1 to 5).
    int count = (c >= 0xc0) + (c >= 0xe0) + (c >= 0xf0) + (c >= 0xf8) +
                (c >= 0xfc);
    // The lead byte contributes its low (6 - count) bits.
    int code = c & ((1 << (6 - count)) - 1);
    for (; count != 0; --count) {
      if (i == length) {
        LOG(ERROR) << "UTF8StringToLabels: truncated utf-8 byte sequence";
        return false;
      }
      const int cb = data[i++] & 0xff;
      if ((cb & 0xc0) != 0x80) {
        LOG(ERROR) << "UTF8StringToLabels: missing/invalid continuation byte";
        return false;
      }
      // Each continuation byte contributes six payload bits.
      code = (code << 6) | (cb & 0x3f);
    }
    // At most 1 + 5 * 6 = 31 payload bits were accumulated, so with a 32-bit
    // int the value cannot be negative and needs no further check.
    labels->push_back(code);
  }
  return true;
}
72
// Encodes each element of `labels` as one unrestricted (Thompson/Pike style)
// UTF-8 sequence of one to six bytes and stores the concatenation in `*str`.
//
// Returns true on success.  Returns false, after logging an error, when a
// label is negative or cannot be represented as a non-negative 32-bit value;
// in that case `*str` is left unmodified.
template <class Label>
bool LabelsToUTF8String(const vector<Label> &labels, string *str) {
  string result;
  // Every label yields at least one byte.
  result.reserve(labels.size());
  for (size_t i = 0; i < labels.size(); ++i) {
    const int32_t code = static_cast<int32_t>(labels[i]);
    // Labels wider than 32 bits must not be silently truncated into some
    // unrelated character, so require the narrowed value to round-trip.
    if (static_cast<Label>(code) != labels[i]) {
      LOG(ERROR) << "LabelsToUTF8String: Label does not fit in 32 bits";
      return false;
    }
    if (code < 0) {
      LOG(ERROR) << "LabelsToUTF8String: Invalid character found: " << code;
      return false;
    }
    // Choose the sequence length: `extra` is the number of continuation
    // bytes and `lead` holds the marker bits of the first byte.
    int extra;
    int lead;
    if (code < 0x80) {
      extra = 0;
      lead = 0x00;
    } else if (code < 0x800) {
      extra = 1;
      lead = 0xc0;
    } else if (code < 0x10000) {
      extra = 2;
      lead = 0xe0;
    } else if (code < 0x200000) {
      extra = 3;
      lead = 0xf0;
    } else if (code < 0x4000000) {
      extra = 4;
      lead = 0xf8;
    } else {
      extra = 5;
      lead = 0xfc;
    }
    // The first byte carries the marker plus the most significant bits.
    result += static_cast<char>((code >> (6 * extra)) | lead);
    // The remaining bits follow six at a time, most significant group first.
    for (int shift = 6 * (extra - 1); shift >= 0; shift -= 6) {
      result += static_cast<char>(((code >> shift) & 0x3f) | 0x80);
    }
  }
  str->swap(result);
  return true;
}
113
114}  // namespace fst
115
116#endif  // FST_LIB_ICU_H_
117