1 // icu.h
2 
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 // Copyright 2005-2010 Google, Inc.
16 // Author: sorenj@google.com (Jeffrey Sorensen)
17 //         roubert@google.com (Fredrik Roubert)
18 //
19 // This library implements an unrestricted Thompson/Pike UTF-8 parser and
20 // serializer.  UTF-8 is a restricted subset of this byte stream encoding.  See
21 // http://en.wikipedia.org/wiki/UTF-8 for a good description of the encoding
22 // details.
23 
24 #ifndef FST_LIB_ICU_H_
25 #define FST_LIB_ICU_H_
26 
27 #include <iostream>
28 #include <fstream>
29 #include <sstream>
30 
31 namespace fst {
32 
33 template <class Label>
UTF8StringToLabels(const string & str,vector<Label> * labels)34 bool UTF8StringToLabels(const string &str, vector<Label> *labels) {
35   const char *data = str.data();
36   size_t length = str.size();
37   for (int i = 0; i < length; /* no update */) {
38     int c = data[i++] & 0xff;
39     if ((c & 0x80) == 0) {
40       labels->push_back(c);
41     } else {
42       if ((c & 0xc0) == 0x80) {
43         LOG(ERROR) << "UTF8StringToLabels: continuation byte as lead byte";
44         return false;
45       }
46       int count = (c >= 0xc0) + (c >= 0xe0) + (c >= 0xf0) + (c >= 0xf8) +
47                   (c >= 0xfc);
48       int code = c & ((1 << (6 - count)) - 1);
49       while (count != 0) {
50         if (i == length) {
51           LOG(ERROR) << "UTF8StringToLabels: truncated utf-8 byte sequence";
52           return false;
53         }
54         char cb = data[i++];
55         if ((cb & 0xc0) != 0x80) {
56           LOG(ERROR) << "UTF8StringToLabels: missing/invalid continuation byte";
57           return false;
58         }
59         code = (code << 6) | (cb & 0x3f);
60         count--;
61       }
62       if (code < 0) {
63         // This should not be able to happen.
64         LOG(ERROR) << "UTF8StringToLabels: Invalid character found: " << c;
65         return false;
66       }
67       labels->push_back(code);
68     }
69   }
70   return true;
71 }
72 
73 template <class Label>
LabelsToUTF8String(const vector<Label> & labels,string * str)74 bool LabelsToUTF8String(const vector<Label> &labels, string *str) {
75   ostringstream ostr;
76   for (size_t i = 0; i < labels.size(); ++i) {
77     int32_t code = labels[i];
78     if (code < 0) {
79       LOG(ERROR) << "LabelsToUTF8String: Invalid character found: " << code;
80       return false;
81     } else if (code < 0x80) {
82       ostr << static_cast<char>(code);
83     } else if (code < 0x800) {
84       ostr << static_cast<char>((code >> 6) | 0xc0);
85       ostr << static_cast<char>((code & 0x3f) | 0x80);
86     } else if (code < 0x10000) {
87       ostr << static_cast<char>((code >> 12) | 0xe0);
88       ostr << static_cast<char>(((code >> 6) & 0x3f) | 0x80);
89       ostr << static_cast<char>((code & 0x3f) | 0x80);
90     } else if (code < 0x200000) {
91       ostr << static_cast<char>((code >> 18) | 0xf0);
92       ostr << static_cast<char>(((code >> 12) & 0x3f) | 0x80);
93       ostr << static_cast<char>(((code >> 6) & 0x3f) | 0x80);
94       ostr << static_cast<char>((code & 0x3f) | 0x80);
95     } else if (code < 0x4000000) {
96       ostr << static_cast<char>((code >> 24) | 0xf8);
97       ostr << static_cast<char>(((code >> 18) & 0x3f) | 0x80);
98       ostr << static_cast<char>(((code >> 12) & 0x3f) | 0x80);
99       ostr << static_cast<char>(((code >> 6) & 0x3f) | 0x80);
100       ostr << static_cast<char>((code & 0x3f) | 0x80);
101     } else {
102       ostr << static_cast<char>((code >> 30) | 0xfc);
103       ostr << static_cast<char>(((code >> 24) & 0x3f) | 0x80);
104       ostr << static_cast<char>(((code >> 18) & 0x3f) | 0x80);
105       ostr << static_cast<char>(((code >> 12) & 0x3f) | 0x80);
106       ostr << static_cast<char>(((code >> 6) & 0x3f) | 0x80);
107       ostr << static_cast<char>((code & 0x3f) | 0x80);
108     }
109   }
110   *str = ostr.str();
111   return true;
112 }
113 
114 }  // namespace fst
115 
116 #endif  // FST_LIB_ICU_H_
117