1 // util.h
2
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 // Copyright 2005-2010 Google, Inc.
16 // Author: riley@google.com (Michael Riley)
17 //
18 // \file
19 // FST utility inline definitions.
20
21 #ifndef FST_LIB_UTIL_H__
22 #define FST_LIB_UTIL_H__
23
24 #include <tr1/unordered_map>
25 using std::tr1::unordered_map;
26 using std::tr1::unordered_multimap;
27 #include <tr1/unordered_set>
28 using std::tr1::unordered_set;
29 using std::tr1::unordered_multiset;
30 #include <list>
31 #include <map>
32 #include <set>
33 #include <sstream>
34 #include <string>
35 #include <vector>
36 using std::vector;
37
38
39 #include <fst/compat.h>
40 #include <fst/types.h>
41
42 #include <iostream>
43 #include <fstream>
44 #include <sstream>
45
46 //
47 // UTILITY FOR ERROR HANDLING
48 //
49
50 DECLARE_bool(fst_error_fatal);
51
52 #define FSTERROR() (FLAGS_fst_error_fatal ? LOG(FATAL) : LOG(ERROR))
53
54 namespace fst {
55
56 //
57 // UTILITIES FOR TYPE I/O
58 //
59
60 // Read some types from an input stream.
61
62 // Generic case.
63 template <typename T>
ReadType(istream & strm,T * t)64 inline istream &ReadType(istream &strm, T *t) {
65 return t->Read(strm);
66 }
67
68 // Fixed size, contiguous memory read.
69 #define READ_POD_TYPE(T) \
70 inline istream &ReadType(istream &strm, T *t) { \
71 return strm.read(reinterpret_cast<char *>(t), sizeof(T)); \
72 }
73
74 READ_POD_TYPE(bool);
75 READ_POD_TYPE(char);
76 READ_POD_TYPE(signed char);
77 READ_POD_TYPE(unsigned char);
78 READ_POD_TYPE(short);
79 READ_POD_TYPE(unsigned short);
80 READ_POD_TYPE(int);
81 READ_POD_TYPE(unsigned int);
82 READ_POD_TYPE(long);
83 READ_POD_TYPE(unsigned long);
84 READ_POD_TYPE(long long);
85 READ_POD_TYPE(unsigned long long);
86 READ_POD_TYPE(float);
87 READ_POD_TYPE(double);
88
89 // String case.
ReadType(istream & strm,string * s)90 inline istream &ReadType(istream &strm, string *s) {
91 s->clear();
92 int32 ns = 0;
93 strm.read(reinterpret_cast<char *>(&ns), sizeof(ns));
94 for (int i = 0; i < ns; ++i) {
95 char c;
96 strm.read(&c, 1);
97 *s += c;
98 }
99 return strm;
100 }
101
102 // Pair case.
103 template <typename S, typename T>
ReadType(istream & strm,pair<S,T> * p)104 inline istream &ReadType(istream &strm, pair<S, T> *p) {
105 ReadType(strm, &p->first);
106 ReadType(strm, &p->second);
107 return strm;
108 }
109
110 template <typename S, typename T>
ReadType(istream & strm,pair<const S,T> * p)111 inline istream &ReadType(istream &strm, pair<const S, T> *p) {
112 ReadType(strm, const_cast<S *>(&p->first));
113 ReadType(strm, &p->second);
114 return strm;
115 }
116
117 // General case - no-op.
118 template <typename C>
StlReserve(C * c,int64 n)119 void StlReserve(C *c, int64 n) {}
120
121 // Specialization for vectors.
122 template <typename S, typename T>
StlReserve(vector<S,T> * c,int64 n)123 void StlReserve(vector<S, T> *c, int64 n) {
124 c->reserve(n);
125 }
126
127 // STL sequence container.
128 #define READ_STL_SEQ_TYPE(C) \
129 template <typename S, typename T> \
130 inline istream &ReadType(istream &strm, C<S, T> *c) { \
131 c->clear(); \
132 int64 n = 0; \
133 strm.read(reinterpret_cast<char *>(&n), sizeof(n)); \
134 StlReserve(c, n); \
135 for (ssize_t i = 0; i < n; ++i) { \
136 typename C<S, T>::value_type value; \
137 ReadType(strm, &value); \
138 c->insert(c->end(), value); \
139 } \
140 return strm; \
141 }
142
143 READ_STL_SEQ_TYPE(vector);
144 READ_STL_SEQ_TYPE(list);
145
146 // STL associative container.
147 #define READ_STL_ASSOC_TYPE(C) \
148 template <typename S, typename T, typename U> \
149 inline istream &ReadType(istream &strm, C<S, T, U> *c) { \
150 c->clear(); \
151 int64 n = 0; \
152 strm.read(reinterpret_cast<char *>(&n), sizeof(n)); \
153 for (ssize_t i = 0; i < n; ++i) { \
154 typename C<S, T, U>::value_type value; \
155 ReadType(strm, &value); \
156 c->insert(value); \
157 } \
158 return strm; \
159 }
160
161 READ_STL_ASSOC_TYPE(set);
162 READ_STL_ASSOC_TYPE(unordered_set);
163 READ_STL_ASSOC_TYPE(map);
164 READ_STL_ASSOC_TYPE(unordered_map);
165
166 // Write some types to an output stream.
167
168 // Generic case.
169 template <typename T>
WriteType(ostream & strm,const T t)170 inline ostream &WriteType(ostream &strm, const T t) {
171 t.Write(strm);
172 return strm;
173 }
174
175 // Fixed size, contiguous memory write.
176 #define WRITE_POD_TYPE(T) \
177 inline ostream &WriteType(ostream &strm, const T t) { \
178 return strm.write(reinterpret_cast<const char *>(&t), sizeof(T)); \
179 }
180
181 WRITE_POD_TYPE(bool);
182 WRITE_POD_TYPE(char);
183 WRITE_POD_TYPE(signed char);
184 WRITE_POD_TYPE(unsigned char);
185 WRITE_POD_TYPE(short);
186 WRITE_POD_TYPE(unsigned short);
187 WRITE_POD_TYPE(int);
188 WRITE_POD_TYPE(unsigned int);
189 WRITE_POD_TYPE(long);
190 WRITE_POD_TYPE(unsigned long);
191 WRITE_POD_TYPE(long long);
192 WRITE_POD_TYPE(unsigned long long);
193 WRITE_POD_TYPE(float);
194 WRITE_POD_TYPE(double);
195
196 // String case.
WriteType(ostream & strm,const string & s)197 inline ostream &WriteType(ostream &strm, const string &s) {
198 int32 ns = s.size();
199 strm.write(reinterpret_cast<const char *>(&ns), sizeof(ns));
200 return strm.write(s.data(), ns);
201 }
202
203 // Pair case.
204 template <typename S, typename T>
WriteType(ostream & strm,const pair<S,T> & p)205 inline ostream &WriteType(ostream &strm, const pair<S, T> &p) {
206 WriteType(strm, p.first);
207 WriteType(strm, p.second);
208 return strm;
209 }
210
211 // STL sequence container.
212 #define WRITE_STL_SEQ_TYPE(C) \
213 template <typename S, typename T> \
214 inline ostream &WriteType(ostream &strm, const C<S, T> &c) { \
215 int64 n = c.size(); \
216 strm.write(reinterpret_cast<char *>(&n), sizeof(n)); \
217 for (typename C<S, T>::const_iterator it = c.begin(); \
218 it != c.end(); ++it) \
219 WriteType(strm, *it); \
220 return strm; \
221 }
222
223 WRITE_STL_SEQ_TYPE(vector);
224 WRITE_STL_SEQ_TYPE(list);
225
226 // STL associative container.
227 #define WRITE_STL_ASSOC_TYPE(C) \
228 template <typename S, typename T, typename U> \
229 inline ostream &WriteType(ostream &strm, const C<S, T, U> &c) { \
230 int64 n = c.size(); \
231 strm.write(reinterpret_cast<char *>(&n), sizeof(n)); \
232 for (typename C<S, T, U>::const_iterator it = c.begin(); \
233 it != c.end(); ++it) \
234 WriteType(strm, *it); \
235 return strm; \
236 }
237
238 WRITE_STL_ASSOC_TYPE(set);
239 WRITE_STL_ASSOC_TYPE(unordered_set);
240 WRITE_STL_ASSOC_TYPE(map);
241 WRITE_STL_ASSOC_TYPE(unordered_map);
242
243 // Utilities for converting between int64 or Weight and string.
244
245 int64 StrToInt64(const string &s, const string &src, size_t nline,
246 bool allow_negative, bool *error = 0);
247
248 template <typename Weight>
StrToWeight(const string & s,const string & src,size_t nline)249 Weight StrToWeight(const string &s, const string &src, size_t nline) {
250 Weight w;
251 istringstream strm(s);
252 strm >> w;
253 if (!strm) {
254 FSTERROR() << "StrToWeight: Bad weight = \"" << s
255 << "\", source = " << src << ", line = " << nline;
256 return Weight::NoWeight();
257 }
258 return w;
259 }
260
261 void Int64ToStr(int64 n, string *s);
262
263 template <typename Weight>
WeightToStr(Weight w,string * s)264 void WeightToStr(Weight w, string *s) {
265 ostringstream strm;
266 strm.precision(9);
267 strm << w;
268 s->append(strm.str().data(), strm.str().size());
269 }
270
271 // Utilities for reading/writing integer pairs (typically labels)
272
273 // Returns true on success
274 template <typename I>
275 bool ReadIntPairs(const string& filename,
276 vector<pair<I, I> >* pairs,
277 bool allow_negative = false) {
278 ifstream strm(filename.c_str());
279
280 if (!strm) {
281 LOG(ERROR) << "ReadIntPairs: Can't open file: " << filename;
282 return false;
283 }
284
285 const int kLineLen = 8096;
286 char line[kLineLen];
287 size_t nline = 0;
288
289 pairs->clear();
290 while (strm.getline(line, kLineLen)) {
291 ++nline;
292 vector<char *> col;
293 SplitToVector(line, "\n\t ", &col, true);
294 // empty line or comment?
295 if (col.size() == 0 || col[0][0] == '\0' || col[0][0] == '#')
296 continue;
297 if (col.size() != 2) {
298 LOG(ERROR) << "ReadIntPairs: Bad number of columns, "
299 << "file = " << filename << ", line = " << nline;
300 return false;
301 }
302
303 bool err;
304 I i1 = StrToInt64(col[0], filename, nline, allow_negative, &err);
305 if (err) return false;
306 I i2 = StrToInt64(col[1], filename, nline, allow_negative, &err);
307 if (err) return false;
308 pairs->push_back(make_pair(i1, i2));
309 }
310 return true;
311 }
312
313 // Returns true on success
314 template <typename I>
WriteIntPairs(const string & filename,const vector<pair<I,I>> & pairs)315 bool WriteIntPairs(const string& filename,
316 const vector<pair<I, I> >& pairs) {
317 ostream *strm = &cout;
318 if (!filename.empty()) {
319 strm = new ofstream(filename.c_str());
320 if (!*strm) {
321 LOG(ERROR) << "WriteIntPairs: Can't open file: " << filename;
322 return false;
323 }
324 }
325
326 for (ssize_t n = 0; n < pairs.size(); ++n)
327 *strm << pairs[n].first << "\t" << pairs[n].second << "\n";
328
329 if (!*strm) {
330 LOG(ERROR) << "WriteIntPairs: Write failed: "
331 << (filename.empty() ? "standard output" : filename);
332 return false;
333 }
334 if (strm != &cout)
335 delete strm;
336 return true;
337 }
338
339 // Utilities for reading/writing label pairs
340
341 template <typename Label>
342 bool ReadLabelPairs(const string& filename,
343 vector<pair<Label, Label> >* pairs,
344 bool allow_negative = false) {
345 return ReadIntPairs(filename, pairs, allow_negative);
346 }
347
348 template <typename Label>
WriteLabelPairs(const string & filename,vector<pair<Label,Label>> & pairs)349 bool WriteLabelPairs(const string& filename,
350 vector<pair<Label, Label> >& pairs) {
351 return WriteIntPairs(filename, pairs);
352 }
353
354 // Utilities for converting a type name to a legal C symbol.
355
356 void ConvertToLegalCSymbol(string *s);
357
358
359 //
360 // UTILITIES FOR STREAM I/O
361 //
362
363 bool AlignInput(istream &strm);
364 bool AlignOutput(ostream &strm);
365
366 //
367 // UTILITIES FOR PROTOCOL BUFFER I/O
368 //
369
370
371 // An associative container for which testing membership is
372 // faster than an STL set if members are restricted to an interval
373 // that excludes most non-members. A 'Key' must have ==, !=, and < defined.
374 // Element 'NoKey' should be a key that marks an uninitialized key and
375 // is otherwise unused. 'Find()' returns an STL const_iterator to the match
376 // found, otherwise it equals 'End()'.
377 template <class Key, Key NoKey>
378 class CompactSet {
379 public:
380 typedef typename set<Key>::const_iterator const_iterator;
381
CompactSet()382 CompactSet()
383 : min_key_(NoKey),
384 max_key_(NoKey) { }
385
CompactSet(const CompactSet<Key,NoKey> & compact_set)386 CompactSet(const CompactSet<Key, NoKey> &compact_set)
387 : set_(compact_set.set_),
388 min_key_(compact_set.min_key_),
389 max_key_(compact_set.max_key_) { }
390
Insert(Key key)391 void Insert(Key key) {
392 set_.insert(key);
393 if (min_key_ == NoKey || key < min_key_)
394 min_key_ = key;
395 if (max_key_ == NoKey || max_key_ < key)
396 max_key_ = key;
397 }
398
Erase(Key key)399 void Erase(Key key) {
400 set_.erase(key);
401 if (set_.empty()) {
402 min_key_ = max_key_ = NoKey;
403 } else if (key == min_key_) {
404 ++min_key_;
405 } else if (key == max_key_) {
406 --max_key_;
407 }
408 }
409
Clear()410 void Clear() {
411 set_.clear();
412 min_key_ = max_key_ = NoKey;
413 }
414
Find(Key key)415 const_iterator Find(Key key) const {
416 if (min_key_ == NoKey ||
417 key < min_key_ || max_key_ < key)
418 return set_.end();
419 else
420 return set_.find(key);
421 }
422
Member(Key key)423 bool Member(Key key) const {
424 if (min_key_ == NoKey || key < min_key_ || max_key_ < key) {
425 return false; // out of range
426 } else if (min_key_ != NoKey && max_key_ + 1 == min_key_ + set_.size()) {
427 return true; // dense range
428 } else {
429 return set_.find(key) != set_.end();
430 }
431 }
432
Begin()433 const_iterator Begin() const { return set_.begin(); }
434
End()435 const_iterator End() const { return set_.end(); }
436
437 // All stored keys are greater than or equal to this value.
LowerBound()438 Key LowerBound() const { return min_key_; }
439
440 // All stored keys are less than or equal to this value.
UpperBound()441 Key UpperBound() const { return max_key_; }
442
443 private:
444 set<Key> set_;
445 Key min_key_;
446 Key max_key_;
447
448 void operator=(const CompactSet<Key, NoKey> &); //disallow
449 };
450
451 } // namespace fst
452
453 #endif // FST_LIB_UTIL_H__
454