1 // util.h
2 
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 // Copyright 2005-2010 Google, Inc.
16 // Author: riley@google.com (Michael Riley)
17 //
18 // \file
19 // FST utility inline definitions.
20 
21 #ifndef FST_LIB_UTIL_H__
22 #define FST_LIB_UTIL_H__
23 
24 #include <tr1/unordered_map>
25 using std::tr1::unordered_map;
26 using std::tr1::unordered_multimap;
27 #include <tr1/unordered_set>
28 using std::tr1::unordered_set;
29 using std::tr1::unordered_multiset;
30 #include <list>
31 #include <map>
32 #include <set>
33 #include <sstream>
34 #include <string>
35 #include <vector>
36 using std::vector;
37 
38 
39 #include <fst/compat.h>
40 #include <fst/types.h>
41 
42 #include <iostream>
43 #include <fstream>
44 #include <sstream>
45 
46 //
47 // UTILITY FOR ERROR HANDLING
48 //
49 
50 DECLARE_bool(fst_error_fatal);
51 
52 #define FSTERROR() (FLAGS_fst_error_fatal ? LOG(FATAL) : LOG(ERROR))
53 
54 namespace fst {
55 
56 //
57 // UTILITIES FOR TYPE I/O
58 //
59 
60 // Read some types from an input stream.
61 
62 // Generic case.
63 template <typename T>
ReadType(istream & strm,T * t)64 inline istream &ReadType(istream &strm, T *t) {
65   return t->Read(strm);
66 }
67 
68 // Fixed size, contiguous memory read.
69 #define READ_POD_TYPE(T)                                    \
70 inline istream &ReadType(istream &strm, T *t) {             \
71   return strm.read(reinterpret_cast<char *>(t), sizeof(T)); \
72 }
73 
74 READ_POD_TYPE(bool);
75 READ_POD_TYPE(char);
76 READ_POD_TYPE(signed char);
77 READ_POD_TYPE(unsigned char);
78 READ_POD_TYPE(short);
79 READ_POD_TYPE(unsigned short);
80 READ_POD_TYPE(int);
81 READ_POD_TYPE(unsigned int);
82 READ_POD_TYPE(long);
83 READ_POD_TYPE(unsigned long);
84 READ_POD_TYPE(long long);
85 READ_POD_TYPE(unsigned long long);
86 READ_POD_TYPE(float);
87 READ_POD_TYPE(double);
88 
89 // String case.
ReadType(istream & strm,string * s)90 inline istream &ReadType(istream &strm, string *s) {
91   s->clear();
92   int32 ns = 0;
93   strm.read(reinterpret_cast<char *>(&ns), sizeof(ns));
94   for (int i = 0; i < ns; ++i) {
95     char c;
96     strm.read(&c, 1);
97     *s += c;
98   }
99   return strm;
100 }
101 
102 // Pair case.
103 template <typename S, typename T>
ReadType(istream & strm,pair<S,T> * p)104 inline istream &ReadType(istream &strm, pair<S, T> *p) {
105   ReadType(strm, &p->first);
106   ReadType(strm, &p->second);
107   return strm;
108 }
109 
110 template <typename S, typename T>
ReadType(istream & strm,pair<const S,T> * p)111 inline istream &ReadType(istream &strm, pair<const S, T> *p) {
112   ReadType(strm, const_cast<S *>(&p->first));
113   ReadType(strm, &p->second);
114   return strm;
115 }
116 
117 // General case - no-op.
118 template <typename C>
StlReserve(C * c,int64 n)119 void StlReserve(C *c, int64 n) {}
120 
121 // Specialization for vectors.
122 template <typename S, typename T>
StlReserve(vector<S,T> * c,int64 n)123 void StlReserve(vector<S, T> *c, int64 n) {
124   c->reserve(n);
125 }
126 
127 // STL sequence container.
128 #define READ_STL_SEQ_TYPE(C)                             \
129 template <typename S, typename T>                        \
130 inline istream &ReadType(istream &strm, C<S, T> *c) {    \
131   c->clear();                                            \
132   int64 n = 0;                                           \
133   strm.read(reinterpret_cast<char *>(&n), sizeof(n));    \
134   StlReserve(c, n);                                      \
135   for (ssize_t i = 0; i < n; ++i) {                      \
136     typename C<S, T>::value_type value;                  \
137     ReadType(strm, &value);                              \
138     c->insert(c->end(), value);                          \
139   }                                                      \
140   return strm;                                           \
141 }
142 
143 READ_STL_SEQ_TYPE(vector);
144 READ_STL_SEQ_TYPE(list);
145 
146 // STL associative container.
147 #define READ_STL_ASSOC_TYPE(C)                           \
148 template <typename S, typename T, typename U>            \
149 inline istream &ReadType(istream &strm, C<S, T, U> *c) { \
150   c->clear();                                            \
151   int64 n = 0;                                           \
152   strm.read(reinterpret_cast<char *>(&n), sizeof(n));    \
153   for (ssize_t i = 0; i < n; ++i) {                      \
154     typename C<S, T, U>::value_type value;               \
155     ReadType(strm, &value);                              \
156     c->insert(value);                                    \
157   }                                                      \
158   return strm;                                           \
159 }
160 
161 READ_STL_ASSOC_TYPE(set);
162 READ_STL_ASSOC_TYPE(unordered_set);
163 READ_STL_ASSOC_TYPE(map);
164 READ_STL_ASSOC_TYPE(unordered_map);
165 
166 // Write some types to an output stream.
167 
168 // Generic case.
169 template <typename T>
WriteType(ostream & strm,const T t)170 inline ostream &WriteType(ostream &strm, const T t) {
171   t.Write(strm);
172   return strm;
173 }
174 
175 // Fixed size, contiguous memory write.
176 #define WRITE_POD_TYPE(T)                                           \
177 inline ostream &WriteType(ostream &strm, const T t) {               \
178   return strm.write(reinterpret_cast<const char *>(&t), sizeof(T)); \
179 }
180 
181 WRITE_POD_TYPE(bool);
182 WRITE_POD_TYPE(char);
183 WRITE_POD_TYPE(signed char);
184 WRITE_POD_TYPE(unsigned char);
185 WRITE_POD_TYPE(short);
186 WRITE_POD_TYPE(unsigned short);
187 WRITE_POD_TYPE(int);
188 WRITE_POD_TYPE(unsigned int);
189 WRITE_POD_TYPE(long);
190 WRITE_POD_TYPE(unsigned long);
191 WRITE_POD_TYPE(long long);
192 WRITE_POD_TYPE(unsigned long long);
193 WRITE_POD_TYPE(float);
194 WRITE_POD_TYPE(double);
195 
196 // String case.
WriteType(ostream & strm,const string & s)197 inline ostream &WriteType(ostream &strm, const string &s) {
198   int32 ns = s.size();
199   strm.write(reinterpret_cast<const char *>(&ns), sizeof(ns));
200   return strm.write(s.data(), ns);
201 }
202 
203 // Pair case.
204 template <typename S, typename T>
WriteType(ostream & strm,const pair<S,T> & p)205 inline ostream &WriteType(ostream &strm, const pair<S, T> &p) {
206   WriteType(strm, p.first);
207   WriteType(strm, p.second);
208   return strm;
209 }
210 
211 // STL sequence container.
212 #define WRITE_STL_SEQ_TYPE(C)                                                \
213 template <typename S, typename T>                                            \
214 inline ostream &WriteType(ostream &strm, const C<S, T> &c) {                 \
215   int64 n = c.size();                                                        \
216   strm.write(reinterpret_cast<char *>(&n), sizeof(n));                       \
217   for (typename C<S, T>::const_iterator it = c.begin();                      \
218        it != c.end(); ++it)                                                  \
219      WriteType(strm, *it);                                                   \
220   return strm;                                                               \
221 }
222 
223 WRITE_STL_SEQ_TYPE(vector);
224 WRITE_STL_SEQ_TYPE(list);
225 
226 // STL associative container.
227 #define WRITE_STL_ASSOC_TYPE(C)                                              \
228 template <typename S, typename T, typename U>                                \
229 inline ostream &WriteType(ostream &strm, const C<S, T, U> &c) {              \
230   int64 n = c.size();                                                        \
231   strm.write(reinterpret_cast<char *>(&n), sizeof(n));                       \
232   for (typename C<S, T, U>::const_iterator it = c.begin();                   \
233        it != c.end(); ++it)                                                  \
234      WriteType(strm, *it);                                                   \
235   return strm;                                                               \
236 }
237 
238 WRITE_STL_ASSOC_TYPE(set);
239 WRITE_STL_ASSOC_TYPE(unordered_set);
240 WRITE_STL_ASSOC_TYPE(map);
241 WRITE_STL_ASSOC_TYPE(unordered_map);
242 
243 // Utilities for converting between int64 or Weight and string.
244 
245 int64 StrToInt64(const string &s, const string &src, size_t nline,
246                  bool allow_negative, bool *error = 0);
247 
248 template <typename Weight>
StrToWeight(const string & s,const string & src,size_t nline)249 Weight StrToWeight(const string &s, const string &src, size_t nline) {
250   Weight w;
251   istringstream strm(s);
252   strm >> w;
253   if (!strm) {
254     FSTERROR() << "StrToWeight: Bad weight = \"" << s
255                << "\", source = " << src << ", line = " << nline;
256     return Weight::NoWeight();
257   }
258   return w;
259 }
260 
261 void Int64ToStr(int64 n, string *s);
262 
263 template <typename Weight>
WeightToStr(Weight w,string * s)264 void WeightToStr(Weight w, string *s) {
265   ostringstream strm;
266   strm.precision(9);
267   strm << w;
268   s->append(strm.str().data(), strm.str().size());
269 }
270 
271 // Utilities for reading/writing integer pairs (typically labels)
272 
273 // Returns true on success
274 template <typename I>
275 bool ReadIntPairs(const string& filename,
276                     vector<pair<I, I> >* pairs,
277                     bool allow_negative = false) {
278   ifstream strm(filename.c_str());
279 
280   if (!strm) {
281     LOG(ERROR) << "ReadIntPairs: Can't open file: " << filename;
282     return false;
283   }
284 
285   const int kLineLen = 8096;
286   char line[kLineLen];
287   size_t nline = 0;
288 
289   pairs->clear();
290   while (strm.getline(line, kLineLen)) {
291     ++nline;
292     vector<char *> col;
293     SplitToVector(line, "\n\t ", &col, true);
294     // empty line or comment?
295     if (col.size() == 0 || col[0][0] == '\0' || col[0][0] == '#')
296       continue;
297     if (col.size() != 2) {
298       LOG(ERROR) << "ReadIntPairs: Bad number of columns, "
299                  << "file = " << filename << ", line = " << nline;
300       return false;
301     }
302 
303     bool err;
304     I i1 = StrToInt64(col[0], filename, nline, allow_negative, &err);
305     if (err) return false;
306     I i2 = StrToInt64(col[1], filename, nline, allow_negative, &err);
307     if (err) return false;
308     pairs->push_back(make_pair(i1, i2));
309   }
310   return true;
311 }
312 
313 // Returns true on success
314 template <typename I>
WriteIntPairs(const string & filename,const vector<pair<I,I>> & pairs)315 bool WriteIntPairs(const string& filename,
316                    const vector<pair<I, I> >& pairs) {
317   ostream *strm = &cout;
318   if (!filename.empty()) {
319     strm = new ofstream(filename.c_str());
320     if (!*strm) {
321       LOG(ERROR) << "WriteIntPairs: Can't open file: " << filename;
322       return false;
323     }
324   }
325 
326   for (ssize_t n = 0; n < pairs.size(); ++n)
327     *strm << pairs[n].first << "\t" << pairs[n].second << "\n";
328 
329   if (!*strm) {
330     LOG(ERROR) << "WriteIntPairs: Write failed: "
331                << (filename.empty() ? "standard output" : filename);
332     return false;
333   }
334   if (strm != &cout)
335     delete strm;
336   return true;
337 }
338 
339 // Utilities for reading/writing label pairs
340 
341 template <typename Label>
342 bool ReadLabelPairs(const string& filename,
343                     vector<pair<Label, Label> >* pairs,
344                     bool allow_negative = false) {
345   return ReadIntPairs(filename, pairs, allow_negative);
346 }
347 
348 template <typename Label>
WriteLabelPairs(const string & filename,vector<pair<Label,Label>> & pairs)349 bool WriteLabelPairs(const string& filename,
350                      vector<pair<Label, Label> >& pairs) {
351   return WriteIntPairs(filename, pairs);
352 }
353 
354 // Utilities for converting a type name to a legal C symbol.
355 
356 void ConvertToLegalCSymbol(string *s);
357 
358 
359 //
360 // UTILITIES FOR STREAM I/O
361 //
362 
363 bool AlignInput(istream &strm);
364 bool AlignOutput(ostream &strm);
365 
366 //
367 // UTILITIES FOR PROTOCOL BUFFER I/O
368 //
369 
370 
371 // An associative container for which testing membership is
372 // faster than an STL set if members are restricted to an interval
373 // that excludes most non-members. A 'Key' must have ==, !=, and < defined.
374 // Element 'NoKey' should be a key that marks an uninitialized key and
375 // is otherwise unused. 'Find()' returns an STL const_iterator to the match
376 // found, otherwise it equals 'End()'.
377 template <class Key, Key NoKey>
378 class CompactSet {
379 public:
380   typedef typename set<Key>::const_iterator const_iterator;
381 
CompactSet()382   CompactSet()
383     : min_key_(NoKey),
384       max_key_(NoKey) { }
385 
CompactSet(const CompactSet<Key,NoKey> & compact_set)386   CompactSet(const CompactSet<Key, NoKey> &compact_set)
387     : set_(compact_set.set_),
388       min_key_(compact_set.min_key_),
389       max_key_(compact_set.max_key_) { }
390 
Insert(Key key)391   void Insert(Key key) {
392     set_.insert(key);
393     if (min_key_ == NoKey || key < min_key_)
394       min_key_ = key;
395     if (max_key_ == NoKey || max_key_ < key)
396         max_key_ = key;
397   }
398 
Erase(Key key)399   void Erase(Key key) {
400     set_.erase(key);
401     if (set_.empty()) {
402         min_key_ = max_key_ = NoKey;
403     } else if (key == min_key_) {
404       ++min_key_;
405     } else if (key == max_key_) {
406       --max_key_;
407     }
408   }
409 
Clear()410   void Clear() {
411     set_.clear();
412     min_key_ = max_key_ = NoKey;
413   }
414 
Find(Key key)415   const_iterator Find(Key key) const {
416     if (min_key_ == NoKey ||
417         key < min_key_ || max_key_ < key)
418       return set_.end();
419     else
420       return set_.find(key);
421   }
422 
Member(Key key)423   bool Member(Key key) const {
424     if (min_key_ == NoKey || key < min_key_ || max_key_ < key) {
425       return false;   // out of range
426     } else if (min_key_ != NoKey && max_key_ + 1 == min_key_ + set_.size()) {
427       return true;    // dense range
428     } else {
429       return set_.find(key) != set_.end();
430     }
431   }
432 
Begin()433   const_iterator Begin() const { return set_.begin(); }
434 
End()435   const_iterator End() const { return set_.end(); }
436 
437   // All stored keys are greater than or equal to this value.
LowerBound()438   Key LowerBound() const { return min_key_; }
439 
440   // All stored keys are less than or equal to this value.
UpperBound()441   Key UpperBound() const { return max_key_; }
442 
443 private:
444   set<Key> set_;
445   Key min_key_;
446   Key max_key_;
447 
448   void operator=(const CompactSet<Key, NoKey> &);  //disallow
449 };
450 
451 }  // namespace fst
452 
453 #endif  // FST_LIB_UTIL_H__
454