1 // compile.h 2 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 // 15 // Copyright 2005-2010 Google, Inc. 16 // Author: riley@google.com (Michael Riley) 17 // 18 // \file 19 // Class to to compile a binary Fst from textual input. 20 21 #ifndef FST_SCRIPT_COMPILE_IMPL_H_ 22 #define FST_SCRIPT_COMPILE_IMPL_H_ 23 24 #include <tr1/unordered_map> 25 using std::tr1::unordered_map; 26 using std::tr1::unordered_multimap; 27 #include <sstream> 28 #include <string> 29 #include <vector> 30 using std::vector; 31 32 #include <iostream> 33 #include <fstream> 34 #include <sstream> 35 #include <fst/fst.h> 36 #include <fst/util.h> 37 #include <fst/vector-fst.h> 38 39 DECLARE_string(fst_field_separator); 40 41 namespace fst { 42 43 // Compile a binary Fst from textual input, helper class for fstcompile.cc 44 // WARNING: Stand-alone use of this class not recommended, most code should 45 // read/write using the binary format which is much more efficient. 46 template <class A> class FstCompiler { 47 public: 48 typedef A Arc; 49 typedef typename A::StateId StateId; 50 typedef typename A::Label Label; 51 typedef typename A::Weight Weight; 52 53 // WARNING: use of 'allow_negative_labels = true' not recommended; may 54 // cause conflicts 55 FstCompiler(istream &istrm, const string &source, 56 const SymbolTable *isyms, const SymbolTable *osyms, 57 const SymbolTable *ssyms, bool accep, bool ikeep, 58 bool okeep, bool nkeep, bool allow_negative_labels = false) 59 : nline_(0), source_(source), 60 isyms_(isyms), osyms_(osyms), ssyms_(ssyms), 61 nstates_(0), keep_state_numbering_(nkeep), 62 allow_negative_labels_(allow_negative_labels) { 63 char line[kLineLen]; 64 while (istrm.getline(line, kLineLen)) { 65 ++nline_; 66 vector<char *> col; 67 string separator = FLAGS_fst_field_separator + "\n"; 68 SplitToVector(line, separator.c_str(), &col, true); 69 if (col.size() == 0 || col[0][0] == '\0') // empty line 70 continue; 71 if (col.size() > 5 || 72 (col.size() > 4 && accep) || 73 (col.size() == 3 && !accep)) { 74 FSTERROR() << "FstCompiler: Bad number of columns, source = " 75 << source_ 76 << ", line = " << nline_; 77 fst_.SetProperties(kError, kError); 78 return; 79 } 80 StateId s = StrToStateId(col[0]); 81 while (s >= fst_.NumStates()) 82 fst_.AddState(); 83 if (nline_ == 1) 84 fst_.SetStart(s); 85 86 Arc arc; 87 StateId d = s; 88 switch (col.size()) { 89 case 1: 90 fst_.SetFinal(s, Weight::One()); 91 break; 92 case 2: 93 fst_.SetFinal(s, StrToWeight(col[1], true)); 94 break; 95 case 3: 96 arc.nextstate = d = StrToStateId(col[1]); 97 arc.ilabel = StrToILabel(col[2]); 98 arc.olabel = arc.ilabel; 99 arc.weight = Weight::One(); 100 fst_.AddArc(s, arc); 101 break; 102 case 4: 103 arc.nextstate = d = StrToStateId(col[1]); 104 arc.ilabel = StrToILabel(col[2]); 105 if (accep) { 106 arc.olabel = arc.ilabel; 107 arc.weight = StrToWeight(col[3], false); 108 } else { 109 arc.olabel = StrToOLabel(col[3]); 110 arc.weight = Weight::One(); 111 } 112 fst_.AddArc(s, arc); 113 break; 114 case 5: 115 arc.nextstate = d = StrToStateId(col[1]); 116 arc.ilabel = StrToILabel(col[2]); 117 arc.olabel = StrToOLabel(col[3]); 118 arc.weight = StrToWeight(col[4], false); 119 fst_.AddArc(s, arc); 120 } 121 while (d >= fst_.NumStates()) 122 fst_.AddState(); 123 } 124 if (ikeep) 125 fst_.SetInputSymbols(isyms); 126 if (okeep) 127 fst_.SetOutputSymbols(osyms); 128 } 129 Fst()130 const VectorFst<A> &Fst() const { 131 return fst_; 132 } 133 134 private: 135 // Maximum line length in text file. 136 static const int kLineLen = 8096; 137 138 int64 StrToId(const char *s, const SymbolTable *syms, 139 const char *name, bool allow_negative = false) const { 140 int64 n = 0; 141 142 if (syms) { 143 n = syms->Find(s); 144 if (n == -1 || (!allow_negative && n < 0)) { 145 FSTERROR() << "FstCompiler: Symbol \"" << s 146 << "\" is not mapped to any integer " << name 147 << ", symbol table = " << syms->Name() 148 << ", source = " << source_ << ", line = " << nline_; 149 fst_.SetProperties(kError, kError); 150 } 151 } else { 152 char *p; 153 n = strtoll(s, &p, 10); 154 if (p < s + strlen(s) || (!allow_negative && n < 0)) { 155 FSTERROR() << "FstCompiler: Bad " << name << " integer = \"" << s 156 << "\", source = " << source_ << ", line = " << nline_; 157 fst_.SetProperties(kError, kError); 158 } 159 } 160 return n; 161 } 162 StrToStateId(const char * s)163 StateId StrToStateId(const char *s) { 164 StateId n = StrToId(s, ssyms_, "state ID"); 165 166 if (keep_state_numbering_) 167 return n; 168 169 // remap state IDs to make dense set 170 typename unordered_map<StateId, StateId>::const_iterator it = states_.find(n); 171 if (it == states_.end()) { 172 states_[n] = nstates_; 173 return nstates_++; 174 } else { 175 return it->second; 176 } 177 } 178 StrToILabel(const char * s)179 StateId StrToILabel(const char *s) const { 180 return StrToId(s, isyms_, "arc ilabel", allow_negative_labels_); 181 } 182 StrToOLabel(const char * s)183 StateId StrToOLabel(const char *s) const { 184 return StrToId(s, osyms_, "arc olabel", allow_negative_labels_); 185 } 186 StrToWeight(const char * s,bool allow_zero)187 Weight StrToWeight(const char *s, bool allow_zero) const { 188 Weight w; 189 istringstream strm(s); 190 strm >> w; 191 if (!strm || (!allow_zero && w == Weight::Zero())) { 192 FSTERROR() << "FstCompiler: Bad weight = \"" << s 193 << "\", source = " << source_ << ", line = " << nline_; 194 fst_.SetProperties(kError, kError); 195 w = Weight::NoWeight(); 196 } 197 return w; 198 } 199 200 mutable VectorFst<A> fst_; 201 size_t nline_; 202 string source_; // text FST source name 203 const SymbolTable *isyms_; // ilabel symbol table 204 const SymbolTable *osyms_; // olabel symbol table 205 const SymbolTable *ssyms_; // slabel symbol table 206 unordered_map<StateId, StateId> states_; // state ID map 207 StateId nstates_; // number of seen states 208 bool keep_state_numbering_; 209 bool allow_negative_labels_; // not recommended; may cause conflicts 210 211 DISALLOW_COPY_AND_ASSIGN(FstCompiler); 212 }; 213 214 } // namespace fst 215 216 #endif // FST_SCRIPT_COMPILE_IMPL_H_ 217