1
2 // Licensed under the Apache License, Version 2.0 (the "License");
3 // you may not use this file except in compliance with the License.
4 // You may obtain a copy of the License at
5 //
6 // http://www.apache.org/licenses/LICENSE-2.0
7 //
8 // Unless required by applicable law or agreed to in writing, software
9 // distributed under the License is distributed on an "AS IS" BASIS,
10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 // See the License for the specific language governing permissions and
12 // limitations under the License.
13 //
14 // Copyright 2005-2010 Google, Inc.
15 // Authors: allauzen@google.com (Cyril Allauzen)
16 // ttai@google.com (Terry Tai)
17 // jpr@google.com (Jake Ratkiewicz)
18
19
20 #ifndef FST_EXTENSIONS_FAR_COMPILE_STRINGS_H_
21 #define FST_EXTENSIONS_FAR_COMPILE_STRINGS_H_
22
23 #include <libgen.h>
24 #include <string>
25 #include <vector>
26 using std::vector;
27
28 #include <fst/extensions/far/far.h>
29 #include <fst/string.h>
30
31 namespace fst {
32
33 // Construct a reader that provides FSTs from a file (stream) either on a
34 // line-by-line basis or on a per-stream basis. Note that the freshly
35 // constructed reader is already set to the first input.
36 //
37 // Sample Usage:
38 // for (StringReader<Arc> reader(...); !reader.Done(); reader.Next()) {
39 // Fst *fst = reader.GetVectorFst();
40 // }
41 template <class A>
42 class StringReader {
43 public:
44 typedef A Arc;
45 typedef typename A::Label Label;
46 typedef typename A::Weight Weight;
47 typedef typename StringCompiler<A>::TokenType TokenType;
48
49 enum EntryType { LINE = 1, FILE = 2 };
50
51 StringReader(istream &istrm,
52 const string &source,
53 EntryType entry_type,
54 TokenType token_type,
55 bool allow_negative_labels,
56 const SymbolTable *syms = 0,
57 Label unknown_label = kNoStateId)
58 : nline_(0), strm_(istrm), source_(source), entry_type_(entry_type),
59 token_type_(token_type), symbols_(syms), done_(false),
60 compiler_(token_type, syms, unknown_label, allow_negative_labels) {
61 Next(); // Initialize the reader to the first input.
62 }
63
Done()64 bool Done() {
65 return done_;
66 }
67
Next()68 void Next() {
69 VLOG(1) << "Processing source " << source_ << " at line " << nline_;
70 if (!strm_) { // We're done if we have no more input.
71 done_ = true;
72 return;
73 }
74 if (entry_type_ == LINE) {
75 getline(strm_, content_);
76 ++nline_;
77 } else {
78 content_.clear();
79 string line;
80 while (getline(strm_, line)) {
81 ++nline_;
82 content_.append(line);
83 content_.append("\n");
84 }
85 }
86 if (!strm_ && content_.empty()) // We're also done if we read off all the
87 done_ = true; // whitespace at the end of a file.
88 }
89
90 VectorFst<A> *GetVectorFst(bool keep_symbols = false) {
91 VectorFst<A> *fst = new VectorFst<A>;
92 if (keep_symbols) {
93 fst->SetInputSymbols(symbols_);
94 fst->SetOutputSymbols(symbols_);
95 }
96 if (compiler_(content_, fst)) {
97 return fst;
98 } else {
99 delete fst;
100 return NULL;
101 }
102 }
103
104 CompactFst<A, StringCompactor<A> > *GetCompactFst(bool keep_symbols = false) {
105 CompactFst<A, StringCompactor<A> > *fst;
106 if (keep_symbols) {
107 VectorFst<A> tmp;
108 tmp.SetInputSymbols(symbols_);
109 tmp.SetOutputSymbols(symbols_);
110 fst = new CompactFst<A, StringCompactor<A> >(tmp);
111 } else {
112 fst = new CompactFst<A, StringCompactor<A> >;
113 }
114 if (compiler_(content_, fst)) {
115 return fst;
116 } else {
117 delete fst;
118 return NULL;
119 }
120 }
121
122 private:
123 size_t nline_;
124 istream &strm_;
125 string source_;
126 EntryType entry_type_;
127 TokenType token_type_;
128 const SymbolTable *symbols_;
129 bool done_;
130 StringCompiler<A> compiler_;
131 string content_; // The actual content of the input stream's next FST.
132
133 DISALLOW_COPY_AND_ASSIGN(StringReader);
134 };
135
136 // Compute the minimal length required to encode each line number as a decimal
137 // number.
138 int KeySize(const char *filename);
139
140 template <class Arc>
FarCompileStrings(const vector<string> & in_fnames,const string & out_fname,const string & fst_type,const FarType & far_type,int32 generate_keys,FarEntryType fet,FarTokenType tt,const string & symbols_fname,const string & unknown_symbol,bool keep_symbols,bool initial_symbols,bool allow_negative_labels,bool file_list_input,const string & key_prefix,const string & key_suffix)141 void FarCompileStrings(const vector<string> &in_fnames,
142 const string &out_fname,
143 const string &fst_type,
144 const FarType &far_type,
145 int32 generate_keys,
146 FarEntryType fet,
147 FarTokenType tt,
148 const string &symbols_fname,
149 const string &unknown_symbol,
150 bool keep_symbols,
151 bool initial_symbols,
152 bool allow_negative_labels,
153 bool file_list_input,
154 const string &key_prefix,
155 const string &key_suffix) {
156 typename StringReader<Arc>::EntryType entry_type;
157 if (fet == FET_LINE) {
158 entry_type = StringReader<Arc>::LINE;
159 } else if (fet == FET_FILE) {
160 entry_type = StringReader<Arc>::FILE;
161 } else {
162 FSTERROR() << "FarCompileStrings: unknown entry type";
163 return;
164 }
165
166 typename StringCompiler<Arc>::TokenType token_type;
167 if (tt == FTT_SYMBOL) {
168 token_type = StringCompiler<Arc>::SYMBOL;
169 } else if (tt == FTT_BYTE) {
170 token_type = StringCompiler<Arc>::BYTE;
171 } else if (tt == FTT_UTF8) {
172 token_type = StringCompiler<Arc>::UTF8;
173 } else {
174 FSTERROR() << "FarCompileStrings: unknown token type";
175 return;
176 }
177
178 bool compact;
179 if (fst_type.empty() || (fst_type == "vector")) {
180 compact = false;
181 } else if (fst_type == "compact") {
182 compact = true;
183 } else {
184 FSTERROR() << "FarCompileStrings: unknown fst type: "
185 << fst_type;
186 return;
187 }
188
189 const SymbolTable *syms = 0;
190 typename Arc::Label unknown_label = kNoLabel;
191 if (!symbols_fname.empty()) {
192 SymbolTableTextOptions opts;
193 opts.allow_negative = allow_negative_labels;
194 syms = SymbolTable::ReadText(symbols_fname, opts);
195 if (!syms) {
196 FSTERROR() << "FarCompileStrings: error reading symbol table: "
197 << symbols_fname;
198 return;
199 }
200 if (!unknown_symbol.empty()) {
201 unknown_label = syms->Find(unknown_symbol);
202 if (unknown_label == kNoLabel) {
203 FSTERROR() << "FarCompileStrings: unknown label \"" << unknown_label
204 << "\" missing from symbol table: " << symbols_fname;
205 return;
206 }
207 }
208 }
209
210 FarWriter<Arc> *far_writer =
211 FarWriter<Arc>::Create(out_fname, far_type);
212 if (!far_writer) return;
213
214 vector<string> inputs;
215 if (file_list_input) {
216 for (int i = 1; i < in_fnames.size(); ++i) {
217 istream *istrm = in_fnames.empty() ? &cin :
218 new ifstream(in_fnames[i].c_str());
219 string str;
220 while (getline(*istrm, str))
221 inputs.push_back(str);
222 if (!in_fnames.empty())
223 delete istrm;
224 }
225 } else {
226 inputs = in_fnames;
227 }
228
229 for (int i = 0, n = 0; i < inputs.size(); ++i) {
230 if (generate_keys == 0 && inputs[i].empty()) {
231 FSTERROR() << "FarCompileStrings: read from a file instead of stdin or"
232 << " set the --generate_keys flags.";
233 delete far_writer;
234 delete syms;
235 return;
236 }
237 int key_size = generate_keys ? generate_keys :
238 (entry_type == StringReader<Arc>::FILE ? 1 :
239 KeySize(inputs[i].c_str()));
240 istream *istrm = inputs[i].empty() ? &cin :
241 new ifstream(inputs[i].c_str());
242
243 bool keep_syms = keep_symbols;
244 for (StringReader<Arc> reader(
245 *istrm, inputs[i].empty() ? "stdin" : inputs[i],
246 entry_type, token_type, allow_negative_labels,
247 syms, unknown_label);
248 !reader.Done();
249 reader.Next()) {
250 ++n;
251 const Fst<Arc> *fst;
252 if (compact)
253 fst = reader.GetCompactFst(keep_syms);
254 else
255 fst = reader.GetVectorFst(keep_syms);
256 if (initial_symbols)
257 keep_syms = false;
258 if (!fst) {
259 FSTERROR() << "FarCompileStrings: compiling string number " << n
260 << " in file " << inputs[i] << " failed with token_type = "
261 << (tt == FTT_BYTE ? "byte" :
262 (tt == FTT_UTF8 ? "utf8" :
263 (tt == FTT_SYMBOL ? "symbol" : "unknown")))
264 << " and entry_type = "
265 << (fet == FET_LINE ? "line" :
266 (fet == FET_FILE ? "file" : "unknown"));
267 delete far_writer;
268 delete syms;
269 if (!inputs[i].empty()) delete istrm;
270 return;
271 }
272 ostringstream keybuf;
273 keybuf.width(key_size);
274 keybuf.fill('0');
275 keybuf << n;
276 string key;
277 if (generate_keys > 0) {
278 key = keybuf.str();
279 } else {
280 char* filename = new char[inputs[i].size() + 1];
281 strcpy(filename, inputs[i].c_str());
282 key = basename(filename);
283 if (entry_type != StringReader<Arc>::FILE) {
284 key += "-";
285 key += keybuf.str();
286 }
287 delete[] filename;
288 }
289 far_writer->Add(key_prefix + key + key_suffix, *fst);
290 delete fst;
291 }
292 if (generate_keys == 0)
293 n = 0;
294 if (!inputs[i].empty())
295 delete istrm;
296 }
297
298 delete far_writer;
299 }
300
301 } // namespace fst
302
303
304 #endif // FST_EXTENSIONS_FAR_COMPILE_STRINGS_H_
305