Edinburgh Speech Tools  2.4-release
 All Classes Functions Variables Typedefs Enumerations Enumerator Friends Pages
ngram_build_main.cc
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1995,1996 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Authors: Simon King */
34 /* Date : July 1995 */
35 /*-----------------------------------------------------------------------*/
36 /* EST_Ngrammar build program */
37 /* */
38 /*=======================================================================*/
39 
40 #include <cstdlib>
41 #include "EST.h"
42 #include "EST_Ngrammar.h"
43 #include "EST_Pathname.h"
44 
45 
46 
47 /** @name <command>ngram_build</command> <emphasis>Train n-gram language model</emphasis>
48  @id ngram_build_manual
49  * @toc
50  */
51 
52 //@{
53 
54 
55 /**@name Synopsis
56  */
57 //@{
58 
59 //@synopsis
60 
61 /**
62 ngram_build offers basic ngram language model estimation.
63 
64 
65 <formalpara>
66 <para><title>Input data format</title></para>
67 
68 <para> Two input formats are supported. In sentence_per_line format,
69 the program will deal with start and end of sentence (if required) by
70 using special vocabulary items specified by -prev_tag, -prev_prev_tag
71 and -last_tag. For example, the input sentence: </para>
72 
73 <screen>
74 the cat sat on the mat
75 </screen>
76 
77 would be treated as
78 
79 <screen>
80 ... prev_prev_tag prev_prev_tag prev_tag the cat sat on the mat last_tag
81 </screen>
82 
83 where prev_prev_tag is the argument to -prev_prev_tag, and so on. A
84 default set of tag names is also available. This input format is only
85 useful for sliding-window type applications (e.g. language modelling
86 for speech recognition).
87 
88 The second input format is ngram_per_line which is useful for either
89 non-sliding-window applications, or where the user requires an
90 alternative treatment of start/end of sentence to that provided
91 above. Now the input file simply contains a complete ngram per
92 line. For the same example as above (to build a trigram model) this
93 would be:
94 
95 <para>
96 <screen>
97 prev_prev_tag prev_tag the
98 prev_tag the cat
99 the cat sat
100 cat sat on
101 sat on the
102 on the mat
103 the mat last_tag
104 </screen>
105 </para>
106 
107 </formalpara>
108 
109 
110 <formalpara>
111 <para><title>Representation</title></para>
112 
113 \[V^N\]
114 
115 <para> The internal representation of the model becomes important for
116 higher values of N where, if V is the vocabulary size, \(V^N\) becomes
117 very large. In such cases, we cannot explicitly hold probabilities for
118 all possible ngrams, and a sparse representation must be used
119 (i.e. only non-zero probabilities are stored).</para>
120 </formalpara>
121 
122 <formalpara>
123 <para><title>Getting more robust probability estimates</title></para>
124 <para>The common techniques for getting better estimates of the low/zero
125 frequency ngrams are provided: namely smoothing and backing-off.</para>
126 </formalpara>
127 
128 <formalpara>
129 <para><title>Testing an ngram model</title></para>
130 <para>Use the <link linkend="ngram-test-manual">ngram_test</link> program.</para>
131 </formalpara>
132 
133 */
134 
135 //@}
136 
137 /**@name OPTIONS
138  */
139 //@{
140 
141 //@options
142 
143 //@}
144 
145 
146 int main(int argc, char **argv)
147 {
148  int order;
149  EST_StrList files;
150  EST_Option al, op;
151  EST_String wordlist_file,wordlist_file2, out_file, format;
152  EST_String prev_tag(""), prev_prev_tag(""), last_tag("");
153  EST_String input_format(""), oov_mode(""), oov_marker("");
154  EST_Ngrammar::representation_t representation =
155  EST_Ngrammar::dense;
156 
157  EST_StrList wordlist,wordlist2;
158  EST_Ngrammar ngrammar;
159  bool trace=false;
160  double floor=0.0;
161 
162  parse_command_line
163  (argc, argv,
164  EST_String("[input file0] [input file1] ... -o [output file]\n")+
165  "-w <ifile> filename containing word list (required)\n"+
166  "-p <ifile> filename containing predictee word list\n"+
167  " (default is to use wordlist given by -w)\n"+
168  "-order <int> order, 1=unigram, 2=bigram etc. (default 2)\n"+
169  "-smooth <int> Good-Turing smooth the grammar up to the\n"+
170  " given frequency\n"+
171  "-o <ofile> Output file for constructed ngram\n"+
172  "\n"
173  "-input_format <string>\n"+
174  " format of input data (default sentence_per_line)\n"+
175  " may be sentence_per_file, ngram_per_line.\n"+
176  "-otype <string> format of output file, one of cstr_ascii\n"+
177  " cstr_bin or htk_ascii\n"+
178  "-sparse build ngram in sparse representation\n"+
179  "-dense build ngram in dense representation (default)\n"+
180  "-backoff <int>\n"+
181  " build backoff ngram (requires -smooth)\n"+
182  "-floor <double>\n"+
183  " frequency floor value used with some ngrams\n"+
184  "-freqsmooth <int>\n"+
185  " build frequency backed off smoothed ngram, this\n"+
186  " requires -smooth option\n"+
187  "-trace give verbose outout about build process\n"+
188  "-save_compressed save ngram in gzipped format\n"+
189  "-oov_mode <string>\n"+
190  " what to do about out-of-vocabulary words,\n"+
191  " one of skip_ngram, skip_sentence (default),\n"+
192  " skip_file, or use_oov_marker\n"+
193  "-oov_marker <string>\n"+
194  " special word for oov words (default "+OOV_MARKER+")\n"+
195  " (use in conjunction with '-oov_mode use_oov_marker'\n"+
196  "\n"+
197  "Pseudo-words :\n"+
198  "-prev_tag <string>\n"+
199  " tag before sentence start\n"+
200  "-prev_prev_tag <string>\n"+
201  " all words before 'prev_tag'\n"+
202  "-last_tag <string>\n"+
203  " after sentence end\n"+
204  "-default_tags use default tags of "+SENTENCE_START_MARKER+
205  ","+SENTENCE_END_MARKER+" and "+SENTENCE_END_MARKER+"\n"+
206  " respectively\n",
207  files, al);
208 
209  if (al.present("-input_format"))
210  input_format = al.val("-input_format");
211  else
212  input_format = "sentence_per_line";
213 
214  if (al.present("-oov_mode"))
215  oov_mode = al.val("-oov_mode");
216  else
217  oov_mode = "skip_sentence";
218 
219 
220  if(al.present("-oov_marker"))
221  {
222  if(oov_mode != "use_oov_marker")
223  {
224  cerr << "Error : can only use -oov_marker with '-oov_mode use_oov_marker'" << endl;
225  exit(1);
226  }
227  else
228  oov_marker = al.val("-oov_marker");
229 
230  // should check oov marker is/isn't (?) in vocab
231  // ......
232  }
233 
234  if( (oov_mode != "skip_ngram") &&
235  (oov_mode != "skip_sentence") &&
236  (oov_mode != "skip_file") &&
237  (oov_mode != "use_oov_marker") )
238  {
239  cerr << oov_mode << " is not a valid oov_mode !" << endl;
240  exit(1);
241  }
242 
243  if (al.present("-w"))
244  wordlist_file = al.val("-w");
245  else{
246  cerr << "build_ngram: Must specify a wordlist with -w" << endl;
247  exit(1);
248  }
249 
250  if (load_StrList(wordlist_file,wordlist) != format_ok)
251  {
252  cerr << "build_ngram: Could not read wordlist from file "
253  << wordlist_file << endl;
254  exit(1);
255  }
256 
257 
258  if (al.present("-p"))
259  {
260 
261  if(input_format != "ngram_per_line")
262  {
263  cerr << "Can't have differering predictor/predictee lists unless data is in ngram_per_line format !" << endl;
264  exit(1);
265  }
266 
267  wordlist_file2 = al.val("-p");
268  if (load_StrList(wordlist_file2,wordlist2) != format_ok)
269  {
270  cerr << "build_ngram: Could not read predictee list from file "
271  << wordlist_file2 << endl;
272  exit(1);
273  }
274  }
275 
276  if (al.present("-trace"))
277  trace=true;
278 
279  if (al.present("-o"))
280  out_file = al.val("-o");
281  else
282  out_file = "-";
283 
284  if (al.present("-default_tags"))
285  {
286  prev_tag = SENTENCE_START_MARKER;
287  prev_prev_tag = SENTENCE_END_MARKER;
288  last_tag = SENTENCE_END_MARKER;
289 
290  wordlist.append(SENTENCE_START_MARKER);
291  wordlist.append(SENTENCE_END_MARKER);
292 
293  if (al.present("-p"))
294  {
295  wordlist2.append(SENTENCE_START_MARKER);
296  wordlist2.append(SENTENCE_END_MARKER);
297  }
298  }
299 
300  if (al.present("-prev_tag"))
301  {
302  if (al.present("-default_tags"))
303  cerr << "build_ngram: WARNING : -prev_tag overrides -default_tags"
304  << endl;
305  prev_tag = al.val("-prev_tag");
306  }
307 
308  if (al.present("-prev_prev_tag"))
309  {
310  if (al.present("-default_tags"))
311  cerr << "build_ngram: WARNING : -prev_prev_tag overrides -default_tags"
312  << endl;
313  prev_prev_tag = al.val("-prev_prev_tag");
314  }
315 
316  if (al.present("-last_tag"))
317  {
318  if (al.present("-default_tags"))
319  cerr << "build_ngram: WARNING : -last_tag overrides -default_tags"
320  << endl;
321  last_tag = al.val("-last_tag");
322  }
323 
324  if ( ( (prev_tag=="") || (prev_prev_tag=="") || (last_tag=="") )
325  && ( (prev_tag!="") || (prev_prev_tag!="") || (last_tag!="") ) )
326  {
327  cerr << "build_ngram: ERROR : if any tags are given, ALL must be given"
328  << endl;
329  exit(1);
330  }
331 
332  if (al.present("-order"))
333  order = al.ival("-order");
334  else
335  {
336  cerr << "build_ngram: WARNING : No order specified with -order : defaulting to bigram"
337  << endl;
338  order = 2;
339  }
340 
341  if (al.present("-otype"))
342  format = al.val("-otype");
343  else
344  format = "";
345 
346  if (al.present("-floor"))
347  floor = al.dval("-floor");
348  else
349  floor = 0.0;
350 
351  if (al.present("-backoff"))
352  if (!al.present("-smooth"))
353  {
354  cerr << "build_ngram: backoff requires smooth value" << endl;
355  exit(-1);
356  }
357  if (al.present("-freqsmooth"))
358  if (!al.present("-smooth"))
359  {
360  cerr << "build_ngram: frequency smooth requires smooth value"
361  << endl;
362  exit(-1);
363  }
364 
365  if (al.present("-dense"))
366  representation = EST_Ngrammar::dense;
367  else if (al.present("-sparse"))
368  {
369  cerr << "build_ngram: Sorry, sparse representation is not yet available " << endl;
370  exit(1);
371  representation = EST_Ngrammar::sparse;
372  }
373  else if (al.present("-backoff"))
374  representation = EST_Ngrammar::backoff;
375  else
376  cerr << "build_ngram: Defaulting to dense representation" << endl;
377 
378  if (al.present("-p"))
379  {
380  if (!ngrammar.init(order,representation,wordlist,wordlist2))
381  {
382  cerr << "build_ngram: Failed to initialise " << order << "-gram" << endl;
383  exit(1);
384  }
385  }
386  else
387  {
388  if (!ngrammar.init(order,representation,wordlist))
389  {
390  cerr << "build_ngram: Failed to initialise " << order << "-gram" << endl;
391  exit(1);
392  }
393  }
394 
395 
396  if ( al.present("-backoff") )
397  {
398  if (!ngrammar.build(files,prev_tag,prev_prev_tag,
399  last_tag,input_format,oov_mode,
400  al.ival("-backoff"),al.ival("-smooth")))
401  {
402  cerr << "build_ngram: Failed to build backoff " << order
403  << "-gram" << endl;
404  exit(1);
405  }
406  else if (trace)
407  cerr << "build_ngram: Built backoff " << order <<
408  "-gram" << endl;
409  }
410  else
411  {
412  if (!ngrammar.build(files,prev_tag,prev_prev_tag,
413  last_tag,input_format,oov_mode))
414  {
415  cerr << "build_ngram: Failed to build " << order << "-gram" << endl;
416  exit(1);
417  }
418  else
419  if(trace)
420  cerr << "build_ngram: Built " << order << "-gram" << endl;
421  }
422 
423 
424  // Posit processing functions
425  if (al.present("-freqsmooth"))
426  {
427  Ngram_freqsmooth(ngrammar,al.ival("-smooth"),al.ival("-freqsmooth"));
428  }
429  else if (al.present("-smooth") && !al.present("-backoff"))
430  {
431  int smoothcount = atoi(al.val("-smooth"));
432  if(!Good_Turing_smooth(ngrammar,smoothcount,0))
433  {
434  cerr << "build_ngram: Failed to smooth " << order << "-gram" << endl;
435  exit(1);
436  }
437  else
438  if(trace)
439  cerr << "build_ngram: Good Turing smoothed " << order << "-gram" << endl;
440 
441  }
442 
443  // save
444  if (al.present("-save_compressed"))
445  {
446  EST_String tmp_file = make_tmp_filename();
447  if (ngrammar.save(tmp_file,format,trace,floor) == write_ok)
448  {
449  EST_String prog_name;
450  EST_Pathname tmp(out_file);
451  if (tmp.extension() == GZIP_FILENAME_EXTENSION)
452  prog_name = "gzip --stdout";
453  else if (tmp.extension() == COMPRESS_FILENAME_EXTENSION)
454  prog_name = "compress -c";
455  else // default
456  {
457  prog_name = "gzip --stdout";
458  if(out_file != "-")
459  out_file = out_file + "." + GZIP_FILENAME_EXTENSION;
460  }
461 
462  if (trace)
463  cerr << "build_ngram: Compressing with '" << prog_name << "'" << endl;
464 
465  // now compress
466  if(compress_file(tmp_file,out_file,prog_name) != 0)
467  {
468  cerr << "build_ngram: Failed to compress to file "
469  << out_file << endl;
470  (void)delete_file(tmp_file);
471  exit(1);
472  }
473 
474  (void)delete_file(tmp_file);
475 
476  if(trace)
477  cerr << "build_ngram: Saved in compressed " << format
478  << " format to " << out_file << endl;
479  }
480  else
481  {
482  cerr << "build_ngram: Failed to write temporary file "
483  << tmp_file << endl;
484  exit(1);
485  }
486 
487 
488  }
489  else
490  {
491  if (ngrammar.save(out_file,format,trace,floor) == write_ok)
492  {
493  if(trace)
494  cerr << "build_ngram: Saved in " << format
495  << " format to " << out_file << endl;
496  }
497  else
498  {
499  cerr << "build_ngram: Failed to save " << format << " format data to "
500  << out_file << endl;
501  exit(1);
502  }
503  }
504 
505 
506  // everything went okay
507  return 0;
508 }