Edinburgh Speech Tools  2.4-release
 All Classes Functions Variables Typedefs Enumerations Enumerator Friends Pages
ngram_test_main.cc
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1995,1996 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Authors: Simon King */
34 /* Date : July 1995 */
35 /*-----------------------------------------------------------------------*/
36 /* EST_Ngrammar test program */
37 /* */
38 /*=======================================================================*/
39 #include "EST.h"
40 #include "EST_Ngrammar.h"
41 
42 
43 /** @name <command>ngram_test</command> <emphasis> Test n-gram language model </emphasis>
44  @id ngram_test_manual
45  * @toc
46  */
47 
48 //@{
49 
50 
51 /**@name Synopsis
52  */
53 //@{
54 
55 //@synopsis
56 
57 /**
58 ngram_test is for testing ngram models generated from
59 <link linkend=ngram-build-manual>ngram_build</link>.
60 
61 <formalpara> <para> <title> How do we test an ngram model ? </title>
62 </para>
63 
64 <para> ngram_test will compute the entropy (or perplexity, see below)
65 of some test data, given an ngram model. The entropy gives a measure
66 of how likely the ngram model is to have generated the test
67 data. Entropy is defined (for a sliding-window type ngram) as:
68 
69 \[H = -\frac{1}{Q} \sum_{i=1}^{Q} log P(w_i | w_{i-1}, w_{i-2},... w_{i-N+1}) \]
70 
71 where \(Q\) is the number of words of test data and \(N\) is the order
72 of the ngram model. Perplexity is a more intuitive measure, defined as:
73 
74 \[B = 2^H \]
75 
76 The perplexity of an ngram model with vocabulary size V will be
77 between 1 and V. Low perplexity indicates a more predictable language,
78 and in speech recognition, models with low perplexity on test data
79 (i.e. data NOT used to estimate the model in the first place)
80 typically give better recognition accuracy than models with higher
81 perplexity (this is not guaranteed, however).
82 
83 ngram_test works with non-sliding-window type models when the input
84 format is <parameter>ngram_per_line</parameter>.
85 
86 </para>
87 </formalpara>
88 
89 <formalpara>
90 <para><title>Input data format</title></para>
91 <para> The data input format options are the same as
92 <link linkend=ngram-build-manual>ngram_build</link>, as is the treatment of sentence start/end using
93 special tags.
94 </para>
95 <para>
96 
97 Note: To get meaningful entropy/perplexity figures, it is recommended that
98 you use the same data input format in both
99 <link linkend=ngram-build-manual>ngram_build</link> and <link linkend=ngram-test-manual>ngram_test</link>, and the treatment of
100 sentence start/end should be the same.
101 </para>
102 </formalpara>
103 
104 
105 @see ngram_build */
106 //@}
107 
108 /**@name OPTIONS
109  */
110 //@{
111 
112 //@options
113 
114 //@}
115 
116 
/* ngram_test: evaluate an ngram grammar (built by ngram_build) on test
 * data, reporting entropy and perplexity — overall and, optionally, per
 * test file, in raw and/or brief form.
 *
 * Returns 0 on success; exits with status 1 on bad arguments or on an
 * unreadable grammar / wordlist / script file.
 */
int main(int argc, char **argv)
{
    //int order;
    EST_StrList files,script;     // files: command-line args; script: final list of test files
    EST_Option al, op;            // al: parsed command-line options
    EST_String wordlist_file, script_file, in_file, format;
    EST_String prev_tag, prev_prev_tag, last_tag;  // sentence start/end pseudo-word tags
    EST_Litem *p;
    //EST_Ngrammar::representation_t representation =
    //EST_Ngrammar::dense;

    EST_StrList wordlist;
    EST_Ngrammar ngrammar;
    bool per_file_stats=false;    // -f : print stats for each test file
    bool raw_stats=false;         // -raw_stats : also print unnormalised entropy and count
    bool brief=false;             // -brief : terse one-line output format

    EST_String input_format;

    // Running totals over all test files: entropy is accumulated
    // unnormalised (total_raw_H) and divided by total_count at the end.
    double raw_entropy,count,entropy,perplexity,total_raw_H,total_count;
    total_count = 0;
    total_raw_H = 0;

    parse_command_line
        (argc, argv,
         EST_String("[input file0] [input file1] ...\n")+
         "-g <ifile> grammar file (required)\n"+
         "-w <ifile> filename containing word list (required for some grammar formats)\n"+
         "-S <ifile> script file\n"+
         "-raw_stats print unnormalised entropy and sample count\n"+
         "-brief print results in brief format\n"+
         "-f print stats for each file\n"+
         "\n"+
         "-input_format <string>\n"+
         " format of input data (default sentence_per_line)\n"+
         " may also be sentence_per_file, or ngram_per_line.\n"+
         "\n"+
         "Pseudo-words :\n"+
         "-prev_tag <string>\n"+
         " tag before sentence start\n"+
         "-prev_prev_tag <string>\n"+
         " all words before 'prev_tag'\n"+
         "-last_tag <string>\n"+
         " after sentence end\n"+
         "-default_tags\n"+
         " use default tags of "+SENTENCE_START_MARKER+
         ","+SENTENCE_END_MARKER+" and "+SENTENCE_END_MARKER+"\n"+
         " respectively\n",
         files, al);

    // A word list is only needed for grammar formats that do not embed
    // their own vocabulary; the empty string means "not supplied".
    if (al.present("-w"))
        wordlist_file = al.val("-w");
    else{
        wordlist_file = "";
    }

    if (al.present("-f"))
        per_file_stats = true;
    if (al.present("-input_format"))
        input_format = al.val("-input_format");
    else
        input_format = "sentence_per_line";

    if ( al.present("-raw_stats") || al.present("-r"))
        raw_stats = true;

    if ( al.present("-brief") || al.present("-b") )
        brief = true;

    // Apply -default_tags first so that any explicit -prev_tag /
    // -prev_prev_tag / -last_tag below overrides it (with a warning).
    if (al.present("-default_tags"))
    {
        prev_tag = SENTENCE_START_MARKER;
        prev_prev_tag = SENTENCE_END_MARKER;
        last_tag = SENTENCE_END_MARKER;
    }

    if (al.present("-prev_tag"))
    {
        if (al.present("-default_tags"))
            cerr << "test_ngram: WARNING : -prev_tag overrides -default_tags"
                 << endl;
        prev_tag = al.val("-prev_tag");
    }

    if (al.present("-prev_prev_tag"))
    {
        if (al.present("-default_tags"))
            cerr << "test_ngram: WARNING : -prev_prev_tag overrides -default_tags" << endl;
        prev_prev_tag = al.val("-prev_prev_tag");
    }

    if (al.present("-last_tag"))
    {
        if (al.present("-default_tags"))
            cerr << "test_ngram: WARNING : -last_tag overrides -default_tags" << endl;
        last_tag = al.val("-last_tag");
    }

    // Tags are all-or-nothing: a partial set would silently change the
    // sentence start/end treatment relative to the one used at build time.
    if ( ( (prev_tag=="") || (prev_prev_tag=="") || (last_tag=="") )
         && ( (prev_tag!="") || (prev_prev_tag!="") || (last_tag!="") ) )
    {
        cerr << "test_ngram: ERROR : if any tags are given, ALL must be given" << endl;
        exit(1);
    }


    // script
    if (al.present("-S"))
    {
        script_file = al.val("-S");

        if(load_StrList(script_file,script) != format_ok)
        {
            cerr << "test_ngram: Could not read script from file "
                 << script_file << endl;
            exit(1);
        }
    }

    if (al.present("-g"))
        in_file = al.val("-g");
    else
    {
        cerr << "test_ngram: Must give a grammar filename using -g" << endl;
        exit(1);
    }

    // plus any files on command line
    // except file "-" unless there is no script
    if(script.head()==NULL)
        script += files;
    else
        for(p=files.head();p!=0;p=p->next())
            if(files(p) != "-")
                script.append(files(p));

    if(script.head() == NULL)
    {
        cerr << "test_ngram: No test files given" << endl;
        exit(1);
    }

    if (wordlist_file != "")
    {
        // load wordlist
        if (load_StrList(wordlist_file,wordlist) != format_ok)
        {
            cerr << "test_ngram: Could not read wordlist from file " << wordlist_file
                 << endl;
            exit(1);
        }

        // load grammar using wordlist
        if (ngrammar.load(in_file,wordlist) != format_ok)
        {
            cerr << "test_ngram: Failed to load grammar" << endl;
            exit(1);
        }
    }
    else
    {
        // Grammar format is expected to carry its own vocabulary here.
        if (ngrammar.load(in_file) != format_ok)
        {
            cerr << "test_ngram: Failed to load grammar" << endl;
            exit(1);
        }
    }

    if (!brief)
    {
        cout << "Ngram Test Results" << endl;
        cout << "==================" << endl;
    }

    // Evaluate every file in the script; a file that fails to process
    // gets a warning and is skipped rather than aborting the whole run.
    for (p = script.head(); p; p = p->next())
    {
        // test each file
        if (test_stats(ngrammar,
                       script(p),
                       raw_entropy,count,
                       entropy,perplexity,
                       input_format,
                       prev_tag,
                       prev_prev_tag))
        {
            total_raw_H += raw_entropy;
            total_count += count;

            if(per_file_stats)
            {
                if (brief)
                    cout << basename(script(p)) << " \t";
                else
                    cout << script(p) << endl;

                if(raw_stats)
                {
                    if (brief)
                        cout << raw_entropy << " " << count << " ";
                    else
                    {
                        cout << " raw entropy " << raw_entropy << endl;
                        cout << " count " << count << endl;
                    }
                }

                if (brief)
                    cout << entropy << " " << perplexity << endl;
                else
                {
                    cout << " entropy " << entropy << endl;
                    cout << " perplexity " << perplexity << endl << endl;
                }
            }
        }
        else
        {
            cerr << "test_ngram: WARNING : file '" << script(p)
                 << "' could not be processed" << endl;
        }

    }
    // Overall summary: entropy H = total_raw_H / total_count and
    // perplexity B = 2^H, matching the formulae in the manual text above.
    if (total_count > 0)
    {
        if (!brief)
            cout << "Summary for grammar " << in_file << endl;
        else
            if (per_file_stats)
                cout << "summary \t";

        if(raw_stats)
        {
            if (brief)
                cout << total_raw_H << " " << total_count << " ";
            else
            {
                cout << " raw entropy " << total_raw_H << endl;
                cout << " count " << total_count << endl;
            }
        }
        if (brief)
        {
            cout << total_raw_H / total_count;
            cout << " " << pow(2.0,total_raw_H / total_count);
            cout << endl;
        }
        else
        {
            cout << " entropy " << total_raw_H / total_count << endl;
            cout << " perplexity " << pow(2.0,total_raw_H / total_count);
            cout << endl;
        }
    }
    else
    {
        cerr << "test_ngram: No data processed" << endl;
    }

    // everything went okay
    return 0;
}
379 
380 
381 void override_lib_ops(EST_Option &a_list, EST_Option &al)
382 {
383  (void)a_list;
384  (void)al;
385 }
386 
387 /** @name Hints
388 
389 <title>I got a perplexity of Infinity - what went wrong ?</title>
390 
391 A perplexity of Infinity means that at least one of the ngrams in your
392 test data had a probability of zero. Possible reasons for this include:
393 
394 <itemizedlist>
395 
396 <listitem><para>The training data had no examples of this ngram, and
397 you did not specify a floor for zero frequency ngrams in
398 \Ref{build_ngram} </para></listitem>
399 <listitem><para>You used differing input formats for \Ref{ngram_build}
400 and \Ref{ngram_test}. </para></listitem>
401 <listitem><para>You used differing sentence start/end treatments in
402 \Ref{ngram_build} and \Ref{ngram_test}. </para></listitem>
403 </itemizedlist>
404 
405 */
406 
407  //@{
408  //@}
409 
410 //@}