Edinburgh Speech Tools  2.4-release
 All Classes Functions Variables Typedefs Enumerations Enumerator Friends Pages
EST_Ngrammar.h
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1996 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Simon King & Alan W Black */
34 /* Date : February 1997 */
35 /*-----------------------------------------------------------------------*/
36 /* */
37 /* A general class for ngrams (bi-gram, tri-gram etc) */
38 /* */
39 /*=======================================================================*/
40 #ifndef __EST_NGRAMMAR_H__
41 #define __EST_NGRAMMAR_H__
42 
43 #include <cstdarg>
44 #include <cstdlib>
45 
46 using namespace std;
47 
48 #include "EST_String.h"
49 #include "EST_Val.h"
50 #include "EST_rw_status.h"
51 #include "EST_types.h"
52 #include "EST_FMatrix.h"
53 #include "EST_TList.h"
54 #include "EST_StringTrie.h"
55 #include "EST_simplestats.h"
56 #include "EST_PST.h"
57 #include "EST_string_aux.h"
58 #include "EST_math.h"
59 
60 // HTK style
61 #define SENTENCE_START_MARKER "!ENTER"
62 #define SENTENCE_END_MARKER "!EXIT"
63 #define OOV_MARKER "!OOV"
64 
65 #define EST_NGRAMBIN_MAGIC 1315402337
66 
67 // for compressed save/load
68 #define GZIP_FILENAME_EXTENSION "gz"
69 #define COMPRESS_FILENAME_EXTENSION "Z"
70 
71 // Ultimate floor
72 #define TINY_FREQ 1.0e-10
73 
74 // ngram state - represents the N-1 word history and contains
75 // the pdf of the next word
76 
78 
79 private:
80 
81 protected:
83  int p_id; // a 'name'
84 
85 public:
87 
88  p_pdf()
89 
90  {
91  init();
92  };
93  EST_NgrammarState(int id,EST_Discrete *d){clear();init(id,d);};
95  {clear();init(id,pdf);};
97  EST_NgrammarState(const EST_NgrammarState *const s);
99 
100  EST_IVector path; // how we got here
101 
102  // initialise
103  void clear();
104  void init();
105  void init(int id, EST_Discrete *d);
106  void init(int id, const EST_DiscreteProbDistribution &pdf);
107 
108  // build
109  void cumulate(const int index, const double count=1)
110  {p_pdf.cumulate(index,count);};
111  void cumulate(const EST_String &word, const double count=1)
112  {p_pdf.cumulate(word,count);};
113 
114  // access
115  int id() const {return p_id; };
116  const EST_DiscreteProbDistribution &pdf_const() const {return p_pdf; };
117  EST_DiscreteProbDistribution &pdf() {return p_pdf; };
118  double probability(const EST_String &w) const
119  {return p_pdf.probability(w);}
120  double probability(int w) const {return p_pdf.probability(w);}
121  double frequency(const EST_String &w) const
122  {return p_pdf.frequency(w);}
123  double frequency(int w) const {return p_pdf.frequency(w);}
124  const EST_String &most_probable(double *prob = NULL) const
125  {return p_pdf.most_probable(prob);}
126 
127 friend ostream& operator<<(ostream& s, const EST_NgrammarState &a);
128 
129 };
130 
132 
133 private:
134 
135 protected:
136  int p_level; // = 0 for root node
137  double backoff_weight;
139  EST_StringTrie children;
140 
141  EST_BackoffNgrammarState* add_child(const EST_Discrete *d,
142  const EST_StrVector &words);
143  EST_BackoffNgrammarState* add_child(const EST_Discrete *d,
144  const EST_IVector &words);
145 public:
147  { init(); };
148  EST_BackoffNgrammarState(const EST_Discrete *d,int level)
149  {clear();init(d,level);};
151  {clear();init(pdf,level);};
155 
156  // initialise
157  void clear();
158  void init();
159  void init(const EST_Discrete *d, int level);
160  void init(const EST_DiscreteProbDistribution &pdf, int level);
161 
162  // build
163  bool accumulate(const EST_StrVector &words,
164  const double count=1);
165  bool accumulate(const EST_IVector &words,
166  const double count=1);
167  // access
168  const EST_DiscreteProbDistribution &pdf_const() const {return p_pdf; };
169  EST_DiscreteProbDistribution &pdf() {return p_pdf; };
170  double probability(const EST_String &w) const
171  {return p_pdf.probability(w);}
172  double frequency(const EST_String &w) const
173  {return p_pdf.frequency(w);}
174  const EST_String &most_probable(double *prob = NULL) const
175  {return p_pdf.most_probable(prob);}
176 
177  const int level() const {return p_level;}
178 
179  EST_BackoffNgrammarState* get_child(const EST_String &word) const
180  {
181  return (EST_BackoffNgrammarState*)children.lookup(word);
182  }
183  EST_BackoffNgrammarState* get_child(const int word) const
184  {
185  return (EST_BackoffNgrammarState*)children.lookup(p_pdf.get_discrete()->name(word));
186  }
187 
188  void remove_child(EST_BackoffNgrammarState *child,
189  const EST_String &name);
190 
191  // recursive delete of contents and children
192  void zap();
193 
194  const EST_BackoffNgrammarState *const get_state(const EST_StrVector &words) const;
195 
196  bool ngram_exists(const EST_StrVector &words,
197  const double threshold) const;
198  const double get_backoff_weight() const {return backoff_weight; }
199  const double get_backoff_weight(const EST_StrVector &words) const;
200  bool set_backoff_weight(const EST_StrVector &words, const double w);
201  void frequency_of_frequencies(EST_DVector &ff);
202 
203  void print_freqs(ostream &os,const int order,EST_String followers="");
204 
205 friend ostream& operator<<(ostream& s, const EST_BackoffNgrammarState &a);
206 
207 };
208 
210 
211 public:
212 
213  // 3 representations : sparse, dense and backed off. User specifies which.
214  enum representation_t {sparse, dense, backoff};
215 
216  // now only keep frequencies (or log frequencies)
217  // probabilities (or log probabilities) can be done
218  // on the fly quickly enough
219  enum entry_t {frequencies, log_frequencies};
220 
221 protected:
222 
223  // each instance of an EST_Ngrammar is a grammar of fixed order
224  // e.g. a bigram (order = 2)
225  int p_order;
226  int p_num_samples;
227 
228  double p_number_of_sentences; // which were used to build this grammar
229 
230 
231  EST_String p_sentence_start_marker;
232  EST_String p_sentence_end_marker;
233 
234  // only one representation in use at a time
235  representation_t p_representation;
236  entry_t p_entry_type;
237 
238  // sparse representation is a tree structure
239  // holding only those N-grams which were seen
240  EST_PredictionSuffixTree sparse_representation;
241  bool init_sparse_representation();
242 
243  // dense representation is just an array of all states
244  bool init_dense_representation();
245 
246  // backoff representation is also a tree structure
247  // but the root state pdf is the most recent word in the
248  // ngram and going down the tree is going back in time....
249  // here is the root node :
250  EST_BackoffNgrammarState *backoff_representation;
251 
252  double backoff_threshold;
253 
254  // need a non-zero unigram floor to enable backing off
255  double backoff_unigram_floor_freq;
256 
257  // instead of simple discounting, we have a (possibly) different
258  // discount per order and per frequency
259  // e.g. backoff_discount[2](4) contains the discount to be
260  // applied to a trigram frequency of 4
261  // backoff_discount[0] is unused (we don't discount unigrams)
262  EST_DVector *backoff_discount;
263  const double get_backoff_discount(const int order, const double freq) const;
264 
265  bool init_backoff_representation();
266  void prune_backoff_representation(EST_BackoffNgrammarState *start_state=NULL); // remove any zero frequency branches
267  void backoff_restore_unigram_states();
268  int p_num_states; // == p_vocab_size ^ (p_ord-1) if fully dense
269  EST_NgrammarState *p_states; // state id is index into this array
270  int find_dense_state_index(const EST_IVector &words, int index=0) const;
271 
272  // and the reverse
273  const EST_StrVector &make_ngram_from_index(const int i) const;
274 
275  // vocabulary
276  EST_Discrete *vocab;
277  EST_Discrete *pred_vocab; // may be different from state vocab
278  bool init_vocab(const EST_StrList &wordlist);
279  bool init_vocab(const EST_StrList &word_list,
280  const EST_StrList &pred_list);
281 
282  // make sure vocab matches a given wordlist
283  bool check_vocab(const EST_StrList &wordlist);
284 
285  EST_DiscreteProbDistribution vocab_pdf; // overall pdf
286 
287  const EST_String &lastword(const EST_StrVector &words) const
288  { return words(p_order-1); }
289  const int lastword(const EST_IVector &words) const
290  { return words(p_order-1); }
291  // are we allowing out-of-vocabulary words, or is the vocabulary closed?
292  bool allow_oov;
293 
294  bool sparse_to_dense();
295  bool dense_to_sparse();
296 
297  // these aren't sorted yet ...
298  void take_logs();
299  void take_exps();
300  void freqs_to_probs(); // just calls normalise
301 
302  bool build_sparse(const EST_String &filename,
303  const EST_String &prev,
304  const EST_String &prev_prev,
305  const EST_String &last);
306  // for dense and backoff
307  bool build_ngram(const EST_String &filename,
308  const EST_String &prev,
309  const EST_String &prev_prev,
310  const EST_String &last,
311  const EST_String &input_format);
312 
313  // go through all matching ngrams ( *(ngram[i])="" matches anything )
314  void iterate(EST_StrVector &words,
315  void (*function)(EST_Ngrammar *n,
316  EST_StrVector &words,
317  void *params),
318  void *params);
319 
320  // same, but with a constant Ngrammar
321  void const_iterate(EST_StrVector &words,
322  void (*function)(const EST_Ngrammar *const n,
323  EST_StrVector &words,
324  void *params),
325  void *params) const;
326 
327  bool p_init(int o, representation_t r);
328 
329  // new filename returned of we had to copy stdin to a
330  // temporary file - must delete it later !
331  bool oov_preprocess(const EST_String &filename,
332  EST_String &new_filename,
333  const EST_String &what);
334 
335 
336  const EST_NgrammarState &find_state_const(const EST_StrVector &words)const;
337  EST_NgrammarState &find_state(const EST_StrVector &words);
338  const EST_NgrammarState &find_state_const(const EST_IVector &words) const;
339  EST_NgrammarState &find_state(const EST_IVector &words);
340 
341  // special versions for backoff grammars
342  const EST_DiscreteProbDistribution &backoff_prob_dist(const EST_StrVector &words) const;
343  const double backoff_reverse_probability_sub(const EST_StrVector &words,
344  const EST_BackoffNgrammarState *root) const;
345  const double backoff_probability(const EST_StrVector &words,
346  const bool trace=false) const;
347  const double backoff_reverse_probability(const EST_StrVector &words) const;
348  const EST_String & backoff_most_probable(const EST_StrVector &words,
349  double *prob = NULL) const;
350 
351  // backoff representation isn't a nice array of states
352  // so use this to visit every node in the tree
353  // and apply the function to that node
354  void backoff_traverse(EST_BackoffNgrammarState *start_state,
355  void (*function)(EST_BackoffNgrammarState *s,
356  void *params),
357  void *params);
358 
359  // visit every node at a given level
360  void backoff_traverse(EST_BackoffNgrammarState *start_state,
361  void (*function)(EST_BackoffNgrammarState *s,
362  void *params),
363  void *params, const int level);
364 public:
365 
366  EST_Ngrammar() {default_values();}
367 
368  EST_Ngrammar(int o, representation_t r,
369  const EST_StrList &wordlist)
370  {
371  default_values(); init(o,r,wordlist);
372  }
373 
374  // When state trans vocab differs from prediction vocab
375  EST_Ngrammar(int o, representation_t r,
376  const EST_StrList &wordlist,
377  const EST_StrList &predlist)
378  {
379  default_values(); init(o,r,wordlist,predlist);
380  }
381 
382  EST_Ngrammar(int o, representation_t r, EST_Discrete &v)
383  {
384  default_values(); init(o,r,v);
385  }
386  ~EST_Ngrammar();
387 
388  void default_values();
389  void clear();
390  bool init(int o, representation_t r,
391  const EST_StrList &wordlist);
392  bool init(int o, representation_t r,
393  const EST_StrList &wordlist,
394  const EST_StrList &predlist);
395  bool init(int o, representation_t r, EST_Discrete &v);
396  bool init(int o, representation_t r,
397  EST_Discrete &v,EST_Discrete &pv);
398 
399  // access
400  int num_states(void) const { return p_num_states;}
401  double samples(void) const { return p_num_samples;}
402  int order() const { return p_order; }
403  int get_vocab_length() const { return vocab?vocab->length():0; }
404  EST_String get_vocab_word(int i) const;
405  int get_vocab_word(const EST_String &s) const;
406  int get_pred_vocab_length() const { return pred_vocab->length(); }
407  EST_String get_pred_vocab_word(int i) const { return pred_vocab->name(i); }
408  int get_pred_vocab_word(const EST_String &s) const
409  { return pred_vocab->name(s); }
410  int closed_vocab() const {return !allow_oov; }
411  entry_t entry_type() const {return p_entry_type;}
412  representation_t representation() const
413  { return p_representation;}
414 
415  // build
416  bool build(const EST_StrList &filenames,
417  const EST_String &prev = SENTENCE_START_MARKER,
418  const EST_String &prev_prev = SENTENCE_END_MARKER,
419  const EST_String &last = SENTENCE_END_MARKER,
420  const EST_String &input_format = "",
421  const EST_String &oov_mode = "",
422  const int mincount=1,
423  const int maxcount=10);
424 
425  // Accumulate ngrams
426  void accumulate(const EST_StrVector &words,
427  const double count=1);
428  //const int index=0);
429  void accumulate(const EST_IVector &words,
430  const double count=1);
431  //const int index=0);
432 
433  // hack - fix enter/exit probs s.t. P(...,!ENTER)=P(!EXIT,...)=0, for all x
434  void make_htk_compatible();
435 
436  // I/O functions
437  EST_read_status load(const EST_String &filename);
438  EST_read_status load(const EST_String &filename, const EST_StrList &wordlist);
439  EST_write_status save(const EST_String &filename,
440  const EST_String type="cstr_ascii",
441  const bool trace=false,
442  double floor=0.0);
443 
444  int wordlist_index(const EST_String &word, const bool report=true) const;
445  const EST_String &wordlist_index(int i) const;
446  int predlist_index(const EST_String &word) const;
447  const EST_String &predlist_index(int i) const;
448 
449  // set
450  bool set_entry_type(entry_t new_type);
451  bool set_representation(representation_t new_representation);
452 
453  // probability distributions
454  // -------------------------
455  // flag 'force' forces computation of probs on-the-fly if necessary
456  double probability(const EST_StrVector &words, bool force=false,
457  const bool trace=false) const;
458  double frequency(const EST_StrVector &words, bool force=false,
459  const bool trace=false) const;
460 
461  const EST_String &predict(const EST_StrVector &words,
462  double *prob,int *state) const;
463  const EST_String &predict(const EST_StrVector &words) const
464  {double p; int state; return predict(words,&p,&state); }
465  const EST_String &predict(const EST_StrVector &words,double *prob) const
466  {int state; return predict(words,prob,&state); }
467 
468  const EST_String &predict(const EST_IVector &words,double *prob,int *state) const;
469  const EST_String &predict(const EST_IVector &words) const
470  {double p; int state; return predict(words,&p,&state); }
471  const EST_String &predict(const EST_IVector &words,double *prob) const
472  {int state; return predict(words,prob,&state); }
473 
474  int find_state_id(const EST_StrVector &words) const;
475  int find_state_id(const EST_IVector &words) const;
476  int find_next_state_id(int state, int word) const;
477  // fast versions for common N
478  //const double probability(const EST_String w1);
479  //const double probability(const EST_String w1,const EST_String w2);
480  //const double probability(const EST_String w1,const EST_String w2,
481  //const EST_String w2);
482 
483  // reverse - probability of words[0..order-2] given word[order-1]
484  double reverse_probability(const EST_StrVector &words,
485  bool force=false) const;
486  double reverse_probability(const EST_IVector &words,
487  bool force=false) const;
488 
489  // predict, where words has 'order' elements and the last one is "" or NULL
490  const EST_DiscreteProbDistribution &prob_dist(const EST_StrVector &words) const;
491  const EST_DiscreteProbDistribution &prob_dist(const EST_IVector &words) const;
492  const EST_DiscreteProbDistribution &prob_dist(int state) const;
493 
494 // bool stats(const EST_String filename,
495 // double &raw_entropy, double &count,
496 // double &entropy, double &perplexity,
497 // const EST_String &prev = SENTENCE_START_MARKER,
498 // const EST_String &prev_prev = SENTENCE_END_MARKER,
499 // const EST_String &last = SENTENCE_END_MARKER,
500 // const EST_String &input_format = "") const;
501 
502  void fill_window_start(EST_IVector &window,
503  const EST_String &prev,
504  const EST_String &prev_prev) const;
505 
506  void fill_window_start(EST_StrVector &window,
507  const EST_String &prev,
508  const EST_String &prev_prev) const;
509 
510  // why anybody would want to do this ....
511  //EST_Ngrammar &operator =(const EST_Ngrammar &a);
512 
513  bool ngram_exists(const EST_StrVector &words) const;
514  bool ngram_exists(const EST_StrVector &words, const double threshold) const;
515  const double get_backoff_weight(const EST_StrVector &words) const;
516  bool set_backoff_weight(const EST_StrVector &words, const double w);
517 
518  void print_freqs(ostream &os,double floor=0.0);
519 
520  // i/o functions
521  // -------------
522  friend ostream& operator<<(ostream& s, EST_Ngrammar &n);
523  friend EST_read_status load_ngram_htk_ascii(const EST_String filename,
524  EST_Ngrammar &n);
525  friend EST_read_status load_ngram_htk_binary(const EST_String filename,
526  EST_Ngrammar &n);
527  friend EST_read_status load_ngram_arpa(const EST_String filename,
528  EST_Ngrammar &n,
529  const EST_StrList &vocab);
530  friend EST_read_status load_ngram_cstr_ascii(const EST_String filename,
531  EST_Ngrammar &n);
532  friend EST_read_status load_ngram_cstr_bin(const EST_String filename,
533  EST_Ngrammar &n);
534 
535  friend EST_write_status save_ngram_htk_ascii_sub(const EST_String &word,
536  ostream *ost,
537  EST_Ngrammar &n,
538  double floor);
539  friend EST_write_status save_ngram_htk_ascii(const EST_String filename,
540  EST_Ngrammar &n,
541  double floor);
542 
543  //friend EST_write_status save_ngram_htk_binary(const EST_String filename,
544  // EST_Ngrammar &n);
545  friend EST_write_status save_ngram_cstr_ascii(const EST_String filename,
546  EST_Ngrammar &n,
547  const bool trace,
548  double floor);
549  friend EST_write_status save_ngram_cstr_bin(const EST_String filename,
550  EST_Ngrammar &n,
551  const bool trace,
552  double floor);
553  friend EST_write_status save_ngram_arpa(const EST_String filename,
554  EST_Ngrammar &n);
555  friend EST_write_status save_ngram_arpa_sub(ostream *ost,
556  EST_Ngrammar &n,
557  const EST_StrVector &words);
558  friend EST_write_status save_ngram_wfst(const EST_String filename,
559  EST_Ngrammar &n);
560 
561  // Auxiliary functions
562 
563  // smoothing
564 friend void frequency_of_frequencies(EST_DVector &ff, EST_Ngrammar &n,int this_order);
565 friend void map_frequencies(EST_Ngrammar &n, const EST_DVector &map, const int this_order);
566 friend bool Good_Turing_smooth(EST_Ngrammar &n, int maxcount, int mincount);
567 friend void Good_Turing_discount(EST_Ngrammar &ngrammar, const int maxcount,
568  const double default_discount);
569 
570 friend void fs_build_backoff_ngrams(EST_Ngrammar *backoff_ngrams,
571  EST_Ngrammar &ngram);
572 friend int fs_backoff_smooth(EST_Ngrammar *backoff_ngrams,
573  EST_Ngrammar &ngram, int smooth_thresh);
574 
575  // frequencies below mincount get backed off
576  // frequencies above maxcount are not smoothed(discounted)
577  bool compute_backoff_weights(const int mincount=1,
578  const int maxcount=10);
579 
580 
581  bool merge(EST_Ngrammar &n,float weight);
582 
583 friend class EST_BackoffNgrammar;
584 
585 };
586 
587 void Ngram_freqsmooth(EST_Ngrammar &ngram,
588  int smooth_thresh1,
589  int smooth_thresh2);
590 
591 // utils
592 void slide(EST_IVector &i, const int l);
593 void slide(EST_StrVector &i, const int l);
594 
595 bool test_stats(EST_Ngrammar &ngram,
596  const EST_String &filename,
597  double &raw_entropy,
598  double &count,
599  double &entropy,
600  double &perplexity,
601  const EST_String &input_format,
602  const EST_String &prev = SENTENCE_START_MARKER,
603  const EST_String &prev_prev = SENTENCE_END_MARKER,
604  const EST_String &last = SENTENCE_END_MARKER);
605 
606 VAL_REGISTER_CLASS_DCLS(ngrammar,EST_Ngrammar)
607 
608 #endif // __EST_NGRAMMAR_H__