Edinburgh Speech Tools  2.4-release
 All Classes Functions Variables Typedefs Enumerations Enumerator Friends Pages
EST_PST.cc
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1996 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Alan W Black */
34 /* Date : July 1996 */
35 /*-----------------------------------------------------------------------*/
36 /* */
37 /* A general class for PredictionSuffixTrees */
38 /* */
39 /*=======================================================================*/
40 
41 #include <cstdlib>
42 #include <iostream>
43 #include <fstream>
44 #include <cstring>
45 #include "EST_String.h"
46 #include "EST_types.h"
47 #include "EST_Token.h"
48 #include "EST_PST.h"
49 #include "EST_multistats.h"
50 
51 VAL_REGISTER_CLASS(pstnode,EST_PredictionSuffixTree_tree_node)
52 
53 // Out of vocabulary identifier
54 const EST_String PredictionSuffixTree_oov("_OOV_");
55 const EST_DiscreteProbDistribution PSTnullProbDistribution;
56 
57 void slide(EST_IVector &i, const int l);
58 void slide(EST_StrVector &i, const int l);
59 
61 {
62 }
63 
64 void
65 EST_PredictionSuffixTree_tree_node::print_freqs(ostream &os)
66 {
67  // Print out path and frequency for each leaf
68 
69  if (p_level == 0)
70  {
71  // Base -- print from pd
72  EST_String s;
73  double freq;
74  EST_Litem *i;
75  for (i = pd.item_start();
76  !pd.item_end(i);
77  i=pd.item_next(i))
78  {
79  pd.item_freq(i,s,freq);
80  os << get_path() << " " << s << " : " << freq << endl;
81  }
82  }
83  else
84  {
86  for (t.begin(nodes); t; t++)
87  pstnode(t->v)->print_freqs(os);
88  }
89 }
90 
91 void
92 EST_PredictionSuffixTree_tree_node::print_probs(ostream &os)
93 {
94  // Print out path and probability distributions for node above leafs
95 
96  if (p_level == 0)
97  {
98  // Base -- print from pd
99  EST_String s;
100  double prob;
101  os << get_path() << " :";
102  for (EST_Litem *i = pd.item_start(); !pd.item_end(i) ; i=pd.item_next(i))
103  {
104  pd.item_prob(i,s,prob);
105  os << " " << s << " " << prob;
106  }
107  os << endl;
108  }
109  else
110  {
112  for (t.begin(nodes); t; t++)
113  pstnode(t->v)->print_probs(os);
114  }
115 }
116 
117 const EST_String &
118 EST_PredictionSuffixTree_tree_node::most_probable(double *prob) const
119 {
120  // Find most probable symbol in this node
121  return pd.most_probable(prob);
122 }
123 
124 EST_PredictionSuffixTree::EST_PredictionSuffixTree(void)
125 {
126  p_order = 0;
127  num_states = 0;
128  nodes = 0;
129  pd = 0;
130 }
131 
132 void
133 EST_PredictionSuffixTree::init(const int order)
134 {
135  p_order = order;
136  num_states = 0;
138  nodes->set_level(order-1);
140 }
141 
142 EST_PredictionSuffixTree::~EST_PredictionSuffixTree()
143 {
144  delete nodes;
145  delete pd;
146 }
147 
148 void
149 EST_PredictionSuffixTree::clear(void)
150 {
151  delete nodes;
152  delete pd;
153  pd = 0;
154 }
155 
157 EST_PredictionSuffixTree::p_prob_dist(EST_PredictionSuffixTree_tree_node *node,
158  const EST_StrVector &words,
159  const int index) const
160 {
161  // Return the probability distribution for this EST_PredictionSuffixTree context
162 
163  if (words.n() == index+1){
164  return node->prob_dist();
165  }else
166  {
169  next = pstnode(node->nodes.f(words(index),est_val(n)));
170  if (next == 0)
171  {
172  //cerr << "EST_PredictionSuffixTree: "
173  //<< "no EST_PredictionSuffixTree probabilities for context \n";
174  return PSTnullProbDistribution;
175  }
176  return p_prob_dist(next,words,index+1);
177  }
178 }
179 
180 
181 void
182 EST_PredictionSuffixTree::accumulate(const EST_StrVector &words,
183  const double count,
184  const int index)
185 {
186 
187 /*
188  cerr << "accumulate ";
189  for (int i=0; i < p_order-1; i++)
190  cerr << words(i+index) << " ";
191  cerr << count << endl;
192  */
193 
194  if (words.n()+index < p_order)
195  cerr << "EST_PredictionSuffixTree: accumlating window is wtoo small"
196  << endl;
197  else
198  {
199  pd->cumulate(words(p_order-1+index),count); // a little extra book-keeping
200  p_accumulate(nodes,words,count,index);
201  }
202 }
203 
204 void
205 EST_PredictionSuffixTree::p_accumulate(EST_PredictionSuffixTree_tree_node *node,
206  const EST_StrVector &words,
207  double count,
208  const int index)
209 {
210  /* Expand tree with new gram */
211  if (words.n() == index+1)
212  {
213  if (node->prob_dist().samples() == 0) // A new terminal node
214  node->set_state(num_states++);
215  node->cumulate(words(index),count);
216  }
217  else
218  {
221  next = pstnode(node->nodes.f(words(index),est_val(n)));
222  if (next == 0)
223  {
225  if (node->get_path() == "")
226  next->set_path(words(index));
227  else
228  {
229  next->set_path(node->get_path()+" "+ words(index));
230  }
231  next->set_level(node->get_level()-1);
232  node->nodes.set_val(words(index),est_val(next));
233  }
234  p_accumulate(next,words,count,index+1);
235  }
236 }
237 
238 double
239 EST_PredictionSuffixTree::rev_prob(const EST_StrVector &words) const
240 {
241  // Reverse probability. What is prob of this context given predictee
242  const EST_DiscreteProbDistribution &pg = prob_dist(words);
243  double d1 = pg.frequency(words(order()-1));
244  double d2 = pd->frequency(words(order()-1));
245  double d3 = d1/d2;
246  return d3;
247 }
248 
249 double
250 EST_PredictionSuffixTree::rev_prob(const EST_StrVector &words,
251  const EST_DiscreteProbDistribution &pg) const
252 {
253  // Reverse probability. What is prob of this context given predictee
254  return (double)pg.frequency(words(order()-1)) /
255  pd->frequency(words(order()-1));
256 }
257 
258 const
259 EST_String &EST_PredictionSuffixTree::predict(const EST_StrVector &words) const
260 {
261  double p;
262  int state;
263  return ppredict(nodes,words,&p,&state);
264 }
265 
266 const
267 EST_String &EST_PredictionSuffixTree::predict(const EST_StrVector &words,double *p) const
268 {
269  int state;
270  return ppredict(nodes,words,p,&state);
271 }
272 
273 const
274 EST_String &EST_PredictionSuffixTree::predict(const EST_StrVector &words,double *p, int *state) const
275 {
276  return ppredict(nodes,words,p,state);
277 }
278 
279 const
280 EST_String &EST_PredictionSuffixTree::ppredict(EST_PredictionSuffixTree_tree_node *node,
281  const EST_StrVector &words,
282  double *p,int *state,
283  const int index) const
284 {
285  /* Given the context whats the most probably symbol (or OOV) */
286  if (words.n() == index+1)
287  {
288  *state = node->get_state();
289  return node->most_probable(p);
290  }
291  else
292  {
295  next = pstnode(node->nodes.f(words(index),est_val(n)));
296  if (next == 0)
297  {
298  *p = 0.0;
299  *state = 0; // No not really acceptable
300  return PredictionSuffixTree_oov; /* utterly improbable -- i.e. not in EST_PredictionSuffixTree */
301  }
302  else
303  return ppredict(next,words,p,state,index+1);
304  }
305 }
306 
307 void
308 EST_PredictionSuffixTree::print_freqs(ostream &os)
309 {
310  // Print a list of all PredictionSuffixTrees with their frequencies
311 
312  os << "EST_PredictionSuffixTree order=" << p_order << endl;
313  nodes->print_freqs(os);
314 
315 }
316 
317 void
318 EST_PredictionSuffixTree::print_probs(ostream &os)
319 {
320  // Print a list of all grams(-last) with their probability distributions
321 
322  os << "EST_PredictionSuffixTree " << p_order << endl;
323  nodes->print_probs(os);
324 
325 }
326 
327 int
328 EST_PredictionSuffixTree::save(const EST_String filename, const EST_PredictionSuffixTree::EST_filetype type)
329 {
330  // Save an EST_PredictionSuffixTree to file -- actual the grams and frequencies
331  (void)type;
332 
333  if (filename == "-")
334  print_freqs(cout);
335  else
336  {
337  ofstream os(filename);
338  print_freqs(os);
339  }
340  return 0;
341 }
342 
343 int
344 EST_PredictionSuffixTree::load(const EST_String filename)
345 {
346  // Load an EST_PredictionSuffixTree for given file
347  EST_TokenStream ts;
348  int i,order,freq;
349 
350  clear();
351  if (ts.open(filename) != 0)
352  {
353  cerr << "EST_PredictionSuffixTree: failed to open \"" << filename << "\" for reading\n";
354  return -1;
355  }
356  ts.set_SingleCharSymbols(":");
357 
358  if (ts.get() != "EST_PredictionSuffixTree") // magic number
359  {
360  cerr << "EST_PredictionSuffixTree: file \"" << filename << "\" not an EST_PredictionSuffixTree\n";
361  return -1;
362  }
363 
364  order = atoi(ts.get().string());
365  if ((order < 1) || (order > 10))
366  {
367  cerr << "EST_PredictionSuffixTree: file \"" << filename << "\" has bad order\n";
368  return -1;
369  }
370  init(order);
371  //EST_String const* *window = new EST_String const*[order+1];
372  EST_StrVector window(order);
373  for (i=0; i<p_order; i++)
374  window[i] = "";
375 
376  while (!ts.eof())
377  {
378  slide(window,-1);
379  window[p_order-1] = ts.get().string();
380  if (ts.get() != ":")
381  {
382  cerr << "EST_PredictionSuffixTree: file \"" << filename << "\" missed parsed line ";
383  cerr << ts.linenum() << " near EST_PredictionSuffixTree\n";
384  for (i=0; i < order; i++)
385  cout << " " << window(i);
386  cout << endl;
387  }
388 
389  freq = atoi(ts.get().string());
390  accumulate(window,freq);
391  }
392 
393  return 0;
394 }
395 
396 void
397 EST_PredictionSuffixTree::build(const EST_String filename,
398  const EST_String prev,
399  const EST_String prev_prev,
400  const EST_String last)
401 {
402  EST_TokenStream ts;
403  int i;
404 
405  if (filename == "-")
406  ts.open(stdin, FALSE);
407  else if (ts.open(filename) == -1)
408  return;
409 
410  //EST_String const* *window = new EST_String const*[p_order+1];
411  EST_StrVector window(p_order);
412  for (i=0; i<p_order-1; i++)
413  window[i] = prev_prev;
414  window[p_order-1] = prev;
415 
416  accumulate(window,1);
417 
418  //window[p_order] = 0; // end of array marker
419 
420  while (!ts.eof())
421  {
422  slide(window,-1);
423  window[p_order-1] = ts.get().string();
424  accumulate(window,1);
425  }
426 
427  // and finish off
428 
429  slide(window,-1);
430  window[p_order-1] = last;
431  accumulate(window,1);
432 
433 }
434 
435 void
436 EST_PredictionSuffixTree::build(const EST_StrList &input)
437 {
438 
439  // knackered - fix
440 
441 
442  // adapted from build(..) above
443  // but could be made more efficient
444 
445  int i;
446  //EST_String const* *window = new EST_String const *[p_order+1];
447  EST_StrVector window(p_order);
448  for (i=0; i<p_order; i++)
449  window[i] = "";
450 
451  EST_Litem *i_ptr;
452  for(i_ptr=input.head();i_ptr!=NULL;i_ptr=i_ptr->next())
453  {
454  slide(window,-1);
455  window[p_order-1] = input(i_ptr);
456  accumulate(window,1);
457  }
458 }
459 
460 
461 void
462 EST_PredictionSuffixTree::test(const EST_String filename)
463 {
464  // Run the current EST_PredictionSuffixTree over the given file name and generate
465  // statistics of its prediction power for the contents of that
466  // file. Use the confusion function
467  EST_StrStr_KVL pairs;
468  EST_StrList lex;
469  EST_TokenStream ts;
470  int i;
471 
472  if (filename == "-")
473  ts.open(stdin, FALSE);
474  else if (ts.open(filename) == -1)
475  return;
476 
477  /* The top level nodes list will always contain all the tokens */
478  /* Build the lexicon for confusion matrix */
480  for (p.begin(nodes->nodes); p; p++)
481  lex.append(p->k);
482  lex.append("_OOV_");
483 
484  EST_StrVector window(p_order);
485  //EST_String const* *window = new EST_String const*[p_order+1];
486  for (i=0; i<p_order; i++)
487  window[i] = "";
488  double e=0;
489  int num_tsamples = 0;
490 
491  while (!ts.eof())
492  {
493  slide(window,-1);
494  window[p_order-1] = ts.get().string();
495  const EST_DiscreteProbDistribution &pdist = prob_dist(window);
496  e += pdist.entropy();
497  num_tsamples++;
498  // cumulate real and predicted
499  pairs.add_item(window(p_order-1),predict(window),1);
500  }
501 
502  const EST_FMatrix &m = confusion(pairs,lex);
503  print_confusion(m,pairs,lex);
504  cout << "Mean entropy (?) is " << e/num_tsamples << endl;
505 
506 
507 }
508