Edinburgh Speech Tools  2.4-release
 All Classes Functions Variables Typedefs Enumerations Enumerator Friends Pages
ngrammar_aux.cc
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1996 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Simon King & Alan W Black */
34 /* Date : February 1997 */
35 /*-----------------------------------------------------------------------*/
36 /* */
37 /* Auxiliary functions for EST_Ngram class */
38 /* */
39 /*=======================================================================*/
40 #include <iostream>
41 #include <cstring>
42 #include "EST_String.h"
43 #include "EST_Ngrammar.h"
44 
45 static bool
46 ExponentialFit(EST_DVector &N, double &a, double &b, int first, int last)
47 {
48  // fit the function N(r) = e^a * r^b where a,b are constants
49  // to the points N(first)...N(last) inclusive
50  // i.e. a straight line fit in the (natural) log domain
51  // minimising the mean squared error (in log domain), is :
52 
53  // limitation : r must be an integer (it is the index of v)
54 
55 
56  // default
57  if (last == -1)
58  last = N.n()-1;
59 
60  if (first < 0)
61  {
62  cerr << "ExponentialFit : first must be >= 0" << endl;
63  return false;
64  }
65 
66  if (last >= N.n()-1)
67  {
68  cerr << "ExponentialFit : last must be < N.n()-1 = " << N.n()-1 << endl;
69  }
70 
71  if (first == last)
72  {
73  a=log(N(first));
74  b=0;
75  return true;
76  }
77 
78  double ElnNr=0.0,ElnNrlnr=0.0,
79  Elnr=0.0,Elnr2=0.0,
80  R=0.0;
81 
82  for(int r=first;r<=last;r++)
83  {
84  R += 1;
85  if ( N(r) > 0)
86  {
87  ElnNr += log( N(r) );
88  ElnNrlnr += log( N(r) ) * log( (double)r );
89  }
90  Elnr += log( (double)r );
91  Elnr2 += log( (double)r ) * log( (double)r );
92  }
93 
94  // and the answer is :
95  b = ( (ElnNr*Elnr/R) - ElnNrlnr ) / ( (Elnr*Elnr/R) - Elnr2);
96  a = (ElnNr - (b*Elnr) ) / R;
97 
98  return true;
99 }
100 
101 static bool
102 smooth_ExponentialFit(EST_DVector &N, int first, int last)
103 {
104  double a=0.0,b=0.0;
105 
106  if (!ExponentialFit(N,a,b,first,last))
107  {
108  cerr << "smooth_ExponentialFit : ExponentialFit failed !" << endl;
109  return false;
110  }
111 
112  for(int r=first;r<=last;r++)
113  N[r] = exp(a)* pow((double)r, b);
114 
115  return true;
116 }
117 
118 void make_f_of_f(EST_BackoffNgrammarState *s,void *params)
119 {
120  EST_Litem *k;
121  double freq;
122  EST_String name;
123 
124  EST_DVector *ff = (EST_DVector*)params;
125  int max = ff->n();
126  for (k=s->pdf_const().item_start();
127  !s->pdf_const().item_end(k);
128  k = s->pdf_const().item_next(k))
129  {
130  s->pdf_const().item_freq(k,name,freq);
131  if(freq+0.5 < max)
132  (*ff)[(int)(freq+0.5)] += 1;
133 
134  }
135 
136 
137 }
138 
139 void get_max_f(EST_BackoffNgrammarState *s,void *params)
140 {
141  EST_Litem *k;
142  double freq;
143  EST_String name;
144 
145  double *max = (double*)params;
146  for (k=s->pdf_const().item_start();
147  !s->pdf_const().item_end(k);
148  k = s->pdf_const().item_next(k))
149  {
150  s->pdf_const().item_freq(k,name,freq);
151  if(freq > *max)
152  *max = freq;
153 
154  }
155 
156 
157 }
158 
159 void map_f_of_f(EST_BackoffNgrammarState *s,void *params)
160 {
161  EST_Litem *k;
162  double freq;
163  EST_String name;
164 
165  //cerr << "map_f_of_f : visited " << *s << endl;
166 
167  EST_DVector *map = (EST_DVector*)params;
168  int max = map->n();
169  for (k=s->pdf_const().item_start();
170  !s->pdf_const().item_end(k);
171  k = s->pdf_const().item_next(k))
172  {
173  s->pdf_const().item_freq(k,name,freq);
174  if (freq+0.5 < max)
175  {
176  double nfreq = (*map)((int)(freq+0.5));
177  s->pdf().set_frequency(name,nfreq);
178 
179  }
180 
181  }
182 
183 }
184 
185 void zero_small_f(EST_BackoffNgrammarState *s,void *params)
186 {
187  EST_Litem *k;
188  double freq;
189  EST_String name;
190 
191  double *min = (double*)params;
192  for (k=s->pdf_const().item_start();
193  !s->pdf_const().item_end(k);
194  k = s->pdf_const().item_next(k))
195  {
196  s->pdf_const().item_freq(k,name,freq);
197  if (freq < *min)
198  {
199  //cerr << "zeroing " << freq << " < " << *min << endl;
200  s->pdf().override_frequency(k,0.0);
201  }
202  }
203 }
204 
205 void frequency_of_frequencies(EST_DVector &ff, EST_Ngrammar &n,int this_order)
206 {
207  int i,size;
208  EST_Litem *k;
209  double max=0.0;
210 
211  // if ff has zero size, do complete frequency of frequencies
212  // otherwise just fill in frequencies from 1 to ff.n()-1
213 
214  bool complete = (bool)(ff.n() == 0);
215 
216  switch(n.representation())
217  {
218 
219  case EST_Ngrammar::sparse:
220  case EST_Ngrammar::dense:
221  {
222  size = n.num_states();
223  if (complete)
224  {
225  // find highest frequency in EST_Ngram
226  for(i=0;i<size;i++)
227  if (n.p_states[i].pdf().samples() > max)
228  max = n.p_states[i].pdf().samples();
229 
230  ff.resize((int)(max+1.5));
231  ff.fill(0.0);
232  }
233 
234  // Sum the frequencies
235  for(i=0;i<size;i++)
236  {
237  for (k=n.p_states[i].pdf().item_start();
238  !n.p_states[i].pdf().item_end(k);
239  k = n.p_states[i].pdf().item_next(k))
240  {
241  EST_String name;
242  double freq;
243  n.p_states[i].pdf().item_freq(k,name,freq);
244  ff[(int)(freq+0.5)] += 1;
245  }
246  }
247 
248  if (complete)
249  {
250  // fill in ff(0) - the number of unseen ngrams
251 
252  double total=0;
253  for (i=1;i<ff.n();i++)
254  total += ff(i);
255 
256  ff[0] = pow(float(n.get_vocab_length()),float(n.order())) - total;
257  }
258  }
259  break;
260 
261 
262  case EST_Ngrammar::backoff:
263  {
264  if (complete)
265  {
266  n.backoff_traverse(n.backoff_representation,
267  &get_max_f,(void*)(&max),
268  this_order-1);
269  ff.resize((int)(max+1.5));
270  }
271 
272  // backoff grammar has grammars of orders
273  // order, order-1, ..., 1
274  // and we treat each order independently
275 
276  for (i=0;i<ff.n();i++)
277  ff[i] = 0;
278 
279  n.backoff_traverse(n.backoff_representation,
280  &make_f_of_f,(void*)(&ff),
281  this_order-1);
282 
283  if (complete)
284  {
285  // fill in ff(0) - the number of unseen ngrams
286  double total=0;
287  for (i=1;i<ff.n();i++)
288  total += ff(i);
289  ff[0] = pow(float(n.get_vocab_length()),float(this_order)) - total;
290 
291 
292 
293  }
294  }
295  break;
296 
297  default:
298  cerr << "unknown representation for EST_Ngrammar" << endl;
299  break;
300  }
301 
302 }
303 
304 void map_frequencies(EST_Ngrammar &n, const EST_DVector &map, const int this_order)
305 {
306  int i;
307  EST_Litem *k;
308 
309  switch(n.representation())
310  {
311 
312  case EST_Ngrammar::sparse:
313  case EST_Ngrammar::dense:
314  {
315  int size = n.p_num_states;
316 
317  for(i=0;i<size;i++)
318  {
319  for (k=n.p_states[i].pdf().item_start();
320  !n.p_states[i].pdf().item_end(k);
321  k = n.p_states[i].pdf().item_next(k))
322  {
323  EST_String name;
324  double freq,nfreq;
325  n.p_states[i].pdf().item_freq(k,name,freq);
326  nfreq = map((int)(freq+0.5));
327  n.p_states[i].pdf().set_frequency(name,nfreq);
328 
329 
330  }
331  }
332  }
333  break;
334 
335  case EST_Ngrammar::backoff:
336  {
337 
338  // backoff grammar has grammars of orders
339  // order, order-1, ..., 1
340  // and we treat each order independently
341  n.backoff_traverse(n.backoff_representation,
342  &map_f_of_f,(void*)(&map),
343  this_order-1);
344 
345  }
346  break;
347 
348  default:
349  cerr << "unknown representation for EST_Ngrammar" << endl;
350  break;
351 
352  }
353 }
354 
355 static void
356 adjusted_frequencies_BasicGoodTuring(EST_DVector &M,
357  const EST_DVector &N,
358  int maxcount)
359 {
360  // produce a map of frequencies, given a (smoothed) distribution
361  // only map up to a frequency of maxcount; beyond that
362  // map(r) == r
363 
364  if (maxcount > N.n()-2)
365  {
366  maxcount = N.n()-2;
367  cerr << "adjusted_frequencies_BasicGoodTuring :";
368  cerr << " maxcount is too big, reducing it to " << maxcount << endl;
369  }
370 
371  M.resize(N.n());
372  int r;
373  for(r=0; r<=maxcount;r++)
374  {
375  // don't like this bit, but what can you do ?
376  if( (N(r+1) == 0) || (N(r) == 0) )
377  M[r] = r;
378  else
379  M[r] = (r + 1) * N(r+1) / N(r);
380  }
381  // and do not map higher counts
382  for(;r<N.n();r++)
383  M[r]=r;
384 
385  return;
386 
387 }
388 
389 static void
390 smoothed_frequency_distribution_ExponentialFit(EST_DVector &N,int maxcount)
391 {
392  if (maxcount > N.n()-2)
393  {
394  maxcount = N.n()-2;
395  cerr << "smoothed_frequency_distribution_ExponentialFit :"
396  << " maxcount too big, reducing it to " << maxcount << endl;
397  }
398  // the idea to fit this particular fn was by Steve Isard
399  // we ignore r=0 in doing the fit
400 
401  if (!smooth_ExponentialFit(N,1,maxcount+1))
402  cerr << "smooth_ExponentialFit failed !" << endl;
403 
404  return;
405 }
406 
407 bool
408 Good_Turing_smooth(EST_Ngrammar &ngrammar, int maxcount, int mincount)
409 {
410  // works for any N
411  // since it just remaps frequencies
412 
413  // frequencies above maxcount are not changed
414  // for discounting -- see discount routine !
415 
416 
417  if (ngrammar.entry_type() != EST_Ngrammar::frequencies)
418  {
419  cerr << "EST_Ngram: cannot Good-Turing smooth ngram:" <<
420  " entries are not frequencies" << endl;
421  return false;
422  }
423 
424  switch(ngrammar.representation())
425  {
426 
427  case EST_Ngrammar::sparse:
428  case EST_Ngrammar::dense:
429  {
430  EST_DVector freqs,mapped_freqs;
431  // grammar is of a single order - simple
432  // Find frequency distribution
433  frequency_of_frequencies(freqs,ngrammar,0);
434  // smoothing should be optional - to do
435  smoothed_frequency_distribution_ExponentialFit(freqs,maxcount-1);
436  // Build map of frequencies
437  adjusted_frequencies_BasicGoodTuring(mapped_freqs,freqs,maxcount);
438  // Map all frequencies in grammar to Good Turing Smoothed values
439  map_frequencies(ngrammar,mapped_freqs,0);
440 
441  }
442  break;
443 
444  case EST_Ngrammar::backoff:
445 {
446 
447  cerr << "Smoothing of backed of grammars is not available!" << endl;
448  return false;
449 
450  (void)mincount;
451 
452  /*
453  // need to smooth for each order independently
454  int i,o;
455  double threshold;
456 
457  //for (o=1;o<=ngrammar.order();o++)
458  for(o=ngrammar.order();o<=ngrammar.order();o++)
459  {
460 
461  EST_DVector freqs,mapped_freqs;
462 
463  frequency_of_frequencies(freqs,ngrammar,o);
464 
465  cerr << "FREQ : " << freqs << endl;
466 
467  if(freqs.n() < 2)
468  {
469  // i.e. only unseen things - zero them all
470  threshold = 2; // 1 ?
471  if(o>1)
472  ngrammar.backoff_traverse(ngrammar.backoff_representation,
473  &zero_small_f,
474  (void*)(&threshold),
475  o-1);
476  continue;
477  }
478 
479  int max = maxcount;
480  if(max > freqs.n() - 2)
481  max = freqs.n() - 2;
482 
483  if(max > 2)
484  // need at least 3 points
485  // smoothing should be optional - to do
486  smoothed_frequency_distribution_ExponentialFit(freqs,max);
487 
488  cerr << "SMOOTHED : " << freqs << endl;
489 
490 
491  adjusted_frequencies_BasicGoodTuring(mapped_freqs,freqs,max);
492 
493  // initial map of frequencies modifies total counts
494  // so pdfs are correct
495 
496  map_frequencies(ngrammar,mapped_freqs,o);
497 
498 
499  cerr << "Map for " << o << " : ";
500  for(i=0;i<max;i++)
501  cerr << mapped_freqs(i) << " ";
502  cerr << endl;
503 
504  cerr << "MAP : " << mapped_freqs << endl;
505 
506  // now go and zero the small frequencies
507  // but not for unigram
508 
509 
510  if(mincount < mapped_freqs.n())
511  threshold = mapped_freqs(mincount);
512  else
513  threshold = mapped_freqs(mapped_freqs.n()-1) + 1; // i.e. zero everything
514 
515  //cerr << "min = " << threshold << endl;
516 
517  if(o>1)
518  ngrammar.backoff_traverse(ngrammar.backoff_representation,
519  &zero_small_f,
520  (void*)(&threshold),
521  o-1);
522  */
523 
524 }
525 break;
526 
527 default:
528 cerr << "unknown representation for EST_Ngrammar" << endl;
529 break;
530 
531 }
532 
533 return true;
534 
535 }
536 
537 
538 void
539 Good_Turing_discount(EST_Ngrammar &ngrammar, const int maxcount,
540  const double default_discount)
541 {
542 
543  if(ngrammar.representation() != EST_Ngrammar::backoff)
544  {
545  cerr << "Good_Turing_discount is not appropriate for non backoff grammar !"
546  << endl;
547  return;
548  }
549 
550 
551 
552  // method (for each grammar order):
553  // produce Good-Turing smoothed frequency map
554  // compute difference between unsmoothed map (0,1,2,3...)
555  // and GT map - this is the discount
556  // we do not subtract the discount here since we need to
557  // preserve the raw counts
558 
559  // discounting is applied up to frequency 'maxcount'
560  // beyond which we do not discount
561 
562  // to do : zero low frequencies ???? (prune the tree)
563  /*
564  ngrammar.backoff_traverse(ngrammar.backoff_representation,
565  &zero_small_f,
566  (void*)(&threshold),
567  o-1);
568  */
569 
570  // need to treat for each order independently
571  int i,o;
572 
573  // no discounting for order 1 - YES !?!?!?
574  for (o=1;o<=ngrammar.order();o++)
575  {
576 
577  EST_DVector freqs,mapped_freqs;
578  frequency_of_frequencies(freqs,ngrammar,o);
579 
580  int max = maxcount;
581  if(max > freqs.n() - 2)
582  max = freqs.n() - 2;
583 
584  if(max > 2)
585  {
586  // need at least 3 points
587  // smoothing should be optional - to do
588 
589  // April 97
590  // to overcome problems with N(x)=0, shift data points, fit, and shift back
591 
592  for(i=0;i<=max+1;i++)
593  freqs[i] += 1;
594 
595  smoothed_frequency_distribution_ExponentialFit(freqs,max);
596 
597  for(i=0;i<=max+1;i++)
598  {
599  freqs[i] -= 1;
600  // possibly unnecesary check
601  if ( freqs(i) < 0 )
602  freqs[i] = 0;
603  }
604  }
605 
606  adjusted_frequencies_BasicGoodTuring(mapped_freqs,freqs,max);
607 
608  // and put it in the discount array
609  ngrammar.backoff_discount[o-1].resize(freqs.n());
610  for(i=(int)ngrammar.backoff_threshold;i<=max;i++)
611  {
612  ngrammar.backoff_discount[o-1][i] = (double)i - mapped_freqs(i);
613 
614  // sanity check
615  if( ngrammar.backoff_discount[o-1][i] < 0)
616  {
617  ngrammar.backoff_discount[o-1][i] = 0;
618  }
619  }
620 
621  for(;i<freqs.n();i++)
622  ngrammar.backoff_discount[o-1][i] = default_discount;
623 
624  }
625 }
626 
627 VAL_REGISTER_CLASS(ngrammar,EST_Ngrammar)
628