Edinburgh Speech Tools  2.4-release
 All Classes Functions Variables Typedefs Enumerations Enumerator Friends Pages
freqsmooth.cc
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1997 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Paul Taylor & Simon King & Alan W Black */
34 /* Date : April 1997 */
35 /*-----------------------------------------------------------------------*/
36 /* */
37 /* Yet another method for smoothing an ngram */
38 /* */
39 /* Good_Turing smooth n-grammar so there are no zero occurrences */
40 /* For each ngram */
41 /* if |n-1gram| < threshold */
42 /* F(ngram) = !n-1gram|*(P(n-1gram)) */
43 /* */
44 /*=======================================================================*/
45 #include <iostream>
46 #include <cstring>
47 #include <cmath>
48 #include <climits>
49 #include <cfloat>
50 #include "EST_Ngrammar.h"
51 
52 static double fs_find_backoff_prob(EST_Ngrammar *backoff_ngrams,
53  int order,const EST_StrVector words,
54  int smooth_thresh);
55 
56 void Ngram_freqsmooth(EST_Ngrammar &ngram,int smooth_thresh1,
57  int smooth_thresh2)
58 {
59  // To assign reasonable frequencies for all ngrams in grammar
60  EST_Ngrammar *backoff_ngrams;
61  backoff_ngrams = new EST_Ngrammar[ngram.order()-1];
62 
63  Good_Turing_smooth(ngram,smooth_thresh1,0);
64 
65  fs_build_backoff_ngrams(backoff_ngrams,ngram);
66 
67  fs_backoff_smooth(backoff_ngrams,ngram,smooth_thresh2);
68 
69  delete [] backoff_ngrams;
70 
71 }
72 
73 void fs_build_backoff_ngrams(EST_Ngrammar *backoff_ngrams,
74  EST_Ngrammar &ngram)
75 {
76  // Build all the backoff grammars back to uni-grams
77  int i,j,l;
78  EST_Litem *k;
79 
80  for (i=0; i < ngram.order()-1; i++)
81  backoff_ngrams[i].init(i+1,EST_Ngrammar::dense,
82  *ngram.vocab,*ngram.pred_vocab);
83 
84  for (i=0; i < ngram.num_states(); i++)
85  {
86  const EST_StrVector words = ngram.make_ngram_from_index(i);
87 
88  for (k=ngram.p_states[i].pdf().item_start();
89  !ngram.p_states[i].pdf().item_end(k);
90  k = ngram.p_states[i].pdf().item_next(k))
91  {
92  double freq;
93  EST_String name;
94  ngram.p_states[i].pdf().item_freq(k,name,freq);
95  // Build all the sub-ngrams and accumulate them
96  for (j=0; j < ngram.order()-1; j++)
97  {
98  EST_StrVector nnn(j+1);
99  nnn[j] = name;
100  for (l=0; l < j; l++)
101  nnn[l] = words(ngram.order()-1-j);
102  backoff_ngrams[j].accumulate(nnn,freq);
103  }
104  }
105  }
106 
107 }
108 
109 int fs_backoff_smooth(EST_Ngrammar *backoff_ngrams,
110  EST_Ngrammar &ngram, int smooth_thresh)
111 {
112  // For all ngrams which are too infrequent, adjust their
113  // frequencies based on their backoff probabilities
114  int i;
115  EST_Litem *j;
116  double occurs;
117  double backoff_prob;
118 
119  if (ngram.representation() != EST_Ngrammar::dense)
120  {
121  cerr << "Ngrammar: can only ptsmooth dense ngrammars" << endl;
122  return FALSE;
123  }
124  else
125  {
126  for (i=0; i < ngram.num_states(); i++)
127  {
128  if (ngram.p_states[i].pdf().samples() < smooth_thresh)
129  {
130  EST_DiscreteProbDistribution &pdf = ngram.p_states[i].pdf();
131  occurs = ngram.p_states[i].pdf().samples();
132  EST_StrVector words = ngram.make_ngram_from_index(i);
133  words.resize(words.n()+1);
134 
135  for (j=pdf.item_start();
136  ! pdf.item_end(j);
137  j = pdf.item_next(j))
138  {
139  EST_String name;
140  double freq;
141  pdf.item_freq(j,name,freq);
142  words[words.n()-1] = name;
143  backoff_prob =
144  fs_find_backoff_prob(backoff_ngrams,
145  ngram.order()-1,
146  words,
147  smooth_thresh);
148  pdf.set_frequency(j,occurs*backoff_prob);
149  }
150  }
151  }
152  }
153 
154  return TRUE;
155 }
156 
157 static double fs_find_backoff_prob(EST_Ngrammar *backoff_ngrams,
158  int order,const EST_StrVector words,
159  int smooth_thresh)
160 {
161  // Find the backoff prob for n-1gram
162  EST_StrVector nnn;
163  int i;
164 
165  if (order == 0)
166  return TINY_FREQ; // ultimate floor
167 
168  nnn.resize(order);
169 
170  for(i=0; i<order; i++)
171  nnn[order-1-i] = words(words.n()-1-i);
172 
173  if (backoff_ngrams[order-1].frequency(nnn) < smooth_thresh)
174  return fs_find_backoff_prob(backoff_ngrams,
175  order-1,words,smooth_thresh);
176  else
177  return backoff_ngrams[order-1].probability(nnn);
178 }