42 #include "EST_String.h"
43 #include "EST_Ngrammar.h"
// ExponentialFit -- NOTE: only some lines of this definition are visible in
// this excerpt; the comments below describe just what those lines show.
//
// Fits the values N(r), for r in [first, last], with a straight line in
// log-log space and writes the two fitted coefficients to the reference
// parameters `a` and `b`. The caller in this file evaluates the fit as
// exp(a) * pow(r, b), i.e. `a` is the intercept and `b` the slope of
// log(N(r)) against log(r).
//
// The caller also tests the return value with `!`, so a false/zero result
// signals failure (the return statements themselves are not visible here).
46 ExponentialFit(
EST_DVector &N,
double &a,
double &b,
int first,
int last)
// Diagnostic for an invalid `first`; the guarding condition is not visible.
62 cerr <<
"ExponentialFit : first must be >= 0" << endl;
// Diagnostic for an invalid `last`; the guarding condition is not visible.
68 cerr <<
"ExponentialFit : last must be < N.n()-1 = " << N.
n()-1 << endl;
// Running sums used by the closed-form fit below.
// NOTE(review): the rest of this declaration (Elnr, Elnr2, R) and the update
// of ElnNr are not visible -- presumably ElnNr sums log(N(r)) and R counts the
// points used; confirm against the full file.
78 double ElnNr=0.0,ElnNrlnr=0.0,
82 for(
int r=first;r<=last;r++)
// Sum of log(N(r)) * log(r).
88 ElnNrlnr += log( N(r) ) * log( (
double)r );
// Sum of log(r).
90 Elnr += log( (
double)r );
// Sum of log(r)^2.
91 Elnr2 += log( (
double)r ) * log( (
double)r );
// Slope and intercept from the accumulated sums.
// NOTE(review): log() of a zero N(r) or of r == 0 is not finite; whether such
// entries are skipped before reaching the sums is not visible here.
95 b = ( (ElnNr*Elnr/R) - ElnNrlnr ) / ( (Elnr*Elnr/R) - Elnr2);
96 a = (ElnNr - (b*Elnr) ) / R;
// smooth_ExponentialFit -- NOTE: only some lines of this definition are
// visible in this excerpt.
//
// Calls ExponentialFit(N,a,b,first,last) and then overwrites N[r], for every
// r in [first, last], with the fitted value exp(a) * r^b.
102 smooth_ExponentialFit(
EST_DVector &N,
int first,
int last)
// A false result from ExponentialFit is reported on cerr; what happens next
// (e.g. an early return) is not visible in this excerpt.
106 if (!ExponentialFit(N,a,b,first,last))
108 cerr <<
"smooth_ExponentialFit : ExponentialFit failed !" << endl;
// Replace the observed values in place with the fitted curve.
112 for(
int r=first;r<=last;r++)
113 N[r] = exp(a)* pow((
double)r, b);
// Isolated lines from several small helper functions whose headers are not
// visible in this excerpt.
// NOTE(review): elsewhere in this file &get_max_f, &make_f_of_f and
// &map_f_of_f are passed to backoff_traverse together with a void* to
// max / ff / map, so these lines presumably belong to those callbacks --
// confirm against the full file.

// Count one more item at the index obtained by adding 0.5 to freq and
// truncating to int.
132 (*ff)[(int)(freq+0.5)] += 1;
// Reinterpret the opaque `params` pointer as a pointer to a double named max.
145 double *max = (
double*)params;
// Look up the replacement for freq in *map at the same (int)(freq+0.5) index.
176 double nfreq = (*map)((int)(freq+0.5));
// Reinterpret the opaque `params` pointer as a pointer to a double named min.
191 double *min = (
double*)params;
// Body fragments of a function whose header is not visible in this excerpt.
// NOTE(review): presumably frequency_of_frequencies(ff, n, this_order), which
// is called elsewhere in this file -- confirm against the full file.
//
// What the visible lines show: ff is resized to cover the largest observed
// frequency, each item increments ff at index (int)(freq+0.5), and ff[0] is
// set to vocab_length^order minus `total`.
// NOTE(review): the accumulation of `total` is not visible; presumably the
// sum of ff[i] for i >= 1, which would make ff[0] the number of unseen
// n-grams -- confirm.

// `complete` is true when ff is empty on entry; its later use is not visible.
214 bool complete = (bool)(ff.
n() == 0);
216 switch(n.representation())
// Sparse and dense representations share one code path.
219 case EST_Ngrammar::sparse:
220 case EST_Ngrammar::dense:
222 size = n.num_states();
// Track the largest samples() value over the states.
227 if (n.p_states[i].pdf().
samples() > max)
228 max = n.p_states[i].pdf().
samples();
// Size ff from max: (int)(max+1.5) entries.
230 ff.
resize((
int)(max+1.5));
// Fetch each item's frequency and count it at index (int)(freq+0.5).
243 n.p_states[i].pdf().
item_freq(k,name,freq);
244 ff[(int)(freq+0.5)] += 1;
// Loop over indices 1 .. ff.n()-1; the loop body is not visible here.
253 for (i=1;i<ff.
n();i++)
// NOTE(review): both pow arguments are cast to float, which may lose
// precision for large vocabularies or orders -- confirm this is acceptable.
256 ff[0] = pow(
float(n.get_vocab_length()),
float(n.order())) - total;
262 case EST_Ngrammar::backoff:
// First traversal: get_max_f receives a pointer to max.
266 n.backoff_traverse(n.backoff_representation,
267 &get_max_f,(
void*)(&max),
269 ff.
resize((
int)(max+1.5));
// Loop over every ff index; the loop body is not visible here.
276 for (i=0;i<ff.
n();i++)
// Second traversal: make_f_of_f receives a pointer to ff.
279 n.backoff_traverse(n.backoff_representation,
280 &make_f_of_f,(
void*)(&ff),
287 for (i=1;i<ff.
n();i++)
// Same ff[0] formula as above, but using this_order instead of n.order().
289 ff[0] = pow(
float(n.get_vocab_length()),
float(this_order)) - total;
// Diagnostic for an unrecognised representation (case label not visible).
298 cerr <<
"unknown representation for EST_Ngrammar" << endl;
// Body fragments of a function whose header is not visible in this excerpt.
// NOTE(review): presumably map_frequencies(n, map, this_order), which is
// called elsewhere in this file -- confirm against the full file.
//
// What the visible lines show: each item's frequency is looked up in `map`
// at index (int)(freq+0.5) to obtain a replacement value nfreq. How nfreq is
// written back is not visible.
309 switch(n.representation())
// Sparse and dense representations share one code path.
312 case EST_Ngrammar::sparse:
313 case EST_Ngrammar::dense:
315 int size = n.p_num_states;
// Read the current frequency of item k, then look up its mapped value.
325 n.p_states[i].pdf().
item_freq(k,name,freq);
326 nfreq = map((
int)(freq+0.5));
335 case EST_Ngrammar::backoff:
// The backoff tree is handled by traversal; map_f_of_f receives a pointer to
// map.
341 n.backoff_traverse(n.backoff_representation,
342 &map_f_of_f,(
void*)(&map),
// Diagnostic for an unrecognised representation (case label not visible).
349 cerr <<
"unknown representation for EST_Ngrammar" << endl;
// adjusted_frequencies_BasicGoodTuring -- NOTE: only some lines of this
// definition are visible; the rest of the parameter list is cut off (the
// body also uses N and maxcount).
//
// For each r in [0, maxcount], stores in M[r] the adjusted count
// (r + 1) * N(r+1) / N(r), where N(r) is read as a frequency-of-frequency
// table.
356 adjusted_frequencies_BasicGoodTuring(
EST_DVector &M,
// r+1 must be a valid index into N, hence the N.n()-2 limit on maxcount.
// The message says maxcount is reduced; the assignment is not visible here.
364 if (maxcount > N.
n()-2)
367 cerr <<
"adjusted_frequencies_BasicGoodTuring :";
368 cerr <<
" maxcount is too big, reducing it to " << maxcount << endl;
373 for(r=0; r<=maxcount;r++)
// Zero counts are tested separately; what that branch does is not visible.
// NOTE(review): presumably it guards the division below -- confirm.
376 if( (N(r+1) == 0) || (N(r) == 0) )
379 M[r] = (r + 1) * N(r+1) / N(r);
// smoothed_frequency_distribution_ExponentialFit -- NOTE: only some lines of
// this definition are visible in this excerpt.
//
// Smooths the table N in place by calling smooth_ExponentialFit over the
// index range [1, maxcount+1].
390 smoothed_frequency_distribution_ExponentialFit(
EST_DVector &N,
int maxcount)
// maxcount+1 must be a valid index into N, hence the N.n()-2 limit.
// The message says maxcount is reduced; the assignment is not visible here.
392 if (maxcount > N.
n()-2)
395 cerr <<
"smoothed_frequency_distribution_ExponentialFit :"
396 <<
" maxcount too big, reducing it to " << maxcount << endl;
// Index 0 is excluded from the fitted range.
401 if (!smooth_ExponentialFit(N,1,maxcount+1))
402 cerr <<
"smooth_ExponentialFit failed !" << endl;
// Good_Turing_smooth -- NOTE: only some lines of this definition are visible
// in this excerpt (there is a large gap before the final diagnostic).
//
// For sparse and dense grammars the visible pipeline is:
//   1. frequency_of_frequencies(freqs, ngrammar, 0)
//   2. smoothed_frequency_distribution_ExponentialFit(freqs, maxcount-1)
//   3. adjusted_frequencies_BasicGoodTuring(mapped_freqs, freqs, maxcount)
//   4. map_frequencies(ngrammar, mapped_freqs, 0)
// How `mincount` is used is not visible here.
408 Good_Turing_smooth(
EST_Ngrammar &ngrammar,
int maxcount,
int mincount)
// Only grammars whose entries are frequencies are accepted; anything else is
// reported on cerr.
417 if (ngrammar.entry_type() != EST_Ngrammar::frequencies)
419 cerr <<
"EST_Ngram: cannot Good-Turing smooth ngram:" <<
420 " entries are not frequencies" << endl;
424 switch(ngrammar.representation())
427 case EST_Ngrammar::sparse:
428 case EST_Ngrammar::dense:
433 frequency_of_frequencies(freqs,ngrammar,0);
// Note the smoothing step is given maxcount-1, while the adjustment step
// below is given maxcount.
435 smoothed_frequency_distribution_ExponentialFit(freqs,maxcount-1);
437 adjusted_frequencies_BasicGoodTuring(mapped_freqs,freqs,maxcount);
439 map_frequencies(ngrammar,mapped_freqs,0);
// Backoff grammars are reported as unsupported by this function.
444 case EST_Ngrammar::backoff:
447 cerr <<
"Smoothing of backed of grammars is not available!" << endl;
// Diagnostic for an unrecognised representation (case label not visible).
528 cerr <<
"unknown representation for EST_Ngrammar" << endl;
// Good_Turing_discount -- NOTE: only some lines of this definition are
// visible in this excerpt.
//
// For each order o in [1, ngrammar.order()], fills
// ngrammar.backoff_discount[o-1] with per-count discounts:
//   - for i from backoff_threshold to max: i - mapped_freqs(i), with
//     negative values replaced by 0;
//   - for the remaining indices up to freqs.n()-1: default_discount.
// NOTE(review): the declaration and initialisation of `max` are not visible;
// presumably it is derived from maxcount -- confirm.
539 Good_Turing_discount(
EST_Ngrammar &ngrammar,
const int maxcount,
540 const double default_discount)
// Only backoff grammars are handled; other representations are reported on
// cerr.
543 if(ngrammar.representation() != EST_Ngrammar::backoff)
545 cerr <<
"Good_Turing_discount is not appropriate for non backoff grammar !"
574 for (o=1;o<=ngrammar.order();o++)
578 frequency_of_frequencies(freqs,ngrammar,o);
// Compare max against freqs.n()-2; the body of this check is not visible.
581 if(max > freqs.
n() - 2)
// Loop over indices 0 .. max+1; the loop body is not visible here.
592 for(i=0;i<=max+1;i++)
595 smoothed_frequency_distribution_ExponentialFit(freqs,max);
// Loop over indices 0 .. max+1; the loop body is not visible here.
597 for(i=0;i<=max+1;i++)
606 adjusted_frequencies_BasicGoodTuring(mapped_freqs,freqs,max);
// One discount slot per entry of freqs.
609 ngrammar.backoff_discount[o-1].
resize(freqs.
n());
610 for(i=(
int)ngrammar.backoff_threshold;i<=max;i++)
// Discount = raw count minus adjusted count.
612 ngrammar.backoff_discount[o-1][i] = (double)i - mapped_freqs(i);
// Negative discounts are replaced by zero.
615 if( ngrammar.backoff_discount[o-1][i] < 0)
617 ngrammar.backoff_discount[o-1][i] = 0;
// i carries on from where the previous loop stopped; all remaining entries
// get the default discount.
621 for(;i<freqs.
n();i++)
622 ngrammar.backoff_discount[o-1][i] = default_discount;