45 #include "EST_String.h"
46 #include "EST_types.h"
47 #include "EST_Token.h"
49 #include "EST_multistats.h"
54 const
EST_String PredictionSuffixTree_oov("_OOV_");
65 EST_PredictionSuffixTree_tree_node::print_freqs(ostream &os)
80 os << get_path() <<
" " << s <<
" : " << freq << endl;
86 for (t.
begin(nodes); t; t++)
87 pstnode(t->v)->print_freqs(os);
92 EST_PredictionSuffixTree_tree_node::print_probs(ostream &os)
101 os << get_path() <<
" :";
105 os <<
" " << s <<
" " << prob;
112 for (t.
begin(nodes); t; t++)
113 pstnode(t->v)->print_probs(os);
118 EST_PredictionSuffixTree_tree_node::most_probable(
double *prob)
const
124 EST_PredictionSuffixTree::EST_PredictionSuffixTree(
void)
133 EST_PredictionSuffixTree::init(
const int order)
138 nodes->set_level(order-1);
142 EST_PredictionSuffixTree::~EST_PredictionSuffixTree()
149 EST_PredictionSuffixTree::clear(
void)
159 const int index)
const
163 if (words.
n() == index+1){
164 return node->prob_dist();
169 next = pstnode(node->nodes.
f(words(index),est_val(n)));
174 return PSTnullProbDistribution;
176 return p_prob_dist(next,words,index+1);
182 EST_PredictionSuffixTree::accumulate(
const EST_StrVector &words,
194 if (words.
n()+index < p_order)
195 cerr <<
"EST_PredictionSuffixTree: accumlating window is wtoo small"
199 pd->
cumulate(words(p_order-1+index),count);
200 p_accumulate(nodes,words,count,index);
211 if (words.
n() == index+1)
213 if (node->prob_dist().
samples() == 0)
214 node->set_state(num_states++);
215 node->cumulate(words(index),count);
221 next = pstnode(node->nodes.
f(words(index),est_val(n)));
225 if (node->get_path() ==
"")
226 next->set_path(words(index));
229 next->set_path(node->get_path()+
" "+ words(index));
231 next->set_level(node->get_level()-1);
232 node->nodes.
set_val(words(index),est_val(next));
234 p_accumulate(next,words,count,index+1);
239 EST_PredictionSuffixTree::rev_prob(
const EST_StrVector &words)
const
243 double d1 = pg.frequency(words(order()-1));
244 double d2 = pd->frequency(words(order()-1));
250 EST_PredictionSuffixTree::rev_prob(
const EST_StrVector &words,
254 return (
double)pg.frequency(words(order()-1)) /
255 pd->frequency(words(order()-1));
263 return ppredict(nodes,words,&p,&state);
270 return ppredict(nodes,words,p,&state);
276 return ppredict(nodes,words,p,state);
282 double *p,
int *state,
283 const int index)
const
286 if (words.
n() == index+1)
288 *state = node->get_state();
289 return node->most_probable(p);
295 next = pstnode(node->nodes.
f(words(index),est_val(n)));
300 return PredictionSuffixTree_oov;
303 return ppredict(next,words,p,state,index+1);
308 EST_PredictionSuffixTree::print_freqs(ostream &os)
312 os <<
"EST_PredictionSuffixTree order=" << p_order << endl;
313 nodes->print_freqs(os);
318 EST_PredictionSuffixTree::print_probs(ostream &os)
322 os <<
"EST_PredictionSuffixTree " << p_order << endl;
323 nodes->print_probs(os);
328 EST_PredictionSuffixTree::save(
const EST_String filename,
const EST_PredictionSuffixTree::EST_filetype type)
337 ofstream os(filename);
344 EST_PredictionSuffixTree::load(
const EST_String filename)
351 if (ts.
open(filename) != 0)
353 cerr <<
"EST_PredictionSuffixTree: failed to open \"" << filename <<
"\" for reading\n";
358 if (ts.
get() !=
"EST_PredictionSuffixTree")
360 cerr <<
"EST_PredictionSuffixTree: file \"" << filename <<
"\" not an EST_PredictionSuffixTree\n";
364 order = atoi(ts.
get().string());
365 if ((order < 1) || (order > 10))
367 cerr <<
"EST_PredictionSuffixTree: file \"" << filename <<
"\" has bad order\n";
373 for (i=0; i<p_order; i++)
379 window[p_order-1] = ts.
get().string();
382 cerr <<
"EST_PredictionSuffixTree: file \"" << filename <<
"\" missed parsed line ";
383 cerr << ts.
linenum() <<
" near EST_PredictionSuffixTree\n";
384 for (i=0; i < order; i++)
385 cout <<
" " << window(i);
389 freq = atoi(ts.
get().string());
390 accumulate(window,freq);
397 EST_PredictionSuffixTree::build(
const EST_String filename,
406 ts.
open(stdin, FALSE);
407 else if (ts.
open(filename) == -1)
412 for (i=0; i<p_order-1; i++)
413 window[i] = prev_prev;
414 window[p_order-1] = prev;
416 accumulate(window,1);
423 window[p_order-1] = ts.
get().string();
424 accumulate(window,1);
430 window[p_order-1] = last;
431 accumulate(window,1);
436 EST_PredictionSuffixTree::build(
const EST_StrList &input)
448 for (i=0; i<p_order; i++)
452 for(i_ptr=input.head();i_ptr!=NULL;i_ptr=i_ptr->next())
455 window[p_order-1] = input(i_ptr);
456 accumulate(window,1);
462 EST_PredictionSuffixTree::test(
const EST_String filename)
473 ts.
open(stdin, FALSE);
474 else if (ts.
open(filename) == -1)
480 for (p.
begin(nodes->nodes); p; p++)
486 for (i=0; i<p_order; i++)
489 int num_tsamples = 0;
494 window[p_order-1] = ts.
get().string();
499 pairs.
add_item(window(p_order-1),predict(window),1);
503 print_confusion(m,pairs,lex);
504 cout <<
"Mean entropy (?) is " << e/num_tsamples << endl;