/*************************************************************************/
/*                                                                       */
/*                Centre for Speech Technology Research                  */
/*                     University of Edinburgh, UK                       */
/*                      Copyright (c) 1996,1997                          */
/*                        All Rights Reserved.                           */
/*                                                                       */
/*  Permission is hereby granted, free of charge, to use and distribute  */
/*  this software and its documentation without restriction, including   */
/*  without limitation the rights to use, copy, modify, merge, publish,  */
/*  distribute, sublicense, and/or sell copies of this work, and to      */
/*  permit persons to whom this work is furnished to do so, subject to   */
/*  the following conditions:                                            */
/*   1. The code must retain the above copyright notice, this list of    */
/*      conditions and the following disclaimer.                         */
/*   2. Any modifications must be clearly marked as such.                */
/*   3. Original authors' names are not deleted.                         */
/*   4. The authors' names are not used to endorse or promote products   */
/*      derived from this software without specific prior written        */
/*      permission.                                                      */
/*                                                                       */
/*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        */
/*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
/*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
/*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     */
/*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
/*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
/*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
/*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
/*  THIS SOFTWARE.                                                       */
/*                                                                       */
/*************************************************************************/
/*                 Author :  Alan W Black                                */
/*                 Date   :  May 1996                                    */
/*-----------------------------------------------------------------------*/
/*  A Classification and Regression Tree (CART) Program                  */
/*  A basic implementation of many of the techniques in                  */
/*  Breiman et al. 1984                                                  */
/*                                                                       */
/*  Added decision list support, Feb 1997                                */
/*  Added stepwise use of features, Oct 1997                             */
/*                                                                       */
/*=======================================================================*/

#include <cstdlib>
#include <iostream>
#include <fstream>
#include <cstring>
#include "EST_Token.h"
#include "EST_FMatrix.h"
#include "EST_multistats.h"
#include "EST_Wagon.h"
#include "EST_math.h"

Discretes wgn_discretes;

WDataSet wgn_dataset;
WDataSet wgn_test_dataset;
EST_FMatrix wgn_DistMatrix;
EST_Track wgn_VertexTrack;
EST_Track wgn_VertexFeats;
EST_Track wgn_UnitTrack;

int wgn_min_cluster_size = 50;
int wgn_held_out = 0;
int wgn_prune = TRUE;
int wgn_quiet = FALSE;
int wgn_verbose = FALSE;
int wgn_count_field = -1;
EST_String wgn_count_field_name = "";
int wgn_predictee = 0;
EST_String wgn_predictee_name = "";
float wgn_float_range_split = 10;
float wgn_balance = 0;
EST_String wgn_opt_param = "";
EST_String wgn_vertex_output = "mean";
EST_String wgn_vertex_otype = "mean";

static float do_summary(WNode &tree,WDataSet &ds,ostream *output);
static float test_tree_float(WNode &tree,WDataSet &ds,ostream *output);
static float test_tree_class(WNode &tree,WDataSet &ds,ostream *output);
static float test_tree_cluster(WNode &tree,WDataSet &dataset,ostream *output);
static float test_tree_vector(WNode &tree,WDataSet &dataset,ostream *output);
static float test_tree_trajectory(WNode &tree,WDataSet &dataset,ostream *output);
static float test_tree_ols(WNode &tree,WDataSet &dataset,ostream *output);
static int wagon_split(int margin,WNode &node);
static WQuestion find_best_question(WVectorVector &dset);
static void construct_binary_ques(int feat,WQuestion &test_ques);
static float construct_float_ques(int feat,WQuestion &ques,WVectorVector &ds);
static float construct_class_ques(int feat,WQuestion &ques,WVectorVector &ds);
static void wgn_set_up_data(WVectorVector &data,const WVectorList &ds,int held_out,int in);
static WNode *wagon_stepwise_find_next_best(float &bscore,int &best_feat);

Declare_TList_T(WVector *, WVectorP)

Declare_TVector_Base_T(WVector *,NULL,NULL,WVectorP)

#if defined(INSTANTIATE_TEMPLATES)
// Instantiate class
#include "../base_class/EST_TList.cc"
#include "../base_class/EST_TVector.cc"

Instantiate_TList_T(WVector *, WVectorP)

Instantiate_TVector(WVector *)

#endif

void wgn_load_datadescription(EST_String fname,LISP ignores)
{
    // Load field description for a file
    wgn_dataset.load_description(fname,ignores);
    wgn_test_dataset.load_description(fname,ignores);
}

void wgn_load_dataset(WDataSet &dataset,EST_String fname)
{
    // Read the data set from a file, one vector per line.
    // Numeric fields are read as numbers; non-numeric fields are
    // treated as categorical values.
    EST_TokenStream ts;
    WVector *v;
    int nvec=0,i;

    if (ts.open(fname) == -1)
        wagon_error(EST_String("unable to open data file \"")+
                    fname+"\"");
    ts.set_PunctuationSymbols("");
    ts.set_PrePunctuationSymbols("");
    ts.set_SingleCharSymbols("");

    for ( ;!ts.eof(); )
    {
        v = new WVector(dataset.width());
        i = 0;
        do
        {
            int type = dataset.ftype(i);
            if ((type == wndt_float) ||
                (type == wndt_ols) ||
                (wgn_count_field == i))
            {
                // need to ensure this is not NaN or Infinity
                float f = atof(ts.get().string());
                if (isfinite(f))
                    v->set_flt_val(i,f);
                else
                {
                    cout << fname << ": bad float " << f
                         << " in field " <<
                        dataset.feat_name(i) << " vector " <<
                        dataset.samples() << endl;
                    v->set_flt_val(i,0.0);
                }
            }
            else if (type == wndt_binary)
                v->set_int_val(i,atoi(ts.get().string()));
            else if (type == wndt_cluster)    /* index into distmatrix */
                v->set_int_val(i,atoi(ts.get().string()));
            else if (type == wndt_vector)     /* index into VertexTrack */
                v->set_int_val(i,atoi(ts.get().string()));
            else if (type == wndt_trajectory) /* index to index and length */
            {   /* a number pointing to a vector in UnitTrack that has   */
                /* an index into VertexTrack and a number of vertices.   */
                /* Thus if it's 15, UnitTrack.a(15,0) is the start frame */
                /* in VertexTrack and UnitTrack.a(15,1) is the number of */
                /* frames in the unit                                    */
                v->set_int_val(i,atoi(ts.get().string()));
            }
            else if (type == wndt_ignore)
            {
                ts.get();  // skip it
                v->set_int_val(i,0);
            }
            else // should check the different classes
            {
                EST_String s = ts.get().string();
                int n = wgn_discretes.discrete(type).name(s);
                if (n == -1)
                {
                    cout << fname << ": bad value " << s << " in field " <<
                        dataset.feat_name(i) << " vector " <<
                        dataset.samples() << endl;
                    n = 0;
                }
                v->set_int_val(i,n);
            }
            i++;
        }
        while (!ts.eoln() && i<dataset.width());
        nvec++;
        if (i != dataset.width())
        {
            wagon_error(fname+": data vector "+itoString(nvec)+" contains "
                        +itoString(i)+" parameters instead of "+
                        itoString(dataset.width()));
        }
        if (!ts.eoln())
        {
            cerr << fname << ": data vector " << nvec <<
                " contains too many parameters, expected "
                 << dataset.width() << endl;
            wagon_error(EST_String("extra parameter(s) from ")+
                        ts.peek().string());
        }
        dataset.append(v);
    }

    cout << "Dataset of " << dataset.samples() << " vectors of " <<
        dataset.width() << " parameters from: " << fname << endl;
    ts.close();
}

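/* Example (illustrative only; the field names and values below are
   hypothetical): given a description file declaring three fields

       ((duration float) (stress 0 1) (phone a e i o u))

   a matching data file holds one vector per line, e.g.

       0.153 1 a
       0.270 0 e

   "duration" is parsed with atof() as a float field, while "stress"
   and "phone" are looked up in their discrete value sets. */
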
float summary_results(WNode &tree,ostream *output)
{
    if (wgn_test_dataset.samples() != 0)
        return do_summary(tree,wgn_test_dataset,output);
    else
        return do_summary(tree,wgn_dataset,output);
}

static float do_summary(WNode &tree,WDataSet &ds,ostream *output)
{
    if (wgn_dataset.ftype(wgn_predictee) == wndt_cluster)
        return test_tree_cluster(tree,ds,output);
    else if (wgn_dataset.ftype(wgn_predictee) == wndt_vector)
        return test_tree_vector(tree,ds,output);
    else if (wgn_dataset.ftype(wgn_predictee) == wndt_trajectory)
        return test_tree_trajectory(tree,ds,output);
    else if (wgn_dataset.ftype(wgn_predictee) == wndt_ols)
        return test_tree_ols(tree,ds,output);
    else if (wgn_dataset.ftype(wgn_predictee) >= wndt_class)
        return test_tree_class(tree,ds,output);
    else
        return test_tree_float(tree,ds,output);
}

WNode *wgn_build_tree(float &score)
{
    // Build the initial node and split it while reducing the impurity
    WNode *top = new WNode();
    int margin = 0;

    wgn_set_up_data(top->get_data(),wgn_dataset,wgn_held_out,TRUE);

    margin = 0;
    wagon_split(margin,*top);  // recursively split data

    if (wgn_held_out > 0)
    {
        wgn_set_up_data(top->get_data(),wgn_dataset,wgn_held_out,FALSE);
        top->held_out_prune();
    }

    if (wgn_prune)
        top->prune();

    score = summary_results(*top,0);

    return top;
}

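#if 0
/* Illustrative sketch only (not part of the build): one way the tree
   builder might be driven once the description and data have been
   loaded.  The function name example_build is hypothetical. */
static WNode *example_build(void)
{
    float score = 0.0;
    // builds, optionally held-out prunes, prunes, and scores the tree
    WNode *tree = wgn_build_tree(score);
    cout << "tree score " << score << endl;
    return tree;
}
#endif
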
static void wgn_set_up_data(WVectorVector &data,const WVectorList &ds,int held_out,int in)
{
    // If in is true, set the data omitting held_out percent of the
    // vectors; if in is false, include every vector (the stricter
    // complement selection is in the commented-out code below)
    int i,j;
    EST_Litem *d;

    // Make it definitely big enough
    data.resize(ds.length());

    for (j=i=0,d=ds.head(); d != 0; d=d->next(),j++)
    {
        if ((in) && ((j%100) >= held_out))
            data[i++] = ds(d);
//        else if ((!in) && ((j%100 < held_out)))
//            data[i++] = ds(d);
        else if (!in)
            data[i++] = ds(d);
//        if ((in) && (j < held_out))
//            data[i++] = ds(d);
//        else if ((!in) && (j >= held_out))
//            data[i++] = ds(d);
    }
    // make it the actual size, but don't reset values
    data.resize(i,1);
}

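/* Worked example (hypothetical numbers): with held_out = 10 and
   in = TRUE, vectors whose index j has j%100 in 0..9 are skipped, so
   roughly 90% of the data is kept for building; with in = FALSE every
   vector is included (the stricter complement selection is the
   commented-out code above). */
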
static float test_tree_class(WNode &tree,WDataSet &dataset,ostream *output)
{
    // Test tree against data to get summary of results
    EST_StrStr_KVL pairs;
    EST_StrList lex;
    EST_Litem *p;
    EST_String predict,real;
    WNode *pnode;
    double H=0,prob;
    int i,type;
    float correct=0,total=0,count=0;

    float bcorrect=0,bpredicted=0,bactual=0;
    float precision=0,recall=0;

    for (p=dataset.head(); p != 0; p=p->next())
    {
        pnode = tree.predict_node((*dataset(p)));
        predict = (EST_String)pnode->get_impurity().value();
        if (wgn_count_field == -1)
            count = 1.0;
        else
            count = dataset(p)->get_flt_val(wgn_count_field);
        prob = pnode->get_impurity().pd().probability(predict);
        H += (log(prob))*count;
        type = dataset.ftype(wgn_predictee);
        real = wgn_discretes[type].name(dataset(p)->get_int_val(wgn_predictee));

        if (wgn_opt_param == "B_NB_F1")
        {
            //cout << real << " " << predict << endl;
            if (real == "B")
                bactual += count;
            if (predict == "B")
            {
                bpredicted += count;
                if (real == predict)
                    bcorrect += count;
            }
            // cout << bactual << " " << bpredicted << " " << bcorrect << endl;
        }
        if (real == predict)
            correct += count;
        total += count;
        pairs.add_item(real,predict,1);
    }
    for (i=0; i<wgn_discretes[dataset.ftype(wgn_predictee)].length(); i++)
        lex.append(wgn_discretes[dataset.ftype(wgn_predictee)].name(i));

    const EST_FMatrix &m = confusion(pairs,lex);

    if (output != NULL)
    {
        print_confusion(m,pairs,lex);  // should go to output, not stdout
        *output << ";; entropy " << (-1*(H/total)) << " perplexity " <<
            pow(2.0,(-1*(H/total))) << endl;
    }

    // Negated so that bigger is better
    if (wgn_opt_param == "entropy")
        return -pow(2.0,(-1*(H/total)));
    else if (wgn_opt_param == "B_NB_F1")
    {
        if (bpredicted == 0)
            precision = 1;
        else
            precision = bcorrect/bpredicted;
        if (bactual == 0)
            recall = 1;
        else
            recall = bcorrect/bactual;
        float fmeasure = 0;
        if ((precision+recall) != 0)
            fmeasure = 2*(precision*recall)/(precision+recall);
        cout << "F1 :" << fmeasure << " Prec:" << precision << " Rec:" << recall
             << " B-Pred:" << bpredicted << " B-Actual:" << bactual
             << " B-Correct:" << bcorrect << endl;
        return fmeasure;
    }
    else
        return (float)correct/(float)total;
}

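/* Worked example of the summary statistics above (hypothetical
   numbers): with total = 100 weighted samples and summed weighted log
   probability H = -120, the reported entropy is -H/total = 1.2 and the
   perplexity is pow(2.0,1.2), about 2.3.  For B_NB_F1, with
   bpredicted = 40, bactual = 50 and bcorrect = 30:
       precision = 30/40 = 0.75
       recall    = 30/50 = 0.60
       F1        = 2*0.75*0.60/(0.75+0.60), about 0.667 */
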
static float test_tree_vector(WNode &tree,WDataSet &dataset,ostream *output)
{
    // Test tree against data to get summary of results VECTOR
    // distance is calculated in zscores (as the values in the vector
    // may have quite different ranges)
    WNode *leaf;
    EST_Litem *p;
    float predict,actual;
    EST_SuffStats x,y,xx,yy,xy,se,e;
    EST_SuffStats b;
    int i,j,pos;
    double cor,error;
    double count;
    EST_Litem *pp;

    for (p=dataset.head(); p != 0; p=p->next())
    {
        leaf = tree.predict_node((*dataset(p)));
        pos = dataset(p)->get_int_val(wgn_predictee);
        for (j=0; j<wgn_VertexFeats.num_channels(); j++)
            if (wgn_VertexFeats.a(0,j) > 0.0)
            {
                b.reset();
                for (pp=leaf->get_impurity().members.head(); pp != 0; pp=pp->next())
                {
                    i = leaf->get_impurity().members.item(pp);
                    b += wgn_VertexTrack.a(i,j);
                }
                predict = b.mean();
                actual = wgn_VertexTrack.a(pos,j);
                if (wgn_count_field == -1)
                    count = 1.0;
                else
                    count = dataset(p)->get_flt_val(wgn_count_field);
                x.cumulate(predict,count);
                y.cumulate(actual,count);
                /* Normalize the error by the standard deviation */
                if (b.stddev() == 0)
                    error = predict-actual;
                else
                    error = (predict-actual)/b.stddev();
                error = predict-actual; /* awb_debug */
                se.cumulate((error*error),count);
                e.cumulate(fabs(error),count);
                xx.cumulate(predict*predict,count);
                yy.cumulate(actual*actual,count);
                xy.cumulate(predict*actual,count);
            }
    }

    // Pearson's product moment correlation coefficient
//    cor = (xy.mean() - (x.mean()*y.mean()))/
//        (sqrt(xx.mean()-(x.mean()*x.mean())) *
//         sqrt(yy.mean()-(y.mean()*y.mean())));
    // Because when the variation in X is very small we can
    // go negative, and thus cause the sqrt's to give an FPE
    double v1 = xx.mean()-(x.mean()*x.mean());
    double v2 = yy.mean()-(y.mean()*y.mean());

    double v3 = v1*v2;

    if (v3 <= 0)
        // happens when there's very little variation in x
        cor = 0;
    else
        cor = (xy.mean() - (x.mean()*y.mean()))/sqrt(v3);

    if (output != NULL)
    {
        if (output != &cout)  // save in output file
            *output
                << ";; RMSE " << ftoString(sqrt(se.mean()),4,1)
                << " Correlation is " << ftoString(cor,4,1)
                << " Mean (abs) Error " << ftoString(e.mean(),4,1)
                << " (" << ftoString(e.stddev(),4,1) << ")" << endl;

        cout << "RMSE " << ftoString(sqrt(se.mean()),4,1)
             << " Correlation is " << ftoString(cor,4,1)
             << " Mean (abs) Error " << ftoString(e.mean(),4,1)
             << " (" << ftoString(e.stddev(),4,1) << ")" << endl;
    }

    if (wgn_opt_param == "rmse")
        return -sqrt(se.mean());  // negated so bigger is better
    else
        return cor;  // should really be % variance, I think
}

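/* The guarded computation above is Pearson's correlation
       r = (E[xy] - E[x]E[y]) / sqrt((E[xx] - E[x]^2) * (E[yy] - E[y]^2))
   with the two variances multiplied together before taking a single
   sqrt, so that a tiny (or numerically negative) variance yields r = 0
   rather than a floating point exception from sqrt of a negative. */
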
static float test_tree_trajectory(WNode &tree,WDataSet &dataset,ostream *output)
{
    // Test tree against data to get summary of results TRAJECTORY
    // distance is calculated in zscores (as the values in the vector
    // may have quite different ranges)
    // NOT WRITTEN YET
    WNode *leaf;
    EST_Litem *p;
    float predict,actual;
    EST_SuffStats x,y,xx,yy,xy,se,e;
    EST_SuffStats b;
    int i,j,pos;
    double cor,error;
    double count;
    EST_Litem *pp;

    for (p=dataset.head(); p != 0; p=p->next())
    {
        leaf = tree.predict_node((*dataset(p)));
        pos = dataset(p)->get_int_val(wgn_predictee);
        for (j=0; j<wgn_VertexFeats.num_channels(); j++)
            if (wgn_VertexFeats.a(0,j) > 0.0)
            {
                b.reset();
                for (pp=leaf->get_impurity().members.head(); pp != 0; pp=pp->next())
                {
                    i = leaf->get_impurity().members.item(pp);
                    b += wgn_VertexTrack.a(i,j);
                }
                predict = b.mean();
                actual = wgn_VertexTrack.a(pos,j);
                if (wgn_count_field == -1)
                    count = 1.0;
                else
                    count = dataset(p)->get_flt_val(wgn_count_field);
                x.cumulate(predict,count);
                y.cumulate(actual,count);
                /* Normalize the error by the standard deviation */
                if (b.stddev() == 0)
                    error = predict-actual;
                else
                    error = (predict-actual)/b.stddev();
                error = predict-actual; /* awb_debug */
                se.cumulate((error*error),count);
                e.cumulate(fabs(error),count);
                xx.cumulate(predict*predict,count);
                yy.cumulate(actual*actual,count);
                xy.cumulate(predict*actual,count);
            }
    }

    // Pearson's product moment correlation coefficient
//    cor = (xy.mean() - (x.mean()*y.mean()))/
//        (sqrt(xx.mean()-(x.mean()*x.mean())) *
//         sqrt(yy.mean()-(y.mean()*y.mean())));
    // Because when the variation in X is very small we can
    // go negative, and thus cause the sqrt's to give an FPE
    double v1 = xx.mean()-(x.mean()*x.mean());
    double v2 = yy.mean()-(y.mean()*y.mean());

    double v3 = v1*v2;

    if (v3 <= 0)
        // happens when there's very little variation in x
        cor = 0;
    else
        cor = (xy.mean() - (x.mean()*y.mean()))/sqrt(v3);

    if (output != NULL)
    {
        if (output != &cout)  // save in output file
            *output
                << ";; RMSE " << ftoString(sqrt(se.mean()),4,1)
                << " Correlation is " << ftoString(cor,4,1)
                << " Mean (abs) Error " << ftoString(e.mean(),4,1)
                << " (" << ftoString(e.stddev(),4,1) << ")" << endl;

        cout << "RMSE " << ftoString(sqrt(se.mean()),4,1)
             << " Correlation is " << ftoString(cor,4,1)
             << " Mean (abs) Error " << ftoString(e.mean(),4,1)
             << " (" << ftoString(e.stddev(),4,1) << ")" << endl;
    }

    if (wgn_opt_param == "rmse")
        return -sqrt(se.mean());  // negated so bigger is better
    else
        return cor;  // should really be % variance, I think
}

static float test_tree_cluster(WNode &tree,WDataSet &dataset,ostream *output)
{
    // Test tree against data to get summary of results for cluster trees
    WNode *leaf;
    int real;
    int right_cluster=0;
    EST_SuffStats ranking, meandist;
    EST_Litem *p;

    for (p=dataset.head(); p != 0; p=p->next())
    {
        leaf = tree.predict_node((*dataset(p)));
        real = dataset(p)->get_int_val(wgn_predictee);
        meandist += leaf->get_impurity().cluster_distance(real);
        right_cluster += leaf->get_impurity().in_cluster(real);
        ranking += leaf->get_impurity().cluster_ranking(real);
    }

    if (output != NULL)
    {
        // Want number in right class, mean distance in sds, mean ranking
        if (output != &cout)  // save in output file
            *output << ";; Right cluster " << right_cluster << " (" <<
                (int)(100.0*(float)right_cluster/(float)dataset.length()) <<
                "%) mean ranking " << ranking.mean() << " mean distance "
                    << meandist.mean() << endl;
        cout << "Right cluster " << right_cluster << " (" <<
            (int)(100.0*(float)right_cluster/(float)dataset.length()) <<
            "%) mean ranking " << ranking.mean() << " mean distance "
             << meandist.mean() << endl;
    }

    return 10000-meandist.mean();  // this doesn't work but I tested it
}

static float test_tree_float(WNode &tree,WDataSet &dataset,ostream *output)
{
    // Test tree against data to get summary of results FLOAT
    EST_Litem *p;
    float predict,real;
    EST_SuffStats x,y,xx,yy,xy,se,e;
    double cor,error;
    double count;

    for (p=dataset.head(); p != 0; p=p->next())
    {
        predict = tree.predict((*dataset(p)));
        real = dataset(p)->get_flt_val(wgn_predictee);
        if (wgn_count_field == -1)
            count = 1.0;
        else
            count = dataset(p)->get_flt_val(wgn_count_field);
        x.cumulate(predict,count);
        y.cumulate(real,count);
        error = predict-real;
        se.cumulate((error*error),count);
        e.cumulate(fabs(error),count);
        xx.cumulate(predict*predict,count);
        yy.cumulate(real*real,count);
        xy.cumulate(predict*real,count);
    }

    // Pearson's product moment correlation coefficient
//    cor = (xy.mean() - (x.mean()*y.mean()))/
//        (sqrt(xx.mean()-(x.mean()*x.mean())) *
//         sqrt(yy.mean()-(y.mean()*y.mean())));
    // Because when the variation in X is very small we can
    // go negative, and thus cause the sqrt's to give an FPE
    double v1 = xx.mean()-(x.mean()*x.mean());
    double v2 = yy.mean()-(y.mean()*y.mean());

    double v3 = v1*v2;

    if (v3 <= 0)
        // happens when there's very little variation in x
        cor = 0;
    else
        cor = (xy.mean() - (x.mean()*y.mean()))/sqrt(v3);

    if (output != NULL)
    {
        if (output != &cout)  // save in output file
            *output
                << ";; RMSE " << ftoString(sqrt(se.mean()),4,1)
                << " Correlation is " << ftoString(cor,4,1)
                << " Mean (abs) Error " << ftoString(e.mean(),4,1)
                << " (" << ftoString(e.stddev(),4,1) << ")" << endl;

        cout << "RMSE " << ftoString(sqrt(se.mean()),4,1)
             << " Correlation is " << ftoString(cor,4,1)
             << " Mean (abs) Error " << ftoString(e.mean(),4,1)
             << " (" << ftoString(e.stddev(),4,1) << ")" << endl;
    }

    if (wgn_opt_param == "rmse")
        return -sqrt(se.mean());  // negated so bigger is better
    else
        return cor;  // should really be % variance, I think
}

static float test_tree_ols(WNode &tree,WDataSet &dataset,ostream *output)
{
    // Test tree against data to get summary of results OLS
    EST_Litem *p;
    WNode *leaf;
    float predict,real;
    EST_SuffStats x,y,xx,yy,xy,se,e;
    double cor,error;
    double count;

    for (p=dataset.head(); p != 0; p=p->next())
    {
        leaf = tree.predict_node((*dataset(p)));
        // do ols to get predict
        predict = 0.0;
        real = dataset(p)->get_flt_val(wgn_predictee);
        if (wgn_count_field == -1)
            count = 1.0;
        else
            count = dataset(p)->get_flt_val(wgn_count_field);
        x.cumulate(predict,count);
        y.cumulate(real,count);
        error = predict-real;
        se.cumulate((error*error),count);
        e.cumulate(fabs(error),count);
        xx.cumulate(predict*predict,count);
        yy.cumulate(real*real,count);
        xy.cumulate(predict*real,count);
    }

    // Pearson's product moment correlation coefficient
//    cor = (xy.mean() - (x.mean()*y.mean()))/
//        (sqrt(xx.mean()-(x.mean()*x.mean())) *
//         sqrt(yy.mean()-(y.mean()*y.mean())));
    // Because when the variation in X is very small we can
    // go negative, and thus cause the sqrt's to give an FPE
    double v1 = xx.mean()-(x.mean()*x.mean());
    double v2 = yy.mean()-(y.mean()*y.mean());

    double v3 = v1*v2;

    if (v3 <= 0)
        // happens when there's very little variation in x
        cor = 0;
    else
        cor = (xy.mean() - (x.mean()*y.mean()))/sqrt(v3);

    if (output != NULL)
    {
        if (output != &cout)  // save in output file
            *output
                << ";; RMSE " << ftoString(sqrt(se.mean()),4,1)
                << " Correlation is " << ftoString(cor,4,1)
                << " Mean (abs) Error " << ftoString(e.mean(),4,1)
                << " (" << ftoString(e.stddev(),4,1) << ")" << endl;

        cout << "RMSE " << ftoString(sqrt(se.mean()),4,1)
             << " Correlation is " << ftoString(cor,4,1)
             << " Mean (abs) Error " << ftoString(e.mean(),4,1)
             << " (" << ftoString(e.stddev(),4,1) << ")" << endl;
    }

    if (wgn_opt_param == "rmse")
        return -sqrt(se.mean());  // negated so bigger is better
    else
        return cor;  // should really be % variance, I think
}

static int wagon_split(int margin, WNode &node)
{
    // Split the given node (if possible)
    WQuestion q;
    WNode *l,*r;

    node.set_impurity(WImpurity(node.get_data()));
    q = find_best_question(node.get_data());

/*    printf("q.score() %f impurity %f\n",
           q.get_score(),
           node.get_impurity().measure()); */

    double impurity_measure = node.get_impurity().measure();
    double question_score = q.get_score();

    if ((question_score < WGN_HUGE_VAL) &&
        (question_score < impurity_measure))
    {
        // OK, it's worth a split
        l = new WNode();
        r = new WNode();
        wgn_find_split(q,node.get_data(),l->get_data(),r->get_data());
        node.set_subnodes(l,r);
        node.set_question(q);
        if (wgn_verbose)
        {
            int i;
            for (i=0; i < margin; i++)
                cout << " ";
            cout << q << endl;
        }
        margin++;
        wagon_split(margin,*l);
        margin++;
        wagon_split(margin,*r);
        margin--;
        return TRUE;
    }
    else
    {
        if (wgn_verbose)
        {
            int i;
            for (i=0; i < margin; i++)
                cout << " ";
            cout << "stopped samples: " << node.samples() << " impurity: "
                 << node.get_impurity() << endl;
        }
        margin--;
        return FALSE;
    }
}

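/* Worked example of the acceptance test above (hypothetical numbers):
   with node impurity 0.80, a best question scoring 0.65 is accepted
   and the node is split; a score of 0.85 (or WGN_HUGE_VAL when no
   valid question exists) makes the node a leaf and returns FALSE. */
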
void wgn_find_split(WQuestion &q,WVectorVector &ds,
                    WVectorVector &y,WVectorVector &n)
{
    int i, iy, in;

    y.resize(q.get_yes());
    n.resize(q.get_no());

    for (iy=in=i=0; i < ds.n(); i++)
        if (q.ask(*ds(i)) == TRUE)
            y[iy++] = ds(i);
        else
            n[in++] = ds(i);

}

static WQuestion find_best_question(WVectorVector &dset)
{
    // Ask all possible questions and find the best one
    int i;
    float bscore,tscore;
    WQuestion test_ques, best_ques;

    bscore = tscore = WGN_HUGE_VAL;
    best_ques.set_score(bscore);
    // test each feature with each possible question
    for (i=0; i < wgn_dataset.width(); i++)
    {
        if ((wgn_dataset.ignore(i) == TRUE) ||
            (i == wgn_predictee))
            tscore = WGN_HUGE_VAL;  // ignore this feature this time
        else if (wgn_dataset.ftype(i) == wndt_binary)
        {
            construct_binary_ques(i,test_ques);
            tscore = wgn_score_question(test_ques,dset);
        }
        else if (wgn_dataset.ftype(i) == wndt_float)
        {
            tscore = construct_float_ques(i,test_ques,dset);
        }
        else if (wgn_dataset.ftype(i) == wndt_ignore)
            tscore = WGN_HUGE_VAL;  // always ignore this feature
#if 0
        // This doesn't work reasonably
        else if (wgn_csubset && (wgn_dataset.ftype(i) >= wndt_class))
        {
            wagon_error("subset selection temporarily deleted");
            tscore = construct_class_ques_subset(i,test_ques,dset);
        }
#endif
        else if (wgn_dataset.ftype(i) >= wndt_class)
            tscore = construct_class_ques(i,test_ques,dset);
        if (tscore < bscore)
        {
            best_ques = test_ques;
            best_ques.set_score(tscore);
            bscore = tscore;
        }
    }

    return best_ques;
}

static float construct_class_ques(int feat,WQuestion &ques,WVectorVector &ds)
{
    // Find out which member of a class gives the best split
    float tscore,bscore = WGN_HUGE_VAL;
    int cl;
    WQuestion test_q;

    test_q.set_fp(feat);
    test_q.set_oper(wnop_is);
    ques = test_q;

    for (cl=0; cl < wgn_discretes[wgn_dataset.ftype(feat)].length(); cl++)
    {
        test_q.set_operand1(EST_Val(cl));
        tscore = wgn_score_question(test_q,ds);
        if (tscore < bscore)
        {
            ques = test_q;
            bscore = tscore;
        }
    }

    return bscore;
}

#if 0
static float construct_class_ques_subset(int feat,WQuestion &ques,
                                         WVectorVector &ds)
{
    // Find out which subset of a class gives the best split.
    // We first measure the split of the data for each member of
    // the class, then order those splits, then go through finding
    // where the best split of that ordered list is.  This is described
    // on page 247 of Breiman et al.
    float tscore,bscore = WGN_HUGE_VAL;
    LISP l;
    int cl;

    ques.set_fp(feat);
    ques.set_oper(wnop_is);
    float *scores = new float[wgn_discretes[wgn_dataset.ftype(feat)].length()];

    // Only do it for existing values
    for (cl=0; cl < wgn_discretes[wgn_dataset.ftype(feat)].length(); cl++)
    {
        ques.set_operand(flocons(cl));
        scores[cl] = wgn_score_question(ques,ds);
    }

    LISP order = sort_class_scores(feat,scores);
    if (order == NIL)
        return WGN_HUGE_VAL;
    if (siod_llength(order) == 1)
    {   // Only one, so we know the best "split"
        ques.set_oper(wnop_is);
        ques.set_operand(car(order));
        return scores[get_c_int(car(order))];
    }

    ques.set_oper(wnop_in);
    LISP best_l = NIL;
    for (l=cdr(order); CDR(l) != NIL; l = cdr(l))
    {
        ques.set_operand(l);
        tscore = wgn_score_question(ques,ds);
        if (tscore < bscore)
        {
            best_l = l;
            bscore = tscore;
        }

    }

    if (best_l != NIL)
    {
        if (siod_llength(best_l) == 1)
        {
            ques.set_oper(wnop_is);
            ques.set_operand(car(best_l));
        }
        else if (equal(cdr(order),best_l) != NIL)
        {
            ques.set_oper(wnop_is);
            ques.set_operand(car(order));
        }
        else
        {
            cout << "Found a good subset" << endl;
            ques.set_operand(best_l);
        }
    }
    return bscore;
}

static LISP sort_class_scores(int feat,float *scores)
{
    // returns a sorted list of (non WGN_HUGE_VAL) items
    int i;
    LISP items = NIL;
    LISP l;

    for (i=0; i < wgn_discretes[wgn_dataset.ftype(feat)].length(); i++)
    {
        if (scores[i] != WGN_HUGE_VAL)
        {
            if (items == NIL)
                items = cons(flocons(i),NIL);
            else
            {
                for (l=items; l != NIL; l=cdr(l))
                {
                    if (scores[i] < scores[get_c_int(car(l))])
                    {
                        CDR(l) = cons(car(l),cdr(l));
                        CAR(l) = flocons(i);
                        break;
                    }
                }
                if (l == NIL)
                    items = l_append(items,cons(flocons(i),NIL));
            }
        }
    }
    return items;
}
#endif

static float construct_float_ques(int feat,WQuestion &ques,WVectorVector &ds)
{
    // Find a split of the range that gives the best score
    // Naively does this by partitioning the range into
    // wgn_float_range_split slots
    float tscore,bscore = WGN_HUGE_VAL;
    int d, i;
    float p;
    WQuestion test_q;
    float max,min,val,incr;

    test_q.set_fp(feat);
    test_q.set_oper(wnop_lessthan);
    ques = test_q;

    min = max = ds(0)->get_flt_val(feat);  /* set up some initial value */
    for (d=0; d < ds.n(); d++)
    {
        val = ds(d)->get_flt_val(feat);
        if (val < min)
            min = val;
        else if (val > max)
            max = val;
    }
    if (max == min)  // we're pure
        return WGN_HUGE_VAL;
    incr = (max-min)/wgn_float_range_split;
    // so do float_range-1 splits
    /* We calculate this based on the number of splits, not the */
    /* increments, because incr can be so small it doesn't increment p */
    for (i=0,p=min+incr; i < wgn_float_range_split; i++,p += incr)
    {
        test_q.set_operand1(EST_Val(p));
        tscore = wgn_score_question(test_q,ds);
        if (tscore < bscore)
        {
            ques = test_q;
            bscore = tscore;
        }
    }

    return bscore;
}

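/* Worked example (hypothetical values): if feat ranges over [0.0,10.0]
   and wgn_float_range_split = 10, then incr = 1.0 and the candidate
   questions are "feat < 1.0", "feat < 2.0", ..., up to "feat < 10.0";
   the threshold giving the lowest combined impurity is returned in
   ques. */
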
static void construct_binary_ques(int feat,WQuestion &test_ques)
{
    // Construct a question.  Not sure about this in general: of course
    // continuous/categorical features will require a different rule,
    // and non-binary ones will require some test point

    test_ques.set_fp(feat);
    test_ques.set_oper(wnop_binary);
    test_ques.set_operand1(EST_Val(""));
}

static float score_question_set(WQuestion &q, WVectorVector &ds, int ignorenth)
{
    // Score this question as a possible split by finding
    // the sum of the impurities when ds is split with this question
    WImpurity y,n;
    int d, num_yes, num_no;
    float count;
    WVector *wv;

    num_yes = num_no = 0;
    y.data = &ds;
    n.data = &ds;
    for (d=0; d < ds.n(); d++)
    {
        if ((ignorenth < 2) ||
            (d%ignorenth != ignorenth-1))
        {
            wv = ds(d);
            if (wgn_count_field == -1)
                count = 1.0;
            else
                count = (*wv)[wgn_count_field];

            if (q.ask(*wv) == TRUE)
            {
                num_yes++;
                if (wgn_dataset.ftype(wgn_predictee) == wndt_ols)
                    y.cumulate(d,count);  // note the sample number, not value
                else
                    y.cumulate((*wv)[wgn_predictee],count);
            }
            else
            {
                num_no++;
                if (wgn_dataset.ftype(wgn_predictee) == wndt_ols)
                    n.cumulate(d,count);  // note the sample number, not value
                else
                    n.cumulate((*wv)[wgn_predictee],count);
            }
        }
    }

    q.set_yes(num_yes);
    q.set_no(num_no);

    int min_cluster;

    if ((wgn_balance == 0.0) ||
        (ds.n()/wgn_balance < wgn_min_cluster_size))
        min_cluster = wgn_min_cluster_size;
    else
        min_cluster = (int)(ds.n()/wgn_balance);

    if ((y.samples() < min_cluster) ||
        (n.samples() < min_cluster))
        return WGN_HUGE_VAL;

    float ym,nm,bm;
    // printf("awb_debug score_question_set X %f Y %f\n",
    //        y.samples(), n.samples());
    ym = y.measure();
    nm = n.measure();
    bm = ym + nm;

    /* cout << q << endl;
       printf("test question y %f n %f b %f\n",
              ym, nm, bm); */

    return bm/2.0;
}

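/* Worked example of the min_cluster logic above (hypothetical
   numbers): with wgn_min_cluster_size = 50, wgn_balance = 10 and
   ds.n() = 1000, each side of a split must hold at least
   1000/10 = 100 samples; with ds.n() = 300, 300/10 = 30 is below 50,
   so the wgn_min_cluster_size floor of 50 applies instead. */
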
float wgn_score_question(WQuestion &q, WVectorVector &ds)
{
    // This level of indirection was introduced for later expansion

    return score_question_set(q,ds,1);
}

WNode *wagon_stepwise(float limit)
{
    // Find the best single feature, then incrementally add the features
    // that most improve the result, until no addition improves it.
    // This basically automates what Kurt was doing when building
    // trees by hand; he then automated it in Perl, and as it seemed
    // to work I put it into wagon itself.
    // This can be pretty computationally intensive.
    WNode *best = 0,*new_best = 0;
    float bscore,best_score = -WGN_HUGE_VAL;
    int best_feat,i;
    int nf = 1;

    // Set all features to ignore
    for (i=0; i < wgn_dataset.width(); i++)
        wgn_dataset.set_ignore(i,TRUE);

    for (i=0; i < wgn_dataset.width(); i++)
    {
        if ((wgn_dataset.ftype(i) == wndt_ignore) || (i == wgn_predictee))
        {
            // Skip this round: not because this particular feature is
            // (user specified) ignored, but because it means there is
            // one less cycle that is necessary
            continue;
        }
        new_best = wagon_stepwise_find_next_best(bscore,best_feat);

        if ((bscore - fabs(bscore * (limit/100))) <= best_score)
        {
            // gone as far as we can
            delete new_best;
            break;
        }
        else
        {
            best_score = bscore;
            delete best;
            best = new_best;
            wgn_dataset.set_ignore(best_feat,FALSE);
            if (!wgn_quiet)
            {
                fprintf(stdout,"FEATURE %d %s: %2.4f\n",
                        nf,
                        (const char *)wgn_dataset.feat_name(best_feat),
                        best_score);
                fflush(stdout);
                nf++;
            }
        }
    }

    return best;
}

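/* Worked example of the stopping test above (hypothetical numbers):
   with limit = 5 (percent), current best_score = 0.700 and a new
   bscore = 0.720, the test computes 0.720 - |0.720*0.05| = 0.684,
   which is <= 0.700, so the roughly 3% improvement is considered too
   small and the stepwise loop stops. */
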
static WNode *wagon_stepwise_find_next_best(float &bscore,int &best_feat)
{
    // Find which of the currently ignored features will best improve
    // the result
    WNode *best = 0;
    float best_score = -WGN_HUGE_VAL;
    int best_new_feat = -1;
    int i;

    for (i=0; i < wgn_dataset.width(); i++)
    {
        if (wgn_dataset.ftype(i) == wndt_ignore)
            continue;  // user wants me to ignore this completely
        else if (i == wgn_predictee)  // can't use the answer
            continue;
        else if (wgn_dataset.ignore(i) == TRUE)
        {
            WNode *current;
            float score;

            // Allow this feature to participate
            wgn_dataset.set_ignore(i,FALSE);

            current = wgn_build_tree(score);

            if (score > best_score)
            {
                best_score = score;
                delete best;
                best = current;
                best_new_feat = i;
//                fprintf(stdout,"BETTER FEATURE %d %s: %2.4f\n",
//                        i,
//                        (const char *)wgn_dataset.feat_name(i),
//                        best_score);
//                fflush(stdout);
            }
            else
                delete current;

            // switch it off again
            wgn_dataset.set_ignore(i,TRUE);
        }
    }

    bscore = best_score;
    best_feat = best_new_feat;
    return best;
}