Edinburgh Speech Tools  2.4-release
 All Classes Functions Variables Typedefs Enumerations Enumerator Friends Pages
wagon_test_main.cc
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1996,1997 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Alan W Black */
34 /* Date : October 1997 */
35 /*-----------------------------------------------------------------------*/
36 /* A program for testing a CART tree against data, also may be used to */
37 /* predict values using a tree and data */
38 /* */
39 /*=======================================================================*/
40 #include <cstdlib>
41 #include <iostream>
42 #include <fstream>
43 #include <cstring>
44 #include "EST_Wagon.h"
45 #include "EST_cutils.h"
46 #include "EST_multistats.h"
47 #include "EST_Token.h"
48 #include "EST_cmd_line.h"
49 
50 static int wagon_test_main(int argc, char **argv);
51 static LISP find_feature_value(const char *feature,
52  LISP vector, LISP description);
53 static LISP wagon_vector_predict(LISP tree, LISP vector, LISP description);
54 static LISP get_data_vector(EST_TokenStream &data, LISP description);
55 static void simple_predict(EST_TokenStream &data, FILE *output,
56  LISP tree, LISP description, int all_info);
57 static void test_tree_class(EST_TokenStream &data, FILE *output,
58  LISP tree, LISP description);
59 static void test_tree_float(EST_TokenStream &data, FILE *output,
60  LISP tree, LISP description);
61 
62 /** @name <command>wagon_test</command> <emphasis>Test CART models</emphasis>
63  @id wagon-test-manual
64  * @toc
65  */
66 
67 //@{
68 
69 
70 /**@name Synopsis
71  */
72 //@{
73 
74 //@synopsis
75 
76 /**
77 Wagon_test is used to test CART models on feature data.
78 
79 A detailed description of the CART model can be found in the
80 <link linkend="cart-overview">CART model overview</link> section.
81 */
82 
83 int main(int argc, char **argv)
84 {
85 
86  wagon_test_main(argc,argv);
87 
88  exit(0);
89  return 0;
90 }
91 
92 static int wagon_test_main(int argc, char **argv)
93 {
94  // Top level function sets up data and creates a tree
95  EST_Option al;
96  EST_StrList files;
97  LISP description,tree=NIL;;
98  EST_TokenStream data;
99  FILE *wgn_output;
100 
101  parse_command_line
102  (argc, argv,
103  EST_String("<options>\n")+
104  "Summary: program to test CART models on data\n"+
105  "-desc <ifile> Field description file\n"+
106  "-data <ifile> Datafile, one vector per line\n"+
107  "-tree <ifile> File containing CART tree\n"+
108  "-track <ifile>\n"+
109  " track for vertex indices\n"+
110  "-predict Predict for each vector returning full vector\n"+
111  "-predict_val Predict for each vector returning just value\n"+
112  "-predictee <string>\n"+
113  " name of field to predict (default is first field)\n"+
114  "-heap <int> {210000}\n"+
115  " Set size of Lisp heap, should not normally need\n"+
116  " to be changed from its default\n"+
117  "-o <ofile> File to save output in\n",
118  files, al);
119 
120  siod_init(al.ival("-heap"));
121 
122  if (al.present("-desc"))
123  {
124  gc_protect(&description);
125  description = car(vload(al.val("-desc"),1));
126  }
127  else
128  {
129  cerr << argv[0] << ": no description file specified" << endl;
130  exit(-1);
131  }
132 
133  if (al.present("-tree"))
134  {
135  gc_protect(&tree);
136  tree = car(vload(al.val("-tree"),1));
137  if (tree == NIL)
138  {
139  cerr << argv[0] << ": no tree found in \"" << al.val("-tree")
140  << "\"" << endl;
141  exit(-1);
142  }
143  }
144  else
145  {
146  cerr << argv[0] << ": no tree file specified" << endl;
147  exit(-1);
148  }
149 
150  if (al.present("-data"))
151  {
152  if (data.open(al.val("-data")) != 0)
153  {
154  cerr << argv[0] << ": can't open data file \"" <<
155  al.val("-data") << "\" for input." << endl;
156  exit(-1);
157  }
158  }
159  else
160  {
161  cerr << argv[0] << ": no data file specified" << endl;
162  exit(-1);
163  }
164 
165  if (al.present("-track"))
166  {
167  wgn_VertexTrack.load(al.val("-track"));
168  }
169 
170  if (al.present("-o"))
171  {
172  if ((wgn_output = fopen(al.val("-o"),"w")) == NULL)
173  {
174  cerr << argv[0] << ": can't open output file \"" <<
175  al.val("-o") << "\"" << endl;
176  }
177  }
178  else
179  wgn_output = stdout;
180 
181  if (al.present("-predictee"))
182  {
183  LISP l;
184  int i;
185  wgn_predictee_name = al.val("-predictee");
186  for (l=description,i=0; l != NIL; l=cdr(l),i++)
187  if (streq(wgn_predictee_name,get_c_string(car(car(l)))))
188  {
189  wgn_predictee = i;
190  break;
191  }
192  if (l==NIL)
193  {
194  cerr << argv[0] << ": predictee \"" << wgn_predictee <<
195  "\" not in description\n";
196  }
197  }
198  const char *predict_type =
199  get_c_string(car(cdr(siod_nth(wgn_predictee,description))));
200 
201  if (al.present("-predict"))
202  simple_predict(data,wgn_output,tree,description,FALSE);
203  else if (al.present("-predict_val"))
204  simple_predict(data,wgn_output,tree,description,TRUE);
205  else if (streq(predict_type,"float") ||
206  streq(predict_type,"int"))
207  test_tree_float(data,wgn_output,tree,description);
208 #if 0
209  else if (streq(predict_type,"vector"))
210  test_tree_vector(data,wgn_output,tree,description);
211 #endif
212  else
213  test_tree_class(data,wgn_output,tree,description);
214 
215  if (wgn_output != stdout)
216  fclose(wgn_output);
217  data.close();
218  return 0;
219 }
220 
221 static LISP get_data_vector(EST_TokenStream &data, LISP description)
222 {
223  // read in one vector. Should be terminated with an newline
224  LISP v=NIL,d;
225 
226  if (data.eof())
227  return NIL;
228 
229  for (d=description; d != NIL; d=cdr(d))
230  {
231  EST_Token t = data.get();
232 
233  if ((d != description) && (t.whitespace().contains("\n")))
234  {
235  cerr << "wagon_test: unexpected newline within vector " <<
236  t.row() << " wrong number of features" << endl;
237  siod_error();
238  }
239  if (streq(get_c_string(car(cdr(car(d)))),"float") ||
240  streq(get_c_string(car(cdr(car(d)))),"int"))
241  v = cons(flocons(atof(t.string())),v);
242  else if ((streq(get_c_string(car(cdr(car(d)))),"_other_")) &&
243  (siod_member_str(t.string(),cdr(car(d))) == NIL))
244  v = cons(strintern("_other_"),v);
245  else
246  v = cons(strintern(t.string()),v);
247  }
248 
249  return reverse(v);
250 }
251 
252 static void simple_predict(EST_TokenStream &data, FILE *output,
253  LISP tree, LISP description, int all_info)
254 {
255  LISP vector,predict;
256  EST_String val;
257 
258  for (vector=get_data_vector(data,description);
259  vector != NIL; vector=get_data_vector(data,description))
260  {
261  predict = wagon_vector_predict(tree,vector,description);
262  if (all_info)
263  val = siod_sprint(car(reverse(predict)));
264  else
265  val = siod_sprint(predict);
266  fprintf(output,"%s\n",(const char *)val);
267  }
268 }
269 
270 static void test_tree_float(EST_TokenStream &data, FILE *output,
271  LISP tree, LISP description)
272 {
273  // Test tree against data to get summary of results FLOAT
274  float predict_val,real_val;
275  EST_SuffStats x,y,xx,yy,xy,se,e;
276  double cor,error;
277  LISP vector,predict;
278 
279  for (vector=get_data_vector(data,description);
280  vector != NIL; vector=get_data_vector(data,description))
281  {
282  predict = wagon_vector_predict(tree,vector,description);
283  predict_val = get_c_float(car(reverse(predict)));
284  real_val = get_c_float(siod_nth(wgn_predictee,vector));
285  x += predict_val;
286  y += real_val;
287  error = predict_val-real_val;
288  se += error*error;
289  e += fabs(error);
290  xx += predict_val*predict_val;
291  yy += real_val*real_val;
292  xy += predict_val*real_val;
293  }
294 
295  cor = (xy.mean() - (x.mean()*y.mean()))/
296  (sqrt(xx.mean()-(x.mean()*x.mean())) *
297  sqrt(yy.mean()-(y.mean()*y.mean())));
298 
299  fprintf(output,";; RMSE %1.4f Correlation is %1.4f Mean (abs) Error %1.4f (%1.4f)\n",
300  sqrt(se.mean()),
301  cor,
302  e.mean(),
303  e.stddev());
304 }
305 
306 static void test_tree_class(EST_TokenStream &data, FILE *output,
307  LISP tree, LISP description)
308 {
309  // Test tree against class data to get summary of results
310  EST_StrStr_KVL pairs;
311  EST_StrList lex;
312  EST_String predict_class,real_class;
313  LISP vector,w,predict;
314  double H=0,Q=0,prob;
315  (void)output;
316 
317  for (vector=get_data_vector(data,description);
318  vector != NIL; vector=get_data_vector(data,description))
319  {
320  predict = wagon_vector_predict(tree,vector,description);
321  predict_class = get_c_string(car(reverse(predict)));
322  real_class = get_c_string(siod_nth(wgn_predictee,vector));
323  prob = get_c_float(car(cdr(siod_assoc_str(real_class,
324  predict))));
325  if (prob == 0)
326  H += log(0.000001);
327  else
328  H += log(prob);
329  Q ++;
330  pairs.add_item(real_class,predict_class,1);
331  }
332  for (w=cdr(siod_nth(wgn_predictee,description)); w != NIL; w = cdr(w))
333  lex.append(get_c_string(car(w)));
334 
335  const EST_FMatrix &m = confusion(pairs,lex);
336  print_confusion(m,pairs,lex);
337  fprintf(stdout,";; entropy %g perplexity %g\n",
338  (-1*(H/Q)),pow(2.0,(-1*(H/Q))));
339 }
340 
341 #if 0
342 static void test_tree_vector(EST_TokenStream &data, FILE *output,
343  LISP tree, LISP description)
344 {
345  // Test tree against class data to get summary of results
346  // Note we are talking about predicting vectors (a *bunch* of
347  // numbers, not just a single class here)
348  EST_StrStr_KVL pairs;
349  EST_StrList lex;
350  EST_String predict_class,real_class;
351  LISP vector,w,predict;
352  double H=0,Q=0,prob;
353  (void)output;
354 
355  for (vector=get_data_vector(data,description);
356  vector != NIL; vector=get_data_vector(data,description))
357  {
358  predict = wagon_vector_predict(tree,vector,description);
359  predict_class = get_c_string(car(reverse(predict)));
360  real_class = get_c_string(siod_nth(wgn_predictee,vector));
361  prob = get_c_float(car(cdr(siod_assoc_str(real_class,
362  predict))));
363  if (prob == 0)
364  H += log(0.000001);
365  else
366  H += log(prob);
367  Q ++;
368  pairs.add_item(real_class,predict_class,1);
369  }
370  for (w=cdr(siod_nth(wgn_predictee,description)); w != NIL; w = cdr(w))
371  lex.append(get_c_string(car(w)));
372 
373  const EST_FMatrix &m = confusion(pairs,lex);
374  print_confusion(m,pairs,lex);
375  fprintf(stdout,";; entropy %g perplexity %g\n",
376  (-1*(H/Q)),pow(2.0,(-1*(H/Q))));
377 }
378 #endif
379 
380 
381 static LISP wagon_vector_predict(LISP tree, LISP vector, LISP description)
382 {
383  // Using the LISP tree, vector and description, do standard prediction
384 
385  if (cdr(tree) == NIL)
386  return car(tree);
387 
388  LISP value = find_feature_value(wgn_ques_feature(car(tree)),
389  vector, description);
390 
391  if (wagon_ask_question(car(tree),value))
392  // Yes answer
393  return wagon_vector_predict(car(cdr(tree)),vector,description);
394  else
395  // No answer
396  return wagon_vector_predict(car(cdr(cdr(tree))),vector,description);
397 }
398 
399 static LISP find_feature_value(const char *feature,
400  LISP vector, LISP description)
401 {
402  LISP v,d;
403 
404  for (v=vector,d=description; v != NIL; v=cdr(v),d=cdr(d))
405  if (streq(feature,get_c_string(car(car(d)))))
406  return car(v);
407 
408  cerr << "wagon_test: can't find feature \"" << feature <<
409  "\" in description" << endl;
410  siod_error();
411  return NIL;
412 
413 }
414 
415 /** @name Testing trees
416 <para>
417 Decision trees generated by wagon (or otherwise) can be applied
418 to and tested against data sets using this program. This program
419 requires a data set which is in the same format as wagon (and
420 other programs) requires. It also needs a dataset description
421 file naming the fields and given their type (see
422 <link linkend="wagon-manual">the wagon manual</link> for a description
423 for the actual format.
424 <screen>
425 wagon_test -data feats.data -desc feats.desc -tree feats.tree
426 </screen>
427 This will simply uses the tree against each sample in the data
428 file and compare the predicted value with the actual value and
429 produce a summary of the result. For categorial predictees a
430 percentage correct and confusion matrix is generated. For continuous
431 values the root mean squared error (RMSE) and correlation between the
432 predicted values and the actual values is given.
433 </para><para>
434 By default the predictee is the first field but may also be specified
435 on the command line. The dataset may contain features which are not
436 used by the tree.
437 </para><para>
438 This program can also be used to generate output values for sampled
439 data. In this case the sample data must still contain a "value" for
440 the predictee even if it is dummy. The option
441 <command>-predict</command> will output the new sample vector with
442 the predicted value in place, and the option
443 <command>-predict_val</command> option will just output the value.
444 </para><para>
445 This program is specifically designed for testing purposes although it
446 can also just be used for prediction. It is probably more efficient
447 to use the Lisp function <command>wagon</command> or underlying
448 C++ function <command>wagon_predict()</command>.
449 
450 */
451 
452 //@}