Edinburgh Speech Tools  2.4-release
 All Classes Functions Variables Typedefs Enumerations Enumerator Friends Pages
wagon_main.cc
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1996-2006 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Alan W Black */
34 /* Date : May 1996 */
35 /*-----------------------------------------------------------------------*/
36 /* A Classification and Regression Tree (CART) Program */
37 /* A basic implementation of many of the techniques in */
38 /* Briemen et al. 1984 */
39 /* */
40 /* Added decision list support, Feb 1997 */
41 /* */
42 /* Added vector support for Clustergen 2005/2006 */
43 /* */
44 /*=======================================================================*/
45 #include <cstdlib>
46 #include <iostream>
47 #include <fstream>
48 #include <cstring>
49 #include "EST_Wagon.h"
50 #include "EST_cmd_line.h"
51 
52 enum wn_strategy_type {wn_decision_list, wn_decision_tree};
53 
54 static wn_strategy_type wagon_type = wn_decision_tree;
55 
56 static int wagon_main(int argc, char **argv);
57 
58 /** @name <command>wagon</command> <emphasis>CART building program</emphasis>
59  @id wagon_manual
60  * @toc
61  */
62 
63 //@{
64 
65 
66 /**@name Synopsis
67  */
68 //@{
69 
70 //@synopsis
71 
72 /**
73 wagon is used to build CART tress from feature data, its basic
74 features include:
75 
76 <itemizedlist>
77 <listitem><para>both decisions trees and decision lists are supported</para></listitem>
78 <listitem><para>predictees can be discrete or continuous</para></listitem>
79 <listitem><para>input features may be discrete or continuous</para></listitem>
80 <listitem><para>many options for controlling tree building</para>
81 <itemizedlist>
82 <listitem><para>fixed stop value</para></listitem>
83 <listitem><para>balancing</para></listitem>
84 <listitem><para>held-out data and pruning</para></listitem>
85 <listitem><para>stepwise use of input features</para></listitem>
86 <listitem><para>choice of optimization criteria correct/entropy (for
87 classification and rmse/correlation (for regression)</para></listitem>
88 </itemizedlist>
89 </listitem>
90 </itemizedlist>
91 
92 A detailed description of building CART models can be found in the
93 <link linkend="cart-overview">CART model overview</link> section.
94 
95 */
96 
97 //@}
98 
99 /**@name OPTIONS
100  */
101 //@{
102 
103 //@options
104 
105 //@}
106 
107 int main(int argc, char **argv)
108 {
109 
110  wagon_main(argc,argv);
111 
112  exit(0);
113  return 0;
114 }
115 
116 static int set_Vertex_Feats(EST_Track &wgn_VertexFeats,
117  EST_String &wagon_track_features)
118 {
119  int i,s=0,e;
120  EST_TokenStream ts;
121 
122  for (i=0; i<wgn_VertexFeats.num_channels(); i++)
123  wgn_VertexFeats.a(0,i) = 0.0;
124 
125  ts.open_string(wagon_track_features);
126  ts.set_WhiteSpaceChars(",- ");
127  ts.set_PunctuationSymbols("");
129  ts.set_SingleCharSymbols("");
130 
131  while (!ts.eof())
132  {
133  EST_Token &token = ts.get();
134  const EST_String ws = (const char *)token.whitespace();
135  if (token == "all")
136  {
137  for (i=0; i<wgn_VertexFeats.num_channels(); i++)
138  wgn_VertexFeats.a(0,i) = 1.0;
139  break;
140  } else if ((ws == ",") || (ws == ""))
141  {
142  s = atoi(token.string());
143  wgn_VertexFeats.a(0,s) = 1.0;
144  } else if (ws == "-")
145  {
146  if (token == "")
147  e = wgn_VertexFeats.num_channels()-1;
148  else
149  e = atoi(token.string());
150  for (i=s; i<=e && i<wgn_VertexFeats.num_channels(); i++)
151  wgn_VertexFeats.a(0,i) = 1.0;
152  } else
153  {
154  printf("wagon: track_feats invalid: %s at position %d\n",
155  (const char *)wagon_track_features,
156  ts.filepos());
157  exit(-1);
158  }
159  }
160 
161  return 0;
162 }
163 
164 static int wagon_main(int argc, char **argv)
165 {
166  // Top level function sets up data and creates a tree
167  EST_Option al;
168  EST_StrList files;
169  EST_String wgn_oname;
170  ostream *wgn_coutput = 0;
171  float stepwise_limit = 0;
172  int feats_start=0, feats_end=0;
173  int i;
174 
175  parse_command_line
176  (argc, argv,
177  EST_String("[options]\n") +
178  "Summary: CART building program\n"+
179  "-desc <ifile> Field description file\n"+
180  "-data <ifile> Datafile, one vector per line\n"+
181  "-stop <int> {50} Minimum number of examples for leaf nodes\n"+
182  "-test <ifile> Datafile to test tree on\n"+
183  "-frs <float> {10} Float range split, number of partitions to\n"+
184  " split a float feature range into\n"+
185  "-dlist Build a decision list (rather than tree)\n"+
186  "-dtree Build a decision tree (rather than list) default\n"+
187  "-output <ofile> \n"+
188  "-o <ofile> File to save output tree in\n"+
189  "-distmatrix <ifile>\n"+
190  " A distance matrix for clustering\n"+
191  "-track <ifile>\n"+
192  " track for vertex indices\n"+
193  "-track_start <int>\n"+
194  " start channel vertex indices\n"+
195  "-track_end <int>\n"+
196  " end (inclusive) channel for vertex indices\n"+
197  "-track_feats <string>\n"+
198  " Track features to use, comma separated list\n"+
199  " with feature numbers and/or ranges, 0 start\n"+
200  "-unittrack <ifile>\n"+
201  " track for unit start and length in vertex track\n"+
202  "-quiet No questions printed during building\n"+
203  "-verbose Lost of information printing during build\n"+
204  "-predictee <string>\n"+
205  " name of field to predict (default is first field)\n"+
206  "-ignore <string>\n"+
207  " Filename or bracket list of fields to ignore\n"+
208  "-count_field <string>\n"+
209  " Name of field containing count weight for samples\n"+
210  "-stepwise Incrementally find best features\n"+
211  "-swlimit <float> {0.0}\n"+
212  " Percentage necessary improvement for stepwise,\n"+
213  " may be negative.\n"+
214  "-swopt <string> Parameter to optimize for stepwise, for \n"+
215  " classification options are correct or entropy\n"+
216  " for regression options are rmse or correlation\n"+
217  " correct and correlation are the defaults\n"+
218  "-balance <float> For derived stop size, if dataset at node, divided\n"+
219  " by balance is greater than stop it is used as stop\n"+
220  " if balance is 0 (default) always use stop as is.\n"+
221  "-vertex_output <string> Output <mean> or <best> of cluster\n"+
222  "-held_out <int> Percent to hold out for pruning\n"+
223  "-heap <int> {210000}\n"+
224  " Set size of Lisp heap, should not normally need\n"+
225  " to be changed from its default, only with *very*\n"+
226  " large description files (> 1M)\n"+
227  "-noprune No (same class) pruning required\n",
228  files, al);
229 
230  if (al.present("-held_out"))
231  wgn_held_out = al.ival("-held_out");
232  if (al.present("-balance"))
233  wgn_balance = al.fval("-balance");
234  if ((!al.present("-desc")) || ((!al.present("-data"))))
235  {
236  cerr << argv[0] << ": missing description and/or datafile" << endl;
237  cerr << "use -h for description of arguments" << endl;
238  }
239 
240  if (al.present("-quiet"))
241  wgn_quiet = TRUE;
242  if (al.present("-verbose"))
243  wgn_verbose = TRUE;
244 
245  if (al.present("-stop"))
246  wgn_min_cluster_size = atoi(al.val("-stop"));
247  if (al.present("-noprune"))
248  wgn_prune = FALSE;
249  if (al.present("-predictee"))
250  wgn_predictee_name = al.val("-predictee");
251  if (al.present("-count_field"))
252  wgn_count_field_name = al.val("-count_field");
253  if (al.present("-swlimit"))
254  stepwise_limit = al.fval("-swlimit");
255  if (al.present("-frs")) // number of partitions to try in floats
256  wgn_float_range_split = atof(al.val("-frs"));
257  if (al.present("-swopt"))
258  wgn_opt_param = al.val("-swopt");
259  if (al.present("-vertex_output"))
260  wgn_vertex_output = al.val("-vertex_output");
261  if (al.present("-output") || al.present("-o"))
262  {
263  if (al.present("-o"))
264  wgn_oname = al.val("-o");
265  else
266  wgn_oname = al.val("-output");
267  wgn_coutput = new ofstream(wgn_oname);
268  if (!(*wgn_coutput))
269  {
270  cerr << "Wagon: can't open file \"" << wgn_oname <<
271  "\" for output " << endl;
272  exit(-1);
273  }
274  }
275  else
276  wgn_coutput = &cout;
277  if (al.present("-distmatrix"))
278  {
279  if (wgn_DistMatrix.load(al.val("-distmatrix")) != 0)
280  {
281  cerr << "Wagon: failed to load Distance Matrix from \"" <<
282  al.val("-distmatrix") << "\"\n" << endl;
283  exit(-1);
284  }
285  }
286  if (al.present("-dlist"))
287  wagon_type = wn_decision_list;
288 
289  WNode *tree;
290  float score;
291  LISP ignores = NIL;
292 
293  siod_init(al.ival("-heap"));
294 
295  if (al.present("-ignore"))
296  {
297  EST_String ig = al.val("-ignore");
298  if (ig[0] == '(')
299  ignores = read_from_string(ig);
300  else
301  ignores = vload(ig,1);
302  }
303  // Load in the data
304  wgn_load_datadescription(al.val("-desc"),ignores);
305  wgn_load_dataset(wgn_dataset,al.val("-data"));
306  if (al.present("-distmatrix") &&
307  (wgn_DistMatrix.num_rows() < wgn_dataset.length()))
308  {
309  cerr << "wagon: distance matrix is smaller than number of training elements\n";
310  exit(-1);
311  }
312  else if (al.present("-track"))
313  {
314  wgn_VertexTrack.load(al.val("-track"));
315  wgn_VertexFeats.resize(1,wgn_VertexTrack.num_channels());
316  for (i=0; i<wgn_VertexFeats.num_channels(); i++)
317  wgn_VertexFeats.a(0,i) = 1.0;
318  }
319 
320  if (al.present("-track_start"))
321  {
322  feats_start = al.ival("-track_start");
323  if ((feats_start < 0) ||
324  (feats_start > wgn_VertexTrack.num_channels()))
325  {
326  printf("wagon: track_start invalid: %d out of %d channels\n",
327  feats_start,
328  wgn_VertexTrack.num_channels());
329  exit(-1);
330  }
331  for (i=0; i<feats_start; i++)
332  wgn_VertexFeats.a(0,i) = 0.0; /* don't do feats up to start */
333 
334  }
335 
336  if (al.present("-track_end"))
337  {
338  feats_end = al.ival("-track_end");
339  if ((feats_end < feats_start) ||
340  (feats_end > wgn_VertexTrack.num_channels()))
341  {
342  printf("wagon: track_end invalid: %d between start %d out of %d channels\n",
343  feats_end,
344  feats_start,
345  wgn_VertexTrack.num_channels());
346  exit(-1);
347  }
348  for (i=feats_end+1; i<wgn_VertexTrack.num_channels(); i++)
349  wgn_VertexFeats.a(0,i) = 0.0; /* don't do feats after end */
350  }
351  if (al.present("-track_feats"))
352  { /* overrides start and end numbers */
353  EST_String wagon_track_features = al.val("-track_feats");
354  set_Vertex_Feats(wgn_VertexFeats,wagon_track_features);
355  }
356 
357  // printf("Track feats\n");
358  // for (i=0; i<wgn_VertexTrack.num_channels(); i++)
359  // if (wgn_VertexFeats.a(0,i) > 0.0)
360  // printf("%d ",i);
361  // printf("\n");
362 
363  if (al.present("-unittrack"))
364  { /* contains two features, a start and length. start indexes */
365  /* into VertexTrack to the first vector in the segment */
366  wgn_UnitTrack.load(al.val("-unittrack"));
367  }
368 
369  if (al.present("-test"))
370  wgn_load_dataset(wgn_test_dataset,al.val("-test"));
371 
372  // Build and test the model
373  if (al.present("-stepwise"))
374  tree = wagon_stepwise(stepwise_limit);
375  else if (wagon_type == wn_decision_tree)
376  tree = wgn_build_tree(score); // default operation
377  else if (wagon_type == wn_decision_list)
378  // dlist is printed with build_dlist rather than returned
379  tree = wgn_build_dlist(score,wgn_coutput);
380  else
381  {
382  cerr << "Wagon: unknown operation, not tree or list" << endl;
383  exit(-1);
384  }
385 
386  if (tree != 0)
387  {
388  *wgn_coutput << *tree;
389  summary_results(*tree,wgn_coutput);
390  }
391 
392  if (wgn_coutput != &cout)
393  delete wgn_coutput;
394  return 0;
395 }
396 
397 /** @name Building Trees
398 
399 To build a decision tree (or list) Wagon requires data and a description
400 of it. A data file consists a set of samples, one per line each
401 consisting of the same set of features. Features may be categorial
402 or continuous. By default the first feature is the predictee and the
403 others are used as predictors. A typical data file will look like
404 this
405 </para>
406 <para>
407 <screen>
408 0.399 pau sh 0 0 0 1 1 0 0 0 0 0 0
409 0.082 sh iy pau onset 0 1 0 0 1 1 0 0 1
410 0.074 iy hh sh coda 1 0 1 0 1 1 0 0 1
411 0.048 hh ae iy onset 0 1 0 1 1 1 0 1 1
412 0.062 ae d hh coda 1 0 0 1 1 1 0 1 1
413 0.020 d y ae coda 2 0 1 1 1 1 0 1 1
414 0.082 y ax d onset 0 1 0 1 1 1 1 1 1
415 0.082 ax r y coda 1 0 0 1 1 1 1 1 1
416 0.036 r d ax coda 2 0 1 1 1 1 1 1 1
417 ...
418 </screen>
419 </para>
420 <para>
421 The data may come from any source, such as the festival script
422 dumpfeats which allows the creation of such files easily from utterance
423 files.
424 </para><para>
425 In addition to a data file a description file is also require that
426 gives a name and a type to each of the features in the datafile.
427 For the above example it would look like
428 </para><para>
429 <screen>
430 ((segment_duration float)
431  ( name aa ae ah ao aw ax ay b ch d dh dx eh el em en er ey f g
432  hh ih iy jh k l m n nx ng ow oy p r s sh t th uh uw v w y z zh pau )
433  ( n.name 0 aa ae ah ao aw ax ay b ch d dh dx eh el em en er ey f g
434  hh ih iy jh k l m n nx ng ow oy p r s sh t th uh uw v w y z zh pau )
435  ( p.name 0 aa ae ah ao aw ax ay b ch d dh dx eh el em en er ey f g
436  hh ih iy jh k l m n nx ng ow oy p r s sh t th uh uw v w y z zh pau )
437  (position_type 0 onset coda)
438  (pos_in_syl float)
439  (syl_initial 0 1)
440  (syl_final 0 1)
441  (R:Sylstructure.parent.R:Syllable.p.syl_break float)
442  (R:Sylstructure.parent.syl_break float)
443  (R:Sylstructure.parent.R:Syllable.n.syl_break float)
444  (R:Sylstructure.parent.R:Syllable.p.stress 0 1)
445  (R:Sylstructure.parent.stress 0 1)
446  (R:Sylstructure.parent.R:Syllable.n.stress 0 1)
447 )
448 </screen>
449 </para><para>
450 The feature names are arbitrary, but as they appear in the generated
451 trees is most useful if the trees are to be used in prediction of
452 an utterance that the names are features and/or pathnames.
453 </para><para>
454 Wagon can be used to build a tree with such files with the command
455 <screen>
456 wagon -data feats.data -desc fest.desc -stop 10 -output feats.tree
457 </screen>
458 A test data set may also be given which must match the given data description.
459 If specified the built tree will be tested on the test set and results
460 on that will be presented on completion, without a test set the
461 results are given with respect to the training data. However in
462 stepwise case the test set is used in the multi-level training process
463 thus it cannot be considered as true test data and more reasonable
464 results should found on applying the generate tree to truly
465 held out data (via the program wagon_test).
466 
467 */
468 
469 //@}