Edinburgh Speech Tools  2.4-release
 All Classes Functions Variables Typedefs Enumerations Enumerator Friends Pages
dlist.cc
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1996,1997 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Alan W Black */
34 /* Date : February 1997 */
35 /*-----------------------------------------------------------------------*/
36 /* Decision lists */
37 /* These are like a right branching decision tree, yes answers are not */
38 /* partitioned any further. These are used for the Yarowsky-style */
39 /* homograph disambiguators */
40 /* */
41 /* Yarowsky, D. "Homograph Disambiguation in Text-to_Speech Synthesis" */
42 /* in "Progress in Speech Synthesis" eds van Santen, J. et al. Springer */
43 /* pp 157-172, 1996 */
44 /* Rivest, R.L. "Learning decision lists", Machine Learning 2:229-246 */
45 /* 1987 */
46 /* */
47 /*=======================================================================*/
48 #include <iostream>
49 #include <cstring>
50 #include "EST_Wagon.h"
51 #include "EST_FMatrix.h"
52 #include "EST_multistats.h"
53 
54 static WDlist *wagon_decision_list();
55 static void do_dlist_summary(WDlist *dlist, WDataSet &dataset);
56 static WDlist *dlist_score_question(WQuestion &q, WVectorList &ds);
57 
58 WNode *wgn_build_dlist(float &score,ostream *output)
59 {
60  WDlist *dlist;
61 
62  dlist = wagon_decision_list();
63 
64  *output << *dlist;
65 
66  if (wgn_test_dataset.width() > 0) // load in test data
67  do_dlist_summary(dlist,wgn_test_dataset);
68  else
69  do_dlist_summary(dlist,wgn_dataset);
70 
71  score = 0;
72  return 0;
73 }
74 
75 static void do_dlist_summary(WDlist *dlist, WDataSet &dataset)
76 {
77  // Test dlist against data to get summary of results
78  EST_StrStr_KVL pairs;
79  EST_StrList lex;
80  EST_Litem *p;
81  EST_String predict,real;
82  int i,type;
83 
84  for (p=dataset.head(); p != 0; p=p->next())
85  {
86  predict = (EST_String)dlist->predict(*dataset(p));
87  type = dataset.ftype(0);
88  real = wgn_discretes[type].name(dataset(p)->get_int_val(0));
89  pairs.add_item(real,predict,1);
90  }
91  for (i=0; i<wgn_discretes[dataset.ftype(0)].length(); i++)
92  lex.append(wgn_discretes[dataset.ftype(0)].name(i));
93 
94  const EST_FMatrix &m = confusion(pairs,lex);
95  print_confusion(m,pairs,lex);
96 
97 }
98 
99 static WDlist *wagon_decision_list()
100 {
101  // For each possible question, measure it using
102  // yarowsky's loglikelihood ratio
103  // Abs(log(P(class_1|question_j)/P(class_2|question_j)))
104  // Output it for external sorting
105  // Hmm this implies binary classes
106  int i,cl;
107  WQuestion ques;
108  WDlist *dlist, *d;
109 
110  dlist=0;
111  for (i=1;i < wgn_dataset.width(); i++)
112  {
113  if (wgn_dataset.ftype(i) == wndt_ignore)
114  continue;
115  else if (wgn_dataset.ftype(i) >= wndt_class)
116  {
117  ques.set_fp(i);
118  ques.set_oper(wnop_is);
119  for (cl=0; cl < wgn_discretes[wgn_dataset.ftype(i)].length(); cl++)
120  {
121  ques.set_operand1(EST_Val(cl));
122  d = dlist_score_question(ques,wgn_dataset);
123  if (d != 0)
124  dlist = add_to_dlist(dlist,d);
125  }
126  }
127  }
128 
129  return dlist;
130 }
131 
132 static WDlist *dlist_score_question(WQuestion &q, WVectorList &ds)
133 {
134  // score this question for decision lists
135  // the sum of the impurities when ds is split with this question
136  WImpurity y;
137  EST_Litem *d;
138  WVector *wv;
139  int i;
140 
141  for (i=0,d=ds.head(); d != 0; d=d->next(),i++)
142  {
143  wv = ds(d);
144  if (q.ask(*wv) == TRUE)
145  y.cumulate((*wv)[0]);
146  }
147 
148  if (y.samples() > wgn_min_cluster_size)
149  {
150  q.set_yes((int)y.samples());
151 
152  EST_DiscreteProbDistribution &pd = y.pd();
153 
154  // Generalizing the formula in Yarowsky (pp157-172) we modify it
155  // to take absolute log-likelihood ration of the most probability
156  // of the most probable over the rest
157  EST_String t;
158  double p;
159  double f;
160  WDlist *n = new WDlist;
161  n->set_question(q);
162 
163  t = pd.most_probable(&p);
164  f = pd.frequency(t);
165  n->set_score(fabs(log((f+0.0001)/(pd.samples()-f+0.0001))));
166  n->set_best(t,(int)f,(int)pd.samples());
167 
168 #if 0 // original two-case code
169  int freqa, freqb;
170  pd.item_freq(0,s,freqa);
171  pd.item_freq(1,s,freqb);
172 
173  n->set_score(fabs(log((0.0001+freqa)/(0.0001+freqb))));
174  n->set_freqs(freqa,freqb);
175 #endif
176  return n;
177  }
178 
179  return 0;
180 }
181 
182 EST_Val WDlist::predict(const WVector &d)
183 {
184  if (p_question.ask(d))
185  return p_token;
186  else if (next == 0)
187  return "guess"; // should be a priori most probable as dlist can't help
188  else
189  return next->predict(d);
190 }
191 
192 WDlist *add_to_dlist(WDlist *l, WDlist *a)
193 {
194  // Add a to l at appropriate place in score order
195  WDlist *p,*lp;
196 
197  if (l == 0)
198  return a;
199  else
200  {
201  for (lp=0,p=l; p != 0; lp=p,p=p->next)
202  {
203  if (a->score() > p->score())
204  {
205  a->next = p;
206  if (lp == 0)
207  return a;
208  else
209  {
210  lp->next = a;
211  return l;
212  }
213  }
214  }
215  // add to end
216  lp->next = a;
217  }
218  return l;
219 }
220 
221 ostream &operator <<(ostream &s, WDlist &dlist)
222 {
223  s << endl;
224  s << "(";
225  s << dlist.p_question;
226  s << " ((";
227  s << dlist.p_score;
228  s << " " << dlist.p_freq << " " << dlist.p_samples <<
229  " " << dlist.p_token << "))";
230  if (dlist.next != 0)
231  s << *dlist.next;
232  else
233  s << endl;
234  s << ")";
235 
236  return s;
237 }