Edinburgh Speech Tools  2.4-release
 All Classes Functions Variables Typedefs Enumerations Enumerator Friends Pages
EST_DProbDist.cc
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1996 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Alan W Black */
34 /* Date : July 1996 */
35 /*-----------------------------------------------------------------------*/
36 /* */
37 /* Simple statistics (for discrete probability distributions */
38 /* */
39 /*=======================================================================*/
40 #include <iostream>
41 #include <fstream>
42 #include <cstdlib>
43 #include <cstdio>
44 #include <cstring>
45 #include "EST_String.h"
46 #include "EST_TKVL.h"
47 #include "EST_simplestats.h"
48 
49 /* We share ints and pointers for two types of probability distributions */
50 /* The know discrete sets can be indexed by ints which is *much* faster */
51 /* the indices pass around a pointers but the lower part contain ints in */
52 /* the discrete case */
53 /* On 64bit architectures this is a issue so we need have some macros */
54 /* to help us here. */
55 
56 const int est_64to32(void *c)
57 { /* this returns the bottom end of the pointer as an unsigned int */
58  /* I believe this is a safe way to do it, we check the bits in the */
59  /* 64 bit int and multiply them out in the 32 bit one */
60  /* there might be better ways, but I think you'd need to think about */
61  /* byte order then */
62  long long l;
63  int d;
64  int i,x;
65 
66  l = (long long)c;
67 
68  for (i=0,d=0,x=1; i<24; i++)
69  {
70  if (l & 1)
71  d += x;
72  l = l >> 1;
73  x += x;
74  }
75 
76  return d;
77 }
78 /* #define tprob_int(X) ((sizeof(void *) != 8) ? est_64to32(X) : (int)X) */
79 #define tprob_int(X) (est_64to32(X))
80 
81 
82 EST_DiscreteProbDistribution::EST_DiscreteProbDistribution(const EST_Discrete *d,
83  const double n_samples, const EST_DVector &counts)
84 {
85  type = tprob_discrete;
86  discrete = d;
87  num_samples = n_samples;
88 
89  icounts = counts;
90 }
91 
92 EST_DiscreteProbDistribution::EST_DiscreteProbDistribution(const EST_DiscreteProbDistribution &b)
93 {
94  copy(b);
95 }
96 
98 {
99  type = b.type;
100  num_samples = b.num_samples;
101  discrete = b.discrete;
102  icounts = b.icounts;
103  scounts = b.scounts;
104 }
105 
107 {
108  icounts.resize(0);
109 }
110 
112 {
113  type = tprob_string;
114  num_samples = 0;
115  discrete = 0;
116 }
117 
119 {
120  int i;
121  clear();
122  type = tprob_discrete;
123  num_samples = 0;
124  discrete = new EST_Discrete(vocab);
125 
126  icounts.resize(vocab.length());
127  for (i=0; i<icounts.length(); i++)
128  icounts.a_no_check(i) = 0;
129 
130  return true;
131 }
132 
134 {
135  int i;
136  clear();
137  type = tprob_discrete;
138  num_samples = 0;
139  discrete = d;
140  icounts.resize(d->length());
141  for (i=0; i<icounts.length(); i++)
142  icounts.a_no_check(i) = 0;
143 }
144 
146 {
147  icounts[tprob_int(i)] += count;
148  num_samples += count;
149 }
150 
151 void EST_DiscreteProbDistribution::cumulate(int i,double count)
152 {
153  icounts[i] += count;
154  num_samples += count;
155 }
156 
158 {
159  EST_Litem *p;
160 
161  if (type == tprob_discrete)
162  {
163  int idx = discrete->index(s);
164  icounts[idx] += count;
165  }
166  else // its a (slow) string type
167  {
168  for (p=scounts.list.head(); p != 0; p=p->next())
169  {
170  if (scounts.list(p).k == s)
171  {
172  scounts.list(p).v += count;
173  break;
174  }
175  }
176  if (p == 0) // first occurrence
177  scounts.add_item(s,count,1); // add without search
178  }
179  num_samples += count;
180 }
181 
183 {
184  EST_Litem *p,*t;
185  double max = 0;
186 
187  if (type == tprob_discrete)
188  {
189  int i,pt=-1;
190  for (i=0; i < icounts.length(); i++)
191  if (icounts.a_no_check(i) > max)
192  {
193  pt = i;
194  max = icounts.a_no_check(i);
195  }
196  if (max == 0)
197  {
198  if(prob != NULL)
199  *prob = 0.0;
200  return EST_String::Empty;
201  }
202  else
203  {
204  if(prob != NULL)
205  *prob = probability(pt);
206  return discrete->name(pt);
207  }
208  }
209  else
210  {
211  t = 0;
212  for (p=scounts.list.head(); p != 0; p=p->next())
213  if (scounts.list(p).v > max)
214  {
215  t = p;
216  max = scounts.list(p).v;
217  }
218  if (max == 0)
219  {
220  if(prob != NULL)
221  *prob = 0.0;
222  return EST_String::Empty;
223  }
224  else
225  {
226  if(prob != NULL)
227  *prob = (double)max/num_samples;
228  return scounts.list(t).k;
229  }
230  }
231 }
232 
233 double EST_DiscreteProbDistribution::probability(const EST_String &s) const
234 {
235  if (frequency(s) == 0.0)
236  return 0.0;
237  else
238  return (double)frequency(s)/num_samples;
239 }
240 
241 double EST_DiscreteProbDistribution::probability(const int i) const
242 {
243  if (frequency(i) == 0.0)
244  return 0.0;
245  else
246  return (double)frequency(i)/num_samples;
247 }
248 
249 double EST_DiscreteProbDistribution::frequency(const EST_String &s) const
250 {
251  if (type == tprob_discrete)
252  return icounts.a_no_check(discrete->index(s));
253  else
254  return scounts.val_def(s,0);
255 }
256 
257 double EST_DiscreteProbDistribution::frequency(const int i) const
258 {
259  if (type == tprob_discrete)
260  return icounts(i);
261  else
262  {
263  cerr << "ProbDistribution: can't access string type pd with int\n";
264  return 0;
265  }
266 }
267 
269 {
270  if (type == tprob_discrete)
271  {
272  num_samples -= icounts.a_no_check(discrete->index(s));
273  num_samples += c;
274  icounts.a_no_check(discrete->index(s)) = c;
275  }
276  else
277  {
278  num_samples -= scounts.val_def(s,0);
279  num_samples += c;
280  scounts.add_item(s,c);
281  }
282 }
283 
285 {
286  if (type == tprob_discrete)
287  {
288  num_samples -= icounts[i];
289  num_samples += c;
290  icounts[i] = c;
291  }
292  else
293  {
294  cerr << "ProbDistribution: can't access string type pd with int\n";
295  }
296 
297 }
298 
300 {
301  if (type == tprob_discrete)
302  {
303  num_samples -= icounts[tprob_int(i)];
304  num_samples += c;
305  icounts[tprob_int(i)] = c;
306  }
307  else
308  {
309  cerr << "ProbDistribution: can't access string type pd with int\n";
310  }
311 
312 }
313 
314 
316 {
317  if (type == tprob_discrete)
318  icounts.a_no_check(discrete->index(s)) = c;
319  else
320  scounts.add_item(s,c);
321 }
322 
324 {
325  if (type == tprob_discrete)
326  icounts[i] = c;
327  else
328  cerr << "ProbDistribution: can't access string type pd with int\n";
329 }
330 
332 {
333  if (type == tprob_discrete)
334  icounts[tprob_int(i)] = c;
335  else
336  cerr << "ProbDistribution: can't access string type pd with int\n";
337 }
338 
340 {
341  // Returns the entropy of the current distribution
342  double e=0.0;
343  EST_Litem *p;
344  int i;
345 
346  if (type == tprob_discrete)
347  {
348  for (i=0; i < icounts.length(); i++)
349  {
350  double prob = icounts.a_no_check(i)/num_samples;
351  if (prob != 0.0)
352  e += prob * log(prob); /* log10(prob)/log10(2) */
353  }
354  }
355  else
356  {
357  for (p=scounts.list.head(); p != 0; p=p->next())
358  {
359  double prob = scounts.list(p).v/num_samples;
360  if (prob != 0.0)
361  e += prob * log(prob); /* log10(prob)/log10(2) */
362  }
363  }
364 
365  return -e;
366 
367 }
368 
369 // For iterating through members of a probability distribution
371 {
372  if (type == tprob_discrete)
373  return NULL;
374  else
375  return scounts.list.head();
376 }
377 
379 {
380  if (type == tprob_discrete)
381  return (tprob_int(idx) >= icounts.length());
382  else
383  return (idx == 0);
384 }
385 
387 {
388  if (type == tprob_discrete)
389  return (EST_Litem *)(((unsigned char *)idx)+1);
390  else
391  return idx->next();
392 }
393 
395 {
396  if (type == tprob_discrete)
397  return discrete->name(tprob_int(idx));
398  else
399  return scounts.list(idx).k;
400 }
401 
403 {
404  if (type == tprob_discrete)
405  {
406  s = discrete->name(tprob_int(idx));
407  freq = icounts(tprob_int(idx));
408  }
409  else
410  {
411  s = scounts.list(idx).k;
412  freq = scounts.list(idx).v;
413  }
414 }
415 
417 {
418  if (type == tprob_discrete)
419  {
420  prob = probability(tprob_int(idx));
421  s = discrete->name(tprob_int(idx));
422  }
423  else
424  {
425  s = scounts.list(idx).k;
426  prob = (double)scounts.list(idx).v/num_samples;
427  }
428 }
429 
430 ostream & operator<<(ostream &s, const EST_DiscreteProbDistribution &pd)
431 {
432  // Output best with probabilities
433  EST_Litem *i;
434  double prob;
435  double sum=0;
436  EST_String name;
437 
438  s << "(";
439  for (i=pd.item_start(); !pd.item_end(i); i=pd.item_next(i))
440  {
441  pd.item_prob(i,name,prob);
442  s << "(" << name << "=" << prob << ") ";
443  sum+=prob;
444  }
445  s << "best=" << pd.most_probable(&prob) << " samples="
446  << pd.samples() << " sum=" << sum << ")";
447  return s;
448 }
449 
450 EST_DiscreteProbDistribution &EST_DiscreteProbDistribution::operator=(const EST_DiscreteProbDistribution &a)
451 {
452  // I'd much rather this was never called
453  copy(a);
454  return *this;
455 }
456