Edinburgh Speech Tools  2.4-release
 All Classes Functions Variables Typedefs Enumerations Enumerator Friends Pages
ngrammar_io.cc
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1996,1997 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Simon King & Alan W Black */
34 /* Date : February 1997 */
35 /*-----------------------------------------------------------------------*/
36 /* */
37 /* IO functions for EST_Ngram class */
38 /* */
39 /*=======================================================================*/
40 
41 #include <cstdlib>
42 #include <fstream>
43 #include <iostream>
44 #include "EST_unix.h"
45 #include <cstring>
46 #include <climits>
47 #include <cfloat>
48 #include "EST_String.h"
49 #include "EST_Ngrammar.h"
50 #include "EST_Token.h"
51 #include "EST_cutils.h"
52 
53 EST_read_status
54 load_ngram_htk_ascii(const EST_String filename, EST_Ngrammar &n)
55 {
56  (void)filename;
57  (void)n;
58  return wrong_format;
59 }
60 
61 EST_read_status
62 load_ngram_htk_binary(const EST_String filename, EST_Ngrammar &n)
63 {
64  (void)filename;
65  (void)n;
66  return wrong_format;
67 }
68 
69 EST_read_status
70 load_ngram_arpa(const EST_String filename, EST_Ngrammar &n, const EST_StrList &vocab)
71 {
72 
73  EST_TokenStream ts;
74  EST_String s;
75  int i,j,k, order=0;
76  double occur,weight;
77  int this_num,this_order;
78 
79  if (ts.open(filename) == -1)
80  return misc_read_error;
81 
82  // find backslash data backslash
83  while ((!ts.eof()) && !ts.get().string().contains("\\data\\"));
84 
85  if (ts.eof())
86  {
87  ts.close();
88  return wrong_format;
89  }
90 
91  // find order and numbers of ngrams
92 
93  // somewhere to keep numbers
94  EST_IVector nums(100); // not going to have anything bigger than a 100-gram !
95 
96  while (!ts.eof())
97  {
98  // have we got to next section
99  if (ts.peek().string().contains("-grams:"))
100  break;
101 
102  s=ts.get_upto_eoln().string();
103 
104  if(s.contains("ngram ") && s.contains("="))
105  {
106 
107  s=s.after("ngram ");
108  this_order=atoi(s.before("="));
109  this_num=atoi(s.after("="));
110 
111  //cerr << "There are " << this_num << " " << this_order
112  //<< "-grams" << endl;
113 
114  nums[this_order] = this_num;
115 
116  if(this_order > order)
117  order = this_order;
118  }
119 
120  }
121 
122 
123  if(order==0)
124  {
125  //cerr << "No ngram ?=? in header !" << endl;
126  ts.close();
127  return wrong_format;
128  }
129 
130  //cerr << "Initialising " << order << "-grammar" << endl;
131  if(!n.init(order,EST_Ngrammar::backoff,vocab))
132  return misc_read_error;
133 
134  // read data
135  for(i=1;i<=order;i++)
136  {
137 
138  EST_StrVector window(i);
139 
140  // find start of data for this order "<order>-grams:"
141  EST_String tmp = "\\" + itoString(i) + "-grams:";
142  while (!ts.eof())
143  {
144  s=ts.get().string();
145  if (s.contains(tmp))
146  break;
147  }
148 
149 
150  if(ts.eof())
151  {
152  cerr << "Unexpected end of grammar file whilst looking for '"
153  << tmp << "'" << endl;
154  return misc_read_error;
155  }
156 
157  //cerr << "Found order " << i << " : " << tmp << endl;
158  //cerr << "Looking for " << nums(i) << " ngrams" << endl;
159  // look for nums(i) ngrams
160 
161  for(j=0;j<nums(i);j++)
162  {
163 
164  for (k=0; ((k<i) && !ts.eof()); k++)
165  window[k] = ts.get().string();
166 
167  if(ts.eof())
168  {
169  cerr << "Unexpected end of file whilst reading " << i
170  << "-grams !" << endl;
171  return misc_read_error;
172  }
173 
174  occur = atof(ts.get().string());
175 
176 
177  // can't for backoff grammars, need to set probs directly
178 
179  cerr << "ooooooooops" << endl;
180  return wrong_format;
181  //n.accumulate(window,occur);
182 
183  // backoff weight ?
184  if (!ts.eoln())
185  {
186  weight = atof(ts.get().string());
187  n.set_backoff_weight(window,weight);
188  }
189 
190  if (!ts.eoln())
191  {
192  cerr << "EST_Ngrammar:load_ngram_arpa expect end of line at filepos "
193  << ts.filepos() << endl;
194  ts.close();
195  return misc_read_error;
196  }
197  }
198 
199  } // loop through orders
200 
201 
202  // find backslash end backslash
203  while (!ts.eof())
204  if (ts.get().string() == "\\end\\")
205  {
206  ts.close();
207  return format_ok;
208 
209  }
210 
211  cerr << "Missing \\end\\ !" << endl;
212 
213  ts.close();
214  return misc_read_error;
215 
216 }
217 
218 EST_read_status
219 load_ngram_cstr_ascii(const EST_String filename, EST_Ngrammar &n)
220 {
221  EST_TokenStream ts;
222  int i, order;
223  double occur;
224 
225  if (ts.open(filename) == -1)
226  return misc_read_error;
227 
228  if (ts.peek().string() != "Ngram_2")
229  {
230  ts.close();
231  return wrong_format;
232  }
233  ts.get(); // skip magic number
234 
235  order = atoi(ts.get().string());
236  ts.get_upto_eoln(); // skip to next line
237  EST_StrList vocab;
238  EST_StrList pred_vocab; // may be different
239 
240  while (!ts.eoln())
241  vocab.append(ts.get().string());
242  ts.get_upto_eoln(); // skip to next line
243  while (!ts.eoln())
244  pred_vocab.append(ts.get().string());
245 
246  if(!n.init(order,EST_Ngrammar::dense,vocab,pred_vocab))
247  {
248  cerr << "Something may be wrong with the vocab lists in '"
249  << filename << "'" << endl;
250  return misc_read_error;
251  }
252 
253  EST_StrVector window(order);
254 
255  while(!ts.eof())
256  {
257  for (i=0; i < order; i++)
258  window[i] = ts.get().string();
259  if (ts.get().string() != ":")
260  {
261  cerr << "EST_Ngrammar:load_ngram_cstr_ascii missing colon at filepos "
262  << ts.filepos() << endl;
263  return misc_read_error;
264  }
265  occur = atof(ts.get().string());
266  n.accumulate(window,occur);
267  if (!ts.eoln())
268  {
269  cerr << "EST_Ngrammar:load_ngram_cstr_ascii expect end of line at filepos "
270  << ts.filepos() << endl;
271  return misc_read_error;
272  }
273  }
274 
275  ts.close();
276 
277  return format_ok;
278 }
279 
280 EST_read_status
281 load_ngram_cstr_bin(const EST_String filename, EST_Ngrammar &n)
282 {
283  EST_TokenStream ts;
284  int i,j,order;
285  EST_Litem *k;
286  int num_entries;
287  double approx_num_samples = 0.0;
288  long freq_data_start, freq_data_end;
289  FILE *ifd;
290  int magic = 0;
291  int swap = FALSE;
292 
293  if ((ifd=fopen(filename,"rb")) == NULL)
294  return misc_read_error;
295  fread(&magic,sizeof(int),1,ifd);
296 
297  if (SWAPINT(magic) == EST_NGRAMBIN_MAGIC)
298  swap = TRUE;
299  else if (magic != EST_NGRAMBIN_MAGIC)
300  return wrong_format;
301  if (ts.open(ifd, FALSE) == -1)
302  return misc_read_error;
303 
304  ts.set_SingleCharSymbols("\n");
305  ts.set_WhiteSpaceChars(" \t\r");
306 
307  if (ts.peek().string() != "mBin_2")
308  {
309  fclose(ifd);
310  ts.close();
311  return wrong_format;
312  }
313  ts.get(); // skip magic number
314 
315  order = atoi(ts.get().string());
316  if (ts.get() != "\n")
317  {
318  fclose(ifd);
319  ts.close();
320  return misc_read_error;
321  }
322  EST_StrList vocab;
323  EST_StrList pred_vocab; // may be different
324 
325  while ((ts.peek() != "\n") && (!ts.eof()))
326  vocab.append(ts.get().string());
327  ts.get(); // skip newline
328  while ((ts.peek() != "\n") && (!ts.eof()))
329  pred_vocab.append(ts.get().string());
330 
331  // Need to get to the position one after the newline and
332  // who knows what TokenStream has already read,
333  fseek(ifd,(long)(ts.peek().filepos()+5),SEEK_SET);
334 
335  if(!n.init(order,EST_Ngrammar::dense,vocab,pred_vocab))
336  {
337  ts.close();
338  fclose(ifd);
339  return misc_read_error;
340  }
341 
342  EST_StrVector window(order);
343 
344  freq_data_start = ftell(ifd);
345  fseek(ifd,0,SEEK_END);
346  freq_data_end = ftell(ifd);
347  num_entries = (freq_data_end-freq_data_start)/sizeof(double);
348  double *dd = new double[num_entries];
349 
350  // Go back to start of data
351  fseek(ifd,freq_data_start,SEEK_SET);
352 
353  if (fread(dd,sizeof(double),num_entries,ifd) != (unsigned)num_entries)
354  {
355  cerr << "EST_Ngrammar::load_ngram_cstr_bin format does not have expected number of entries" << endl;
356  ts.close();
357  fclose(ifd);
358  return misc_read_error;
359  }
360  if (swap)
361  swap_bytes_double(dd,num_entries);
362 
363  for(j=i=0;i<n.num_states();i++)
364  {
365  if (j >= num_entries)
366  {
367  cerr << "EST_Ngrammar::load_ngram_cstr_bin unexpected end of frequency data" << endl;
368  ts.close();
369  fclose(ifd);
370  return misc_read_error;
371  }
372  for (k=n.p_states[i].pdf().item_start();
373  (!n.p_states[i].pdf().item_end(k)) && (j < num_entries) ;
374  k = n.p_states[i].pdf().item_next(k))
375  {
376  n.p_states[i].pdf().set_frequency(k,dd[j]);
377  // Update global info too
378  approx_num_samples += dd[j]; // probably not right
379  n.vocab_pdf.cumulate(k,dd[j]);
380 
381  // Number of consecutive occurrences of this frequency as in
382  // dd[j+1] if its a negative number
383  if (j+1 >= num_entries)
384  j++;
385  else if (dd[j+1] < -1)
386  dd[j+1]++;
387  else if (dd[j+1] == -1)
388  j +=2;
389  else
390  j++;
391  }
392  }
393 
394  // With smoothing num_samples might not be as exact as you like
395  n.p_num_samples = (int)approx_num_samples;
396 
397  delete [] dd;
398 
399  ts.close();
400  fclose(ifd);
401 
402  return format_ok;
403 }
404 
405 // ====================================================================
406 
407 EST_write_status
408 save_ngram_htk_ascii_sub(const EST_String &word, ostream *ost,
409  EST_Ngrammar &n, double floor)
410 {
411  EST_Litem *k;
412  EST_String name;
413  double freq;
414  EST_StrVector this_ngram(2); // assumes bigram
415  this_ngram[0] = word;
417  this_pdf = n.prob_dist(this_ngram);
418 
419  double lfreq=-1;
420  int lcount=0;
421  double total_freq=0;
422 
423  double floor_prob_total = floor * (n.pred_vocab->length()-1);
424 
425  if (word == n.p_sentence_end_marker)
426  {
427  *ost << word;
428  *ost << " 0*" << n.pred_vocab->length()-1 << " " << 1 << endl;
429  return write_ok;
430  }
431 
432  if(floor_prob_total > 1)
433  {
434  cerr << "ERROR : floor is impossibly large, scaling it !" << endl;
435  floor = 1.0 / (double)(n.pred_vocab->length()-1);
436  floor_prob_total = 1;
437  }
438 
439  // not efficient but who cares ?
440  for (k=this_pdf.item_start();
441  !this_pdf.item_end(k);
442  k = this_pdf.item_next(k))
443  {
444  this_pdf.item_freq(k,name,freq);
445  if(name != n.p_sentence_start_marker)
446  {
447  total_freq += freq;
448  }
449  }
450 
451 
452  // 0 for prob(word,start marker)
453  *ost << word << " 0 ";
454 
455  if (total_freq <= 0)
456  {
457  *ost << 1.0 / (double)(n.pred_vocab->length()-1) << "*";
458  *ost << n.pred_vocab->length()-1 << " " << endl;
459  }
460  else
461  {
462  lfreq=-1;
463 
464  for (k=this_pdf.item_start();
465  !this_pdf.item_end(k);
466  k = this_pdf.item_next(k))
467  {
468  this_pdf.item_freq(k,name,freq);
469 
470  if ( (name == n.p_sentence_start_marker) ||
471  (name == n.p_sentence_end_marker) ||
472  (name == OOV_MARKER) )
473  continue;
474 
475  if (freq == lfreq)
476  lcount++;
477  else
478  {
479  if (lcount > 1)
480  *ost << "*" << lcount << " ";
481  else
482  *ost << " ";
483 
484  lcount=1;
485  lfreq = freq;
486 
487  if(freq > 0)
488  {
489  double base_prob = freq / total_freq;
490 
491  // and floor/scale it
492  *ost << floor + ( base_prob * (1-floor_prob_total) );
493 
494  }
495  else
496  *ost << floor;
497 
498  }
499 
500 
501  }
502 
503  } // total_freq > 0
504 
505 
506  if(!n.closed_vocab())
507  {
508 
509  // not fully tested !!!!!!!!
510 
511  *ost << 0 << " ERROR !!!!!!!! ";
512  }
513 
514 
515  if (total_freq > 0)
516  {
517  freq = this_pdf.frequency(n.p_sentence_end_marker);
518 
519  if(freq == lfreq)
520  {
521  lcount++;
522  *ost << "*" << lcount << " " << endl;
523  }
524  else
525  {
526 
527  if (lcount > 1)
528  *ost << "*" << lcount << " ";
529  else
530  *ost << " ";
531 
532  if(freq > 0)
533  {
534  double base_prob = freq / total_freq;
535 
536  // and floor/scale it
537  *ost << floor + ( base_prob * (1-floor_prob_total) ) << endl;
538 
539  }
540  else
541  *ost << floor << endl;
542  }
543  }
544 
545  return write_ok;
546 }
547 
548 EST_write_status
549 save_ngram_htk_ascii(const EST_String filename,
550  EST_Ngrammar &n, double floor)
551 {
552 
553  ostream *ost;
554 
555  // only for bigram
556  if(n.order() != 2)
557  {
558  cerr << "Can only save bigrams in htk_ascii format" << endl;
559  return misc_write_error;
560  }
561 
562  if (floor < 0)
563  {
564  cerr << "Negative floor probability does not make sense !" << endl;
565  return misc_write_error;
566  }
567 
568  if (filename == "-")
569  ost = &cout;
570  else
571  ost = new ofstream(filename);
572 
573  if(!(*ost))
574  return write_fail;
575 
576  if(floor * (n.pred_vocab->length()-1) > 1)
577  {
578  floor = 1.0 / (double)(n.pred_vocab->length()-1);
579  cerr << "ERROR : floor is impossibly large, scaling it to ";
580  cerr << floor << endl;
581  }
582 
583  int i;
584 
585  if(n.p_sentence_start_marker == "")
586  {
587  cerr << "Can't save in HTK format as no sentence start/end tags"
588  << " were given !" << endl;
589  return misc_write_error;
590  }
591 
592  // need '!ENTER' (or whatever) as first word- that's HTK for you
593  save_ngram_htk_ascii_sub(n.p_sentence_start_marker,ost,n,floor);
594 
595  // the real words
596  for(i=0;i<n.vocab->length();i++)
597  {
598  if ( (n.vocab->name(i) != n.p_sentence_start_marker) &&
599  (n.vocab->name(i) != n.p_sentence_end_marker) &&
600  (n.vocab->name(i) != OOV_MARKER) )
601  save_ngram_htk_ascii_sub(n.vocab->name(i),ost,n,floor);
602  }
603 
604  if(!n.closed_vocab())
605  save_ngram_htk_ascii_sub(OOV_MARKER,ost,n,floor);
606 
607  save_ngram_htk_ascii_sub(n.p_sentence_end_marker,ost,n,floor);
608 
609  if(ost != &cout)
610  delete ost;
611 
612  return write_ok;
613 }
614 
615 /*
616  EST_write_status
617  save_ngram_htk_binary(const EST_String filename, EST_Ngrammar &n)
618  {
619  return write_ok;
620  }
621  */
622 
623 void
624 count_ngram_arpa_sub(EST_Ngrammar *n, EST_StrVector &ngram, void *count)
625 {
626  if(n->ngram_exists(ngram))
627  *((double*)count) += 1;
628 }
629 
630 void
631 save_ngram_arpa_sub(EST_Ngrammar *n, EST_StrVector &ngram, void *ost)
632 {
633 
634  int i;
635 
636  if(n->ngram_exists(ngram))
637  {
638  *((ostream*)(ost)) << safe_log10(n->probability(ngram)) << " ";
639  for(i=0;i<ngram.n();i++)
640  *((ostream*)(ost)) << ngram(i) << " ";
641 
642  if ((n->representation() == EST_Ngrammar::backoff) &&
643  (n->order() > ngram.n()) )
644  *((ostream*)(ost)) << safe_log10(n->get_backoff_weight(ngram));
645  //<< " = "
646  //<< n->get_backoff_weight(ngram) << " ";
647 
648  *((ostream*)(ost)) << endl;
649 
650  }
651 }
652 
653 EST_write_status
654 save_ngram_arpa(const EST_String filename, EST_Ngrammar &n)
655 {
656  // ARPA MIT-LL format - see HTK manual !!
657 
658  ostream *ost;
659  int i,num_n,o;
660 
661  if (filename == "-")
662  ost = &cout;
663  else
664  ost = new ofstream(filename);
665 
666  if (!(*ost))
667  return write_fail;
668 
669  //n.set_entry_type(EST_Ngrammar::probabilities);
670  //n.make_htk_compatible(); // fix enter/exit probs
671  //*ost << *(n.vocab) << endl;
672 
673  // count number of ngrams
674  num_n = (int)n.samples();
675  *ost << "\\data\\" << endl;
676 
677  double *count = new double;
678 
679  if (n.representation() == EST_Ngrammar::backoff)
680  {
681  for(o=1;o<=n.order();o++)
682  {
683  EST_StrVector ngram(o);
684  for(i=0;i<o;i++)
685  ngram[i] = "";
686  *count =0;
687 
688  // this is a deeply silly way to count them,
689  // we could traverse the tree directly !
690  n.iterate(ngram,&count_ngram_arpa_sub,(void*)count);
691  *ost << "ngram " << o << "=" << *count << endl;
692  }
693 
694  for(o=1;o<=n.order();o++)
695  {
696  *ost << endl;
697  *ost << "\\" << o << "-grams:" << endl;
698  EST_StrVector ngram(o);
699  for(i=0;i<o;i++)
700  ngram[i] = "";
701  n.iterate(ngram,&save_ngram_arpa_sub,(void*)ost);
702  }
703 
704  }
705  else
706  {
707  EST_StrVector ngram(n.order());
708  for(i=0;i<n.order();i++)
709  ngram[i] = "";
710  *count =0;
711  n.iterate(ngram,&count_ngram_arpa_sub,(void*)count);
712  *ost << "ngram " << n.order() << "=" << *count << endl;
713 
714  *ost << endl;
715  *ost << "\\" << n.order() << "-grams:" << endl;
716 
717  for(i=0;i<n.order();i++)
718  ngram[i] = "";
719  n.iterate(ngram,&save_ngram_arpa_sub,ost);
720 
721  }
722 
723  *ost << "\\end\\" << endl;
724 
725  if (ost != &cout)
726  delete ost;
727 
728  return write_ok;
729 }
730 
731 EST_write_status
732 save_ngram_cstr_ascii(const EST_String filename, EST_Ngrammar &n,
733  const bool trace, double floor)
734 {
735  // awb's format
736  (void)trace;
737  ostream *ost;
738  int i;
739  EST_Litem *k;
740 
741  if (filename == "-")
742  ost = &cout;
743  else
744  ost = new ofstream(filename);
745 
746  if(!(*ost))
747  return write_fail;
748 
749  *ost << "Ngram_2 " << n.order() << endl;
750  for (i=0; i < n.vocab->length(); i++)
751  *ost << n.vocab->name(i) << " ";
752  *ost << endl;
753  for (i=0; i < n.pred_vocab->length(); i++)
754  *ost << n.pred_vocab->name(i) << " ";
755  *ost << endl;
756 
757  if (n.representation() == EST_Ngrammar::dense)
758  n.print_freqs(*ost,floor);
759  else if (n.representation() == EST_Ngrammar::backoff)
760  {
761  int total_ngrams = (int)pow(float(n.get_vocab_length()),float(n.order()-1));
762 
763  for(i=0;i<total_ngrams;i++)
764  {
766  const EST_StrVector this_ngram = n.make_ngram_from_index(i);
767  this_pdf = n.prob_dist(this_ngram);
768 
769  for (k=this_pdf.item_start();
770  !this_pdf.item_end(k);
771  k = this_pdf.item_next(k))
772  {
773  double freq;
774  EST_String name;
775  this_pdf.item_freq(k,name,freq);
776 
777  for (int jj=0; jj < this_ngram.n(); jj++)
778  *ost << this_ngram(jj) << " ";
779  *ost << name << " : " << freq << endl;
780  }
781  }
782  }
783 
784  if(ost != &cout)
785  delete ost;
786 
787  return write_ok;
788 }
789 
790 EST_write_status
791 save_ngram_wfst(const EST_String filename, EST_Ngrammar &n)
792 {
793  // Save as a WFST
794  FILE *ost;
795  int i;
796 
797  if ((ost = fopen(filename,"wb")) == NULL)
798  {
799  cerr << "Ngrammar save: unable to open \"" << filename <<
800  "\" for writing" << endl;
801  return write_fail;
802  }
803 
804  fprintf(ost,"EST_File fst\n");
805  fprintf(ost,"DataType ascii\n");
806  fprintf(ost,"in \"(");
807  for (i=0; i < n.vocab->length(); i++)
808  fprintf(ost," %s\n",(const char *)n.vocab->name(i));
809  fprintf(ost," )\"\n");
810  fprintf(ost,"out \"(");
811  for (i=0; i < n.vocab->length(); i++)
812  fprintf(ost," %s\n",(const char *)n.vocab->name(i));
813  fprintf(ost," )\"\n");
814  fprintf(ost,"NumStates %d\n",n.num_states());
815  fprintf(ost,"EST_Header_End\n");
816 
817  for (i=0; i<n.num_states(); i++)
818  {
819  fprintf(ost,"((%d nonfinal %d)\n",i,i);
820  fprintf(ost,")\n");
821  }
822 
823  fclose(ost);
824 
825  return write_ok;
826 }
827 
828 EST_write_status
829 save_ngram_cstr_bin(const EST_String filename, EST_Ngrammar &n,
830  const bool trace, double floor)
831 {
832 
833  if (n.representation() == EST_Ngrammar::sparse)
834  return misc_write_error;
835 
836  int i;
837  EST_Litem *k;
838  FILE *ofd;
839  double lfreq = -1;
840  double count = -1;
841  int magic = EST_NGRAMBIN_MAGIC;
842 
843  if (filename == "-")
844  {
845  if ((ofd=stdout) == NULL)
846  return misc_write_error;
847  }
848  else
849  {
850  if ((ofd=fopen(filename,"wb")) == NULL)
851  return misc_write_error;
852  }
853 
854  fwrite(&magic,sizeof(int),1,ofd);
855  fprintf(ofd,"mBin_2 %d\n",n.order());
856  for (i=0; i < n.vocab->length(); i++)
857  fprintf(ofd,"%s ",(const char *)n.vocab->name(i));
858  fprintf(ofd,"\n");
859  for (i=0; i < n.pred_vocab->length(); i++)
860  fprintf(ofd,"%s ",(const char *)n.pred_vocab->name(i));
861  fprintf(ofd,"\n");
862 
863  // We use a simple form of run-length encoding, if consecutive
864  // values are equal only a length is printed. lengths are
865  // negative as frequencies (even smoothed ones) can never be -ve
866 
867  if ( trace )
868  cerr << "Saving ..." << endl;
869 
870  if (n.representation() == EST_Ngrammar::dense)
871  {
872  for(i=0;i<n.num_states();i++)
873  {
874 
875  if ( trace )
876  cerr << "\r" << i*100/n.num_states() << "%";
877 
878  for (k=n.p_states[i].pdf().item_start();
879  !n.p_states[i].pdf().item_end(k);
880  k = n.p_states[i].pdf().item_next(k))
881  {
882  double freq;
883  EST_String name;
884  n.p_states[i].pdf().item_freq(k,name,freq);
885  if (freq == 0.0)
886  freq = floor;
887  if (freq == lfreq)
888  count--;
889  else
890  {
891  if (count < -1)
892  fwrite(&count,sizeof(double),1,ofd);
893  fwrite(&freq,sizeof(double),1,ofd);
894  count = -1;
895  }
896  lfreq = freq;
897  }
898  }
899  if (count < -1)
900  fwrite(&count,sizeof(double),1,ofd);
901  }
902  else if (n.representation() == EST_Ngrammar::backoff)
903  {
904  // need to construct pdfs in right order
905  // noting that dense states are indexed s.t. the last
906  // word in the ngram is the least significant 'bit'
907 
908  // number of ngrams, excluding last word, is
909  int total_ngrams = (int)pow(float(n.get_vocab_length()),float(n.order()-1));
910 
911  for(i=0;i<total_ngrams;i++)
912  {
913 
914  if ( trace )
915  cerr << "\r" << i*100/total_ngrams << "%";
916 
918  const EST_StrVector this_ngram = n.make_ngram_from_index(i);
919  this_pdf = n.prob_dist(this_ngram);
920 
921  for (k=this_pdf.item_start();
922  !this_pdf.item_end(k);
923  k = this_pdf.item_next(k))
924  {
925 
926  double freq;
927  EST_String name;
928  this_pdf.item_freq(k,name,freq);
929  if (freq == lfreq)
930  count--;
931  else
932  {
933  if (count < -1)
934  fwrite(&count,sizeof(double),1,ofd);
935  fwrite(&freq,sizeof(double),1,ofd);
936  count = -1;
937  }
938  lfreq = freq;
939  }
940 
941 
942  }
943 
944  }
945  if ( trace )
946  cerr << "\r \r" << endl;
947 
948  fclose(ofd);
949 
950  return write_ok;
951 }