Edinburgh Speech Tools  2.4-release
 All Classes Functions Variables Typedefs Enumerations Enumerator Friends Pages
scfg_make_main.cc
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1996,1997 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Alan W Black */
34 /* Date : October 1997 */
35 /*-----------------------------------------------------------------------*/
36 /* Build a stochastic context free grammar with N non-terminals and */
37 /* M terminals specified as lists or numbers */
38 /* Probabilities are either even or random on rules and specified as */
39 /* probs or -log prob */
40 /* */
41 /*=======================================================================*/
42 #include <cstdlib>
43 #include <cstdio>
44 #include <iostream>
45 #include <fstream>
46 #include <cstring>
47 #include "EST.h"
48 #include "EST_SCFG.h"
49 #include "siod.h"
50 
51 EST_String outfile = "-";
52 EST_String domain = "nlogp";
53 EST_String values = "equal";
54 
55 static int scfg_make_main(int argc, char **argv);
56 
57 static void load_symbols(EST_StrList &syms,const EST_String &filename);
58 static void make_symbols(EST_StrList &syms,int n,const EST_String &prefix);
59 static LISP assign_probs(LISP rules, const EST_String &domain,
60  const EST_String &values);
61 static LISP make_all_rules(const EST_StrList &NonTerminals,
62  const EST_StrList &Terminals);
63 static void generate_probs(double *probs,int num);
64 
65 /** @name <command>scfg_make</command> <emphasis>Make the rules for a stochastic context free grammar</emphasis>
66  @id scfg-make-manual
67  * @toc
68  */
69 
70 //@{
71 
72 
73 /**@name Synopsis
74  */
75 //@{
76 
77 //@synopsis
78 
79 /**
80 
81 Builds a stochastic context free grammar from a vocabulary of non-terminal
82 and terminal symbols. An exhaustive set of all possible binary rules
83 is generated with random (or equal) probabilities (or negative log
84 probabilities). This program is designed for making grammars that
85 can be trained using scfg_train.
86 
87  */
88 
89 //@}
90 
91 /**@name OPTIONS
92  */
93 //@{
94 
95 //@options
96 
97 //@}
98 
99 
100 int main(int argc, char **argv)
101 {
102 
103  scfg_make_main(argc,argv);
104 
105  exit(0);
106  return 0;
107 }
108 
109 static int scfg_make_main(int argc, char **argv)
110 {
111  // Top level function generates a probabilistic grammar
112  EST_Option al;
113  EST_StrList files;
114  EST_StrList NonTerminals, Terminals;
115  LISP rules,r;
116  FILE *fd;
117 
118  parse_command_line
119  (argc, argv,
120  EST_String("[options]\n")+
121  "Summary: Build a stochastic context free grammar\n"+
122  "-nonterms <string> Number of nonterminals or file containing them\n"+
123  "-terms <string> Number of terminals or file containing them\n"+
124  "-domain <string> {nlogp}\n"+
125  " Values to be nlogp (negative log probabilities)\n"+
126  " or prob (probabilities)\n"+
127  "-values <string> {equal}\n"+
128  " General initial scores on rules as equal or\n"
129  " random\n"+
130  "-heap <int> {500000}\n"+
131  " Set size of Lisp heap, only needed for large grammars\n"+
132  "-o <ofile> File to save grammar (default stdout)\n",
133  files, al);
134 
135  if (al.present("-o"))
136  outfile = al.val("-o");
137  else
138  outfile = "-";
139 
140  if (al.present("-domain"))
141  {
142  if (al.val("-domain") == "nlogp")
143  domain = "nlogp";
144  else if (al.val("-domain") == "prob")
145  domain = "prob";
146  else
147  {
148  cerr << "scfg_make: domain must be nlogp or prob" << endl;
149  exit(1);
150  }
151  }
152 
153  if (al.present("-values"))
154  {
155  if (al.val("-values") == "equal")
156  values = "equal";
157  else if (al.val("-values") == "random")
158  values = "random";
159  else
160  {
161  cerr << "scfg_make: values must be equal or random" << endl;
162  exit(1);
163  }
164  }
165 
166  if (al.present("-nonterms"))
167  {
168  if (al.val("-nonterms").matches(RXint))
169  make_symbols(NonTerminals,al.ival("-nonterms"),"NT");
170  else
171  load_symbols(NonTerminals,al.val("-nonterms"));
172  }
173  else
174  {
175  cerr << "scfg_make: no nonterminals specified" << endl;
176  exit(1);
177  }
178 
179  if (al.present("-terms"))
180  {
181  if (al.val("-terms").matches(RXint))
182  make_symbols(Terminals,al.ival("-terms"),"T");
183  else
184  load_symbols(Terminals,al.val("-terms"));
185  }
186  else
187  {
188  cerr << "scfg_make: no terminals specified" << endl;
189  exit(1);
190  }
191 
192  siod_init(al.ival("-heap"));
193 
194  rules = make_all_rules(NonTerminals,Terminals);
195  rules = assign_probs(rules,domain,values);
196 
197  if (outfile == "-")
198  fd = stdout;
199  else
200  {
201  if ((fd=fopen(outfile,"w")) == NULL)
202  {
203  cerr << "scfg_make: failed to open file \"" << outfile <<
204  "\" for writing" << endl;
205  exit(1);
206  }
207  }
208 
209  for (r=rules; r != NIL; r=cdr(r))
210  pprint_to_fd(fd,car(r));
211 
212  if (fd != stdout)
213  fclose(fd);
214 
215 
216  return 0;
217 }
218 
219 static LISP make_all_rules(const EST_StrList &NonTerminals,
220  const EST_StrList &Terminals)
221 {
222  // Build all possibly rules (CNF)
223  // NT -> NT NT and NT -> T
224  EST_Litem *p,*q,*r;
225  LISP rules = NIL;
226 
227  for (p=NonTerminals.head(); p != 0; p=p->next())
228  {
229  int num_rules_nt = (NonTerminals.length()*NonTerminals.length())+
230  Terminals.length();
231  double *probs = new double[num_rules_nt];
232  generate_probs(probs,num_rules_nt);
233  int i=0;
234  for (q=NonTerminals.head(); q != 0; q=q->next())
235  for (r=NonTerminals.head(); r != 0; r=r->next(),i++)
236  rules = cons(cons(flocons(probs[i]),
237  cons(rintern(NonTerminals(p)),
238  cons(rintern(NonTerminals(q)),
239  cons(rintern(NonTerminals(r)),NIL)))),
240  rules);
241  for (q=Terminals.head(); q != 0; q=q->next(),i++)
242  rules = cons(cons(flocons(probs[i]),
243  cons(rintern(NonTerminals(p)),
244  cons(rintern(Terminals(q)),NIL))),
245  rules);
246  delete [] probs;
247  }
248 
249  return reverse(rules);
250 }
251 
252 static void generate_probs(double *probs,int num)
253 {
254  // Generate probabilities
255  int i;
256 
257  if (values == "equal")
258  {
259  double defp = 1.0/(float)num;
260  for (i=0; i < num; i++)
261  probs[i] = defp;
262  }
263  else if (values == "random")
264  {
265  // This isn't random but is somewhat arbitrary
266  double sum = 0;
267  for (i=0; i < num; i++)
268  {
269  probs[i] = (double)abs(rand())/(double)0x7fff;
270  sum += probs[i];
271  }
272  for (i=0; i < num; i++)
273  {
274  probs[i] /= sum;
275  }
276  }
277  else
278  {
279  cerr << "scfg_make: unknown value for probability distribution"
280  << endl;
281  exit(1);
282  }
283 }
284 
285 static LISP assign_probs(LISP rules, const EST_String &domain,
286  const EST_String &values)
287 {
288  // Modify probs (don't know how to do random probs yet)
289  LISP r;
290  (void)values;
291 
292  if (domain == "nlogp")
293  for (r=rules; r != NIL; r = cdr(r))
294  {
295  if (get_c_float(car(car(r))) == 0)
296  CAR(car(r)) = flocons(40);
297  else
298  CAR(car(r)) = flocons(-log(get_c_float(car(car(r)))));
299  }
300 
301  return rules;
302 }
303 
304 static void make_symbols(EST_StrList &syms,int n,const EST_String &prefix)
305 {
306  // Generate n symbols with given prefix
307  int i;
308  int magnitude,t;
309 
310  for (magnitude=0,t=n; t > 0; t=t/10)
311  magnitude++;
312 
313  char *name = walloc(char,prefix.length()+magnitude+1);
314  char *skel = walloc(char,prefix.length()+5);
315  sprintf(skel,"%s%%%02dd",(const char *)prefix,magnitude);
316 
317  for (i=0; i < n; i++)
318  {
319  sprintf(name,skel,i);
320  syms.append(name);
321  }
322 
323  wfree(name);
324  wfree(skel);
325 
326 }
327 
328 
329 static void load_symbols(EST_StrList &syms,const EST_String &filename)
330 {
331  // Load symbol list for file
332 
333  load_StrList(filename,syms);
334 
335 }