Edinburgh Speech Tools  2.4-release
 All Classes Functions Variables Typedefs Enumerations Enumerator Friends Pages
scfg_train_main.cc
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1996,1997 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Alan W Black */
34 /* Date : October 1997 */
35 /*-----------------------------------------------------------------------*/
36 /* Train a stochastic context free grammar with respect to a given */
37 /* corpus. */
38 /* */
39 /* Only the inside/outside algorithm (with bracketing) is supported */
40 /* */
41 /* */
42 /*=======================================================================*/
43 #include <cstdlib>
44 #include <cstdio>
45 #include <iostream>
46 #include <fstream>
47 #include <cstring>
48 #include "EST_cmd_line.h"
49 #include "EST_SCFG.h"
50 #include "siod.h"
51 
52 static EST_String outfile = "-";
53 
54 
55 static int scfg_train_main(int argc, char **argv);
56 
57 /** @name <command>scfg_train</command> <emphasis>Train the parameters of a stochastic context free grammar</emphasis>
58  @id scfg-make-manual
59  * @toc
60  */
61 
62 //@{
63 
64 
65 /**@name Synopsis
66  */
67 //@{
68 
69 //@synopsis
70 
71 /**
72 
73 scfg_train takes a stochastic context free grammar (SCFG) and trains
74 the probabilities with respect to a given bracketed corpus using the
75 inside-outside algorithm. This is basically an implementation
76 of Pereira and Schabes 1992.
77 
78 Note that using this program properly may require months of CPU time.
79 
80  */
81 
82 //@}
83 
84 /**@name OPTIONS
85  */
86 //@{
87 
88 
89 //@options
90 
91 //@}
92 
93 
94 int main(int argc, char **argv)
95 {
96 
97  scfg_train_main(argc,argv);
98 
99  exit(0);
100  return 0;
101 }
102 
103 static int scfg_train_main(int argc, char **argv)
104 {
105  // Top level function generates a probabilistic grammar
106  EST_Option al;
107  EST_StrList files;
108  int spread;
109 
110  parse_command_line
111  (argc, argv,
112  EST_String("[options\n")+
113  "Summary: Train a stochastic context free grammar from a (bracketed) corpus\n"+
114  "-grammar <ifile> Grammar file, one rule per line.\n"+
115  "-corpus <ifile> Corpus file, one bracketed sentence per line.\n"+
116  "-method <string> {inout}\n"+
117  " Method for training: inout.\n"+
118  "-passes <int> {50}\n"+
119  " Number of training passes.\n"+
120  "-startpass <int> {0}\n"+
121  " Starting at pass N.\n"+
122  "-spread <int> Spread training data over N passes.\n"+
123  "-checkpoint <int> Save grammar every N passes\n"+
124  "-heap <int> {210000}\n"+
125  " Set size of Lisp heap, needed for large corpora\n"+
126  "-o <ofile> Output file for trained grammar.\n",
127  files, al);
128 
129  if (al.present("-o"))
130  outfile = al.val("-o");
131  else
132  outfile = "-";
133 
134  siod_init(al.ival("-heap"));
135 
136  EST_SCFG_traintest grammar;
137 
138  if (al.present("-grammar"))
139  {
140  grammar.load(al.val("-grammar"));
141  }
142  else
143  {
144  cerr << "scfg_train: no grammar specified" << endl;
145  exit(1);
146  }
147 
148  if (al.present("-corpus"))
149  {
150  grammar.load_corpus(al.val("-corpus"));
151  }
152  else
153  {
154  cerr << "scfg_train: no corpus specified" << endl;
155  exit(1);
156  }
157 
158  if (al.present("-spread"))
159  spread = al.ival("-spread");
160  else
161  spread = 0;
162 
163  if (al.val("-method") == "inout")
164  {
165  int checkpoint = -1;
166  if (al.present("-checkpoint"))
167  checkpoint = al.ival("-checkpoint");
168 
169  grammar.train_inout(al.ival("-passes"),
170  al.ival("-startpass"),
171  checkpoint,spread,outfile);
172  }
173  else
174  {
175  cerr << "scfg_train: unknown training method \"" <<
176  al.val("-method") << "\"" << endl;
177  exit(1);
178  }
179 
180  if (grammar.save(outfile) != write_ok)
181  {
182  cerr << "scfg_train: failed to write grammar to \"" <<
183  outfile << "\"" << endl;
184  exit(1);
185  }
186 
187  return 0;
188 }