Edinburgh Speech Tools  2.4-release
 All Classes Functions Variables Typedefs Enumerations Enumerator Friends Pages
EST_SCFG.h
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1997 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Alan W Black */
34 /* Date : October 1997 */
35 /*-----------------------------------------------------------------------*/
36 /* */
37 /* Stochastic context free grammars */
38 /* */
39 /*=======================================================================*/
40 #ifndef __EST_SCFG_H__
41 #define __EST_SCFG_H__
42 
43 #include "EST_simplestats.h"
44 #include "EST_rw_status.h"
45 #include "EST_TList.h"
46 #include "siod.h"
47 
48 /** This class represents a bracketed string used in training of SCFGs.
49 
50  An object in this class builds an index of valid bracketing of
51  the string, thus offering both a tree like access and direct
52  access to the leafs of the tree. The definition of ``valid
53  bracketing'' is any substring \[ W_{i,j} \] that doesn't cross any
54  brackets.
55 */
57  private:
58  int p_length;
59  LISP *symbols;
60  LISP bs;
61  int **valid_spans; // triangular matrix
62  int find_num_nodes(LISP string);
63  int set_leaf_indices(LISP string,int i,LISP *symbols);
64  int num_leafs(LISP l) const;
65  void find_valid(int i,LISP t) const;
66  void init();
67  public:
68  ///
70  ///
71  EST_bracketed_string(LISP string);
72  ///
74 
75  ///
76  void set_bracketed_string(LISP string);
77  ///
78  int length() const {return p_length;}
79  ///
80  LISP string() const { return bs; }
81  /// The nth symbol in the string.
82  const EST_String symbol_at(int i) const
83  { return EST_String(get_c_string(car(symbols[i]))); }
84  /// If a bracketing from i to k is valid in string
85  int valid(int i,int k) const { return valid_spans[i][k]; }
86 
87  ///
88  int operator !=(const EST_bracketed_string &a) const
89  { return (!(this == &a)); }
90  int operator ==(const EST_bracketed_string &a) const
91  { return ((this == &a)); }
92  ///
93  friend ostream& operator << (ostream &s, const EST_bracketed_string &a)
94  { (void)a; s << "[a bracketed string]" << endl; return s; }
95 
96 };
97 
99 
100 // Only support Chomsky Normal Form at present
101 enum est_scfg_rtype {est_scfg_unset, est_scfg_binary_rule,
102  est_scfg_unary_rule};
103 
104 /** A stochastic context free grammar rule.
105 
106  At present only two types of rule are supported:
107  {\tt est\_scfg\_binary\_rule} and {\tt est\_scfg\_unary\_rule}.
108  This is sufficient for the representation of grammars in
109  Chomsky Normal Form. Each rule also has a probability associated
110  with it. Terminals and noterminals are represented as ints using
111  the \Ref{EST_Discrete}s in \Ref{EST_SCFG} to reference the actual
112  alphabets.
113 
114  Although this class includes a ``probability'' nothing in the rule
115  itself enforces it to be a true probability. It is responsibility
116  of the classes that use this rule to enforce that condition if
117  desired.
118 
119  @author Alan W Black (awb@cstr.ed.ac.uk): October 1997
120 */
122  private:
123  int p_mother;
124  int p_daughter1;
125  int p_daughter2;
126  est_scfg_rtype p_type;
127  double p_prob;
128  public:
129  ///
130  EST_SCFG_Rule() {p_type=est_scfg_unset; p_prob=0;}
131  ///
132  EST_SCFG_Rule(const EST_SCFG_Rule &r)
133  {p_mother = r.p_mother; p_daughter1 = r.p_daughter1;
134  p_daughter2 = r.p_daughter2; p_type=r.p_type; p_prob = r.p_prob;}
135  /// Create a unary rule.
136  EST_SCFG_Rule(double prob,int p,int m);
137  /// Create a binary rule.
138  EST_SCFG_Rule(double prob,int p, int q, int r);
139  /// The rule's probability
140  double prob() const {return p_prob;}
141  /// set the probability
142  void set_prob(double p) { p_prob=p;}
143  /// rule type
144  est_scfg_rtype type() const { return p_type; }
145  ///
146  int mother() const {return p_mother;}
147  /** In a unary rule this is a terminal, in a binary rule it
148  is a nonterminal
149  */
150  int daughter1() const {return p_daughter1;}
151  ///
152  int daughter2() const {return p_daughter2;}
153  ///
154  void set_rule(double prob,int p, int m);
155  ///
156  void set_rule(double prob,int p, int q, int r);
157 };
158 
160 
161 /** A class representing a stochastic context free grammar (SCFG).
162 
163  This class includes the representation of the grammar itself and
164  methods for training and testing it against some corpus.
165 
166  At presnet of grammars in Chomsky Normal Form are supported. That
167  is rules may be binary or unary. If binary the mother an two
168  daughters are nonterminals, if unary the mother must be nonterminal
169  and daughter a terminal symbol.
170 
171  The terminals and nonterminals symbol sets are derived automatically
172  from the LISP representation of the rules at initialization time
173  and are represented as \Ref{EST_Discrete}s. The distinguished
174  symbol is assumed to be the first mother of the first rule in
175  the given grammar.
176 
177 */
178 class EST_SCFG {
179  private:
180  EST_Discrete nonterminals;
181  EST_Discrete terminals;
182  int p_distinguished_symbol;
183  // Index of probabilities for binary rules in grammar
184  double ***p_prob_B;
185  // Index of probabilities for unary rules in grammar
186  double **p_prob_U;
187  // Build rule probability caches
188  void rule_prob_cache();
189  // Delete rule probability caches
190  void delete_rule_prob_cache();
191  public:
192  /**@name Constructor and initialisation functions */
193  //@{
194  EST_SCFG();
195  /// Initialize from a set of rules
196  EST_SCFG(LISP rules);
197  ~EST_SCFG();
198  //@}
199 
200  /**@name utility functions */
201  //@{
202  /// Set (or reset) rules from external source after construction
203  void set_rules(LISP rules);
204  /// Return rules as LISP list.
205  LISP get_rules();
206  /// The rules themselves
207  SCFGRuleList rules;
208  int distinguished_symbol() const { return p_distinguished_symbol; }
209  /** Find the terminals and nonterminals in the given grammar, adding
210  them to the appropriate given string lists.
211  */
212  void find_terms_nonterms(EST_StrList &nt, EST_StrList &t,LISP rules);
213  /// Convert nonterminal index to string form
214  EST_String nonterminal(int p) const { return nonterminals.name(p); }
215  /// Convert terminal index to string form
216  EST_String terminal(int m) const { return terminals.name(m); }
217  /// Convert nonterminal string to index
218  int nonterminal(const EST_String &p) const { return nonterminals.name(p); }
219  /// Convert terminal string to index
220  int terminal(const EST_String &m) const { return terminals.name(m); }
221  /// Number of nonterminals
222  int num_nonterminals() const { return nonterminals.length(); }
223  /// Number of terminals
224  int num_terminals() const { return terminals.length(); }
225  /// The rule probability of given binary rule
226  double prob_B(int p, int q, int r) const { return p_prob_B[p][q][r]; }
227  /// The rule probability of given unary rule
228  double prob_U(int p, int m) const { return p_prob_U[p][m]; }
229  /// (re-)set rule probability caches
230  void set_rule_prob_cache();
231  //@}
232 
233  /**@name file i/o functions */
234  //@{
235  /// Load grammar from named file
236  EST_read_status load(const EST_String &filename);
237  /// Save current grammar to named file
238  EST_write_status save(const EST_String &filename);
239  //@}
240 };
241 
242 /** A class used to train (and test) SCFGs is an extension of
243  \Ref{EST_SCFG}.
244 
245  This offers an implementation of Pereira and Schabes ``Inside-Outside
246  reestimation from partially bracket corpora.'' ACL 1992.
247 
248  A SCFG maybe trained from a corpus (optionally) containing brackets
249  over a series of passes reestimating the grammar probabilities
250  after each pass. This basically extends the \Ref{EST_SCFG} class
251  adding support for a bracket corpus and various indexes for efficient
252  use of the grammar.
253 */
254 class EST_SCFG_traintest : public EST_SCFG {
255  private:
256  /// Index for inside probabilities
257  double ***inside;
258  /// Index for outside probabilities
259  double ***outside;
260  EST_Bcorpus corpus;
261  /// Partial (numerator) for reestimation
262  EST_DVector n;
263  /// Partial (denominator) for reestimation
264  EST_DVector d;
265 
266  /// Calculate inside probability.
267  double f_I_cal(int c, int p, int i, int k);
268  /// Lookup or calculate inside probability.
269  double f_I(int c, int p, int i, int k)
270  { double r;
271  if ((r=inside[p][i][k]) != -1) return r;
272  else return f_I_cal(c,p,i,k); }
273  /// Calculate outside probability.
274  double f_O_cal(int c, int p, int i, int k);
275  /// Lookup or calculate outside probability.
276  double f_O(int c, int p, int i, int k)
277  { double r;
278  if ((r=outside[p][i][k]) != -1) return r;
279  else return f_O_cal(c,p,i,k); }
280  /// Find probability of parse of corpus sentence {\tt c}
281  double f_P(int c);
282  /** Find probability of parse of corpus sentence {\tt c} for
283  nonterminal {\tt p}
284  */
285  double f_P(int c,int p);
286  /// Re-estimate probability of binary rule using inside-outside algorithm
287  void reestimate_rule_prob_B(int c, int ri, int p, int q, int r);
288  /// Re-estimate probability of unary rule using inside-outside algorithm
289  void reestimate_rule_prob_U(int c, int ri, int p, int m);
290  /// Do grammar re-estimation
291  void reestimate_grammar_probs(int passes,
292  int startpass,
293  int checkpoint,
294  int spread,
295  const EST_String &outfile);
296  ///
297  double cross_entropy();
298  /// Initialize the cache for inside/outside values for sentence {\tt c}
299  void init_io_cache(int c,int nt);
300  /// Clear the cache for inside/outside values for sentence {\tt c}
301  void clear_io_cache(int c);
302  public:
305 
306  /** Test the current grammar against the current corpus print summary.
307 
308  Cross entropy measure only is given.
309  */
310  void test_corpus();
311  /** Test the current grammar against the current corpus.
312 
313  Summary includes percentage of cross bracketing accuracy
314  and percentage of fully correct parses.
315  */
316  void test_crossbrackets();
317 
318  /** Load a corpus from the given file.
319 
320  Each sentence in the corpus should be contained in parentheses.
321  Additional parenthesis may be used to denote phrasing within
322  a sentence. The corpus is read using the LISP reader so LISP
323  conventions shold apply, notable single quotes should appear
324  within double quotes.
325  */
326  void load_corpus(const EST_String &filename);
327 
328  /** Train a grammar using the loaded corpus.
329 
330  @param passes the number of training passes desired.
331  @param startpass from which pass to start from
332  @param checkpoint save the grammar every n passes
333  @param spread Percentage of corpus to use on each pass,
334  this cycles through the corpus on each pass.
335  */
336  void train_inout(int passes,
337  int startpass,
338  int checkpoint,
339  int spread,
340  const EST_String &outfile);
341 };
342 
343 /** From a full parse, extract the string with bracketing only.
344 */
345 LISP scfg_bracketing_only(LISP parse);
346 /** Cummulate cross bracketing information between ref and test.
347  */
348 void count_bracket_crossing(const EST_bracketed_string &ref,
349  const EST_bracketed_string &test,
350  EST_SuffStats &vs);
351 
352 #endif