Edinburgh Speech Tools  2.4-release
 All Classes Functions Variables Typedefs Enumerations Enumerator Friends Pages
EST_WFST.h
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1997 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Alan W Black */
34 /* Date : November 1997 */
35 /*-----------------------------------------------------------------------*/
36 /* */
37 /* Weighted Finite State Transducers */
38 /* */
39 /*=======================================================================*/
40 #ifndef __EST_WFST_H__
41 #define __EST_WFST_H__
42 
43 #include "EST_simplestats.h"
44 #include "EST_rw_status.h"
45 #include "EST_Option.h"
46 #include "EST_TList.h"
47 #include "EST_TVector.h"
48 #include "EST_THash.h"
49 #include "siod.h"
50 #define wfst_error_msg(WMESS) (cerr << WMESS << endl,siod_error())
51 
52 #define WFST_ERROR_STATE -1
53 
54 class EST_WFST_State;
55 class EST_WFST;
56 
57 /** an internal class for \Ref{EST_WFST} for representing transitions
58  in an WFST
59  */
61  private:
62  float p_weight;
63  int p_state;
64  int p_in_symbol;
65  int p_out_symbol;
66  public:
69  { p_weight=t.p_weight; p_state=t.p_state;
70  p_in_symbol = t.p_in_symbol; p_out_symbol=t.p_out_symbol; }
71  EST_WFST_Transition(float w, int s, int i, int o)
72  { p_weight=w; p_state=s; p_in_symbol=i; p_out_symbol=o;}
73  ~EST_WFST_Transition() { };
74  float weight() const { return p_weight; }
75  int state() const { return p_state; }
76  int in_symbol() const { return p_in_symbol; }
77  int out_symbol() const { return p_out_symbol; }
78  void set_weight(float f) { p_weight = f; }
79  void set_state(int s) { p_state = s; }
80 
81 };
83 
84 enum wfst_state_type {wfst_final, wfst_nonfinal, wfst_error, wfst_licence};
85 /** I'd like to use the enums but I need to binary read/write them **/
86 /** I don't believe that's portable so we need to have ints for these **/
87 #define WFST_FINAL 0
88 #define WFST_NONFINAL 1
89 #define WFST_ERROR 2
90 #define WFST_LICENCE 3
91 
92 
93 /** an internal class for \Ref{EST_WFST} used to represent a
94  state in a WFST
95 */
97  private:
98  int p_name;
99  enum wfst_state_type p_type;
100  int p_tag; // for marking in traversing
101  public:
102  wfst_translist transitions;
103 
104  EST_WFST_State(int name);
105  EST_WFST_State(const EST_WFST_State &state);
106  ~EST_WFST_State();
107 
108  EST_WFST_Transition *add_transition(float w,
109  int end,
110  int in,
111  int out);
112  int name() const { return p_name; }
113  int num_transitions() const { return transitions.length(); }
114  enum wfst_state_type type() const { return p_type; }
115  void set_type(wfst_state_type t) { p_type = t; }
116  void set_tag(int v) { p_tag = v;}
117  int tag() const { return p_tag;}
118 };
120 
122 enum wfst_mstate_type {wfst_ms_set, wfst_ms_list};
123 
124 /** an internal class to \Ref{EST_WFST} used in holding multi-states
125  when determinizing and find the intersections of other WFSTs.
126  */
128  private:
129  int p_name;
130  float p_weight;
131  enum wfst_mstate_type p_type;
132  public:
134  { p_name = -1; p_weight = 0.0; p_type = wfst_ms_set; }
135  EST_WFST_MultiState(enum wfst_mstate_type ty) : EST_IList()
136  { p_name = -1; p_weight = 0.0; p_type = ty; }
137  int name() const { return p_name; }
138  void set_name(int i) { p_name = i; }
139  float weight() const { return p_weight; }
140  void set_weight(float w) { p_weight = w; }
141  void set_type(enum wfst_mstate_type s) { p_type = s; }
142  enum wfst_mstate_type type() const { return p_type; }
143  void add(int i);
144 };
145 
146 int multistate_index(EST_WFST_MultiStateIndex &i,EST_WFST_MultiState *ms);
147 
148 /** a call representing a weighted finite-state transducer
149  */
150 class EST_WFST {
151  private:
152  EST_Discrete p_in_symbols;
153  EST_Discrete p_out_symbols;
154  int p_start_state;
155  int current_tag;
156  int p_num_states;
157  int p_cumulate;
158  wfst_state_vector p_states;
159 
160  int operator_and(LISP l);
161  int operator_or(LISP l);
162  int operator_star(LISP l);
163  int operator_plus(LISP l);
164  int operator_optional(LISP l);
165  int operator_not(LISP l);
166  int terminal(LISP l);
167  EST_WFST_State *copy_and_map_states(const EST_IVector &state_map,
168  const EST_WFST_State *s,
169  const EST_WFST &b) const;
170  void extend_alphabets(const EST_WFST &b);
171  int deterministiconstartstates(const EST_WFST &a, const EST_WFST &b) const;
172  EST_read_status load_transitions_from_lisp(int s, LISP trans);
173  void more_states(int new_max);
174 
175  int can_reach_final(int state);
176  static int traverse_tag;
177  public:
178  /**@name Constructor and initialisation functions */
179  //@{
180  /// ?
181  EST_WFST();
182  /// ?
183  EST_WFST(const EST_WFST &wfst) { p_num_states = 0; copy(wfst); }
184  ~EST_WFST();
185  //@}
186 
187  /**@name Reseting functions */
188  //@{
189  /// Clear with (estimation of number of states required)
190  void init(int init_num_states=10);
191  /// clear an initialise with given input and out alphabets
192  void init(LISP in, LISP out);
193  /// Copy from existing wfst
194  void copy(const EST_WFST &wfst);
195  /// clear removing existing states if any
196  void clear();
197  //@}
198 
199  /**@name General utility functions */
200  //@{
201  int num_states() const { return p_num_states; }
202  int start_state() const { return p_start_state; }
203  /// Map input symbol to input alphabet index
204  int in_symbol(const EST_String &s) const
205  { return p_in_symbols.name(s); }
206  /// Map input alphabet index to input symbol
207  const EST_String &in_symbol(int i) const
208  { return p_in_symbols.name(i); }
209  /// Map output symbol to output alphabet index
210  int out_symbol(const EST_String &s) const
211  { return p_out_symbols.name(s); }
212  /// Map output alphabet index to output symbol
213  const EST_String &out_symbol(int i) const
214  { return p_out_symbols.name(i); }
215  /// LISP for on epsilon symbols
216  LISP epsilon_label() const { return rintern("__epsilon__"); }
217  /// Internal index for input epsilon
218  int in_epsilon() const { return p_in_symbols.name("__epsilon__"); }
219  /// Internal index for output epsilon
220  int out_epsilon() const { return p_out_symbols.name("__epsilon__"); }
221  /// Return internal state information
222  const EST_WFST_State *state(int i) const { return p_states(i); }
223  /// Return internal state information (non-const)
224  EST_WFST_State *state_non_const(int i) { return p_states(i); }
225  /// True if state {\tt i} is final
226  int final(int i) const
227  { return ((i != WFST_ERROR_STATE) && (state(i)->type() == wfst_final));}
228  /// Accessing the input alphabet
229  const EST_Discrete &in_symbols() const { return p_in_symbols; }
230  /// Accessing the output alphabet
231  const EST_Discrete &out_symbols() const { return p_out_symbols; }
232 
233  //@}
234 
235  /**@name file i/o */
236  //@{
237  /// ?
238  EST_write_status save(const EST_String &filename,
239  const EST_String type = "ascii");
240  EST_write_status save_binary(FILE *fd);
241  /// ?
242  EST_read_status load(const EST_String &filename);
243 
244  EST_read_status load_binary(FILE *fd,
245  EST_Option &hinfo,
246  int num_states,
247  int swap);
248  //@}
249 
250  /**@name transduction functions */
251  //@{
252  /// Find (first) new state given in and out symbols
253  int transition(int state,int in, int out) const;
254  int transition(int state,int in, int out, float &prob) const;
255  /// Find (first) transition given in and out symbols
256  EST_WFST_Transition *find_transition(int state,int in, int out) const;
257  /// Find (first) new state given in and out strings
258  int transition(int state,const EST_String &in,const EST_String &out) const;
259  /// Find (first) new state given in/out string
260  int transition(int state,const EST_String &inout) const;
261  /// Transduce in to out from state
262  int transduce(int state,int in,int &out) const;
263  /// Transduce in to out (strings) from state
264  int transduce(int state,const EST_String &in,EST_String &out) const;
265  /// Transduce in to list of transitions
266  void transduce(int state,int in,wfst_translist &out) const;
267  /// Find all possible transitions for given state/input/output
268  void transition_all(int state,int in, int out,
269  EST_WFST_MultiState *ms) const;
270 
271  //@}
272 
273  /**@name Cumulation functions for adding collective probabilities
274  for transitions from data */
275  //@{
276  /// Cumulation condition
277  int cumulate() const {return p_cumulate;}
278  /// Clear and start cumulation
279  void start_cumulate();
280  /// Stop cumulation and calculate probabilities on transitions
281  void stop_cumulate();
282  //@}
283 
284  /**@name WFST construction functions from external representations **/
285  //@{
286  /// Add a new state, returns new name
287  int add_state(enum wfst_state_type state_type);
288  /// Given a multi-state return type (final, ok, error)
289  enum wfst_state_type ms_type(EST_WFST_MultiState *ms) const;
290 
291  /// Basic regex constructor
292  void build_wfst(int start, int end,LISP regex);
293  /// Basic conjunction constructor
294  void build_and_transition(int start, int end, LISP conjunctions);
295  /// Basic disjunction constructor
296  void build_or_transition(int start, int end, LISP disjunctions);
297 
298  // from standard REGEX
299  void build_from_regex(LISP inalpha, LISP outalpha, LISP regex);
300  // Kay/Kaplan/Koskenniemi rule compile
301  void kkrule_compile(LISP inalpha, LISP outalpha, LISP fp,
302  LISP rule, LISP sets);
303  // Build from regular (or pseudo-CF) grammar
304  void build_from_rg(LISP inalpha, LISP outalpha,
305  LISP distinguished, LISP rewrites,
306  LISP sets, LISP terms,
307  int max_depth);
308  // Build simple tree lexicon
309  void build_tree_lex(LISP inalpha, LISP outalpha,
310  LISP wlist);
311  //@}
312 
313  /**@name Basic WFST operators */
314  //@{
315  /// Build determinized form of a
316  void determinize(const EST_WFST &a);
317  /// Build minimized form of a
318  void minimize(const EST_WFST &a);
319  /// Build complement of a
320  void complement(const EST_WFST &a);
321  /** Build intersection of all WFSTs in given list. The new WFST
322  recognizes the only the strings that are recognized by all WFSTs
323  in the given list */
325  /** Build intersection of WFSTs a and b The new WFST
326  recognizes the only the strings that are recognized by both a
327  and b list */
328  void intersection(const EST_WFST &a, const EST_WFST &b);
329  /** Build union of all WFSTs in given list. The new WFST
330  recognizes the only the strings that are recognized by at least
331  one WFSTs in the given list */
332  void uunion(EST_TList<EST_WFST> &wl);
333  /** Build union of WFSTs a and b. The new WFST
334  recognizes the only the strings that are recognized by either
335  a or b */
336  void uunion(const EST_WFST &a, const EST_WFST &b);
337  /** Build new WFST by composition of a and b. Outputs of a are
338  fed to b, given a new WFSTs which has a's input language and b's
339  output set. a's output and b's input alphabets must be the same */
340  void compose(const EST_WFST &a,const EST_WFST &b);
341  /** Build WFST that accepts only strings in a that aren't also accepted
342  by strings in b. */
343  void difference(const EST_WFST &a,const EST_WFST &b);
344  /** Build WFST that accepts a language that consists of any string in
345  a followed by any string in b **/
346  void concat(const EST_WFST &a,const EST_WFST &b);
347  //@}
348 
349  /**@name construction support functions */
350  //@{
351  /// True if WFST is deterministic
352  int deterministic() const;
353  /// Transduce a multi-state given n and out
356  int in, int out) const;
357  /// Extend multi-state with epsilon reachable states
359  /// Remove error states from the WFST.
360  void remove_error_states(const EST_WFST &a);
361 
362  EST_String summary() const;
363  /// ?
364  EST_WFST & operator = (const EST_WFST &a) { copy(a); return *this; }
365 };
367 
368 // The basic operations on WFST
369 void minimize(EST_WFST &a,EST_WFST &b);
370 void determinize(EST_WFST &a,EST_WFST &b);
371 void concat(EST_WFST &a,EST_WFST &b,EST_WFST &c);
372 void compose(EST_WFST &a,EST_WFST &b,EST_WFST &c);
373 void intersect(wfst_list &wl,EST_WFST &wfst);
374 void complement(EST_WFST &a,EST_WFST &b);
375 void difference(EST_WFST &a,EST_WFST &b,EST_WFST &c);
376 void concatenate(EST_WFST &a,EST_WFST &b,EST_WFST &c);
377 
378 // Compile a set of KK rules
379 void kkcompile(LISP ruleset, EST_WFST &all_wfst);
380 // Compile a set of LTS rules
381 void ltscompile(LISP lts_rules, EST_WFST &all_wfst);
382 // Compile a regular grammar
383 void rgcompile(LISP rg, EST_WFST &all_wfst);
384 // Compile a tree lexicon
385 void tlcompile(LISP rg, EST_WFST &all_wfst);
386 
387 // Transduction and recognition functions
388 int transduce(const EST_WFST &wfst,const EST_StrList &in,EST_StrList &out);
389 int transduce(const EST_WFST &wfst,const EST_IList &in,EST_IList &out);
390 int recognize(const EST_WFST &wfst,const EST_StrList &in,int quiet);
391 int recognize(const EST_WFST &wfst,const EST_IList &in,
392  const EST_IList &out,int quite);
393 int recognize_for_perplexity(const EST_WFST &wfst,
394  const EST_StrList &in,
395  int quiet,
396  float &count,
397  float &sumlogp);
398 int recognize_for_perplexity(const EST_WFST &wfst,
399  const EST_IList &in,
400  const EST_IList &out,
401  int quiet,
402  float &count,
403  float &sumlogp);
404 
405 VAL_REGISTER_CLASS_DCLS(wfst,EST_WFST)
406 
407 #endif