Edinburgh Speech Tools  2.4-release
 All Classes Functions Variables Typedefs Enumerations Enumerator Friends Pages
kkcompile.cc
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1996-1998 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Alan W Black */
34 /* Date : December 1997 */
35 /*-----------------------------------------------------------------------*/
36 /* */
37 /* A Koskenniemi/Kay/Kaplan rule compiler to WFST using the techniques */
38 /* Ritchie et al.'s "Computational Morphology" (but followed through to */
39 /* make real WFSTs). */
40 /* */
41 /*=======================================================================*/
42 #include <iostream>
43 #include "EST_WFST.h"
44 #include "EST_cutils.h"
45 
46 ostream &operator << (ostream &s, const EST_WFST &w)
47 {
48  (void)w;
49  return s << "<<EST_WFST>>";
50 }
51 
52 Declare_TList(EST_WFST)
53 
54 #if defined(INSTANTIATE_TEMPLATES)
55 #include "../base_class/EST_TList.cc"
56 
57 Instantiate_TList(EST_WFST)
58 
59 #endif
60 
61 static LISP expand_fp(const EST_String p,LISP fp);
62 static LISP find_feasible_pairs(LISP rules);
63 static LISP all_but(LISP rulepair,LISP fp);
64 static LISP expand_sets(LISP sets,LISP fp);
65 static LISP inline_sets(LISP l, LISP sets);
66 static void full_kkcompile(LISP inalpha,LISP outalpha,
67  LISP fp, LISP rules, LISP sets,
68  EST_WFST &all_wfst);
69 
70 void kkcompile(LISP ruleset, EST_WFST &all_wfst)
71 {
72  // Build a transducer from given kkrule (Kay/Kaplan/Koskenniemi)
73  // Rules are of the form LeftContext Map RightContext
74 
75  // The WFST is recognizing all string except the rulepair unless
76  // its in the proper context.
77  LISP fp; // feasible pairs, those pairs with rules (rather than IxO)
78  LISP inalpha = siod_nth(1,siod_assoc_str("Alphabets",cdr(cdr(ruleset))));
79  LISP outalpha = siod_nth(2,siod_assoc_str("Alphabets",cdr(cdr(ruleset))));
80  LISP sets = cdr(siod_assoc_str("Sets",ruleset));
81  LISP rules = cdr(siod_assoc_str("Rules",ruleset));
82 
83  fp = find_feasible_pairs(rules);
84  sets = expand_sets(sets,fp);
85 
86  full_kkcompile(inalpha,outalpha,fp,rules,sets,all_wfst);
87 }
88 
89 static void full_kkcompile(LISP inalpha,LISP outalpha,
90  LISP fp, LISP rules, LISP sets,
91  EST_WFST &all_wfst)
92 {
93  wfst_list rulelist;
94  LISP r;
95 
96  for (r=rules; r != NIL; r=cdr(r))
97  {
98  EST_WFST r_wfst,base_wfst,det_wfst;
99  rulelist.append(r_wfst);
100  EST_WFST &rr_wfst = rulelist.last(); // to avoid copying the filled one
101  cout << "Rule: " << siod_llength(rules)-siod_llength(r) << endl;
102  pprint(car(r));
103  base_wfst.kkrule_compile(inalpha,outalpha,fp,car(r),sets);
104  cout << " base " << base_wfst.summary() << endl;
105  det_wfst.determinize(base_wfst);
106  cout << " determinized " << det_wfst.summary() << endl;
107  rr_wfst.minimize(det_wfst);
108  cout << " minimized " << rr_wfst.summary() << endl;
109  }
110 
111  cout << "WFST: intersecting " << rulelist.length() << " rules" << endl;
112  EST_Litem *p,*nnp;
113  int i;
114  for (i=0,p=rulelist.head(); p->next() != 0; p=nnp)
115  {
116  EST_WFST r_wfst,base_wfst,det_wfst;
117  EST_WFST mmm;
118  rulelist.append(r_wfst);
119  EST_WFST &rr_wfst = rulelist.last(); // to avoid copying the filled one
120  cout << "intersecting " << i << " and " << i+1 << " " <<
121  rulelist.length()-2 << " left" << endl;
122  cout << " " << rulelist(p).summary() << " and " << endl;
123  cout << " " << rulelist(p->next()).summary() << " becomes " << endl;
124  mmm.intersection(rulelist(p),rulelist(p->next()));
125  cout << " " << mmm.summary() << " minimizes to " << endl;
126  rr_wfst.minimize(mmm);
127  cout << " " << rr_wfst.summary() << endl;
128  nnp=p->next()->next();
129  i+=2;
130  rulelist.remove(p->next());
131  rulelist.remove(p);
132  }
133 
134  all_wfst = rulelist.first();
135 
136 }
137 
138 static LISP expand_sets(LISP sets,LISP fp)
139 {
140  // Expand sets into regexes that represent them. Single
141  // char values are converted to disjunctions of feasible pairs
142  // that have the same surface character
143  LISP s,es=NIL,e,ne;
144 
145  for (s=sets; s != NIL; s=cdr(s))
146  {
147  for (ne=NIL,e=cdr(car(s)); e != NIL; e=cdr(e))
148  {
149  EST_String ss = get_c_string(car(e));
150  if (ss.contains("/"))
151  ne = cons(car(e),ne);
152  else
153  ne = append(expand_fp(ss,fp),ne);
154  }
155  if (ne == NIL)
156  {
157  cerr << "WFST: kkcompile: set " << get_c_string(car(car(s))) <<
158  " has no feasible pairs" << endl;
159  }
160 
161  else if (siod_llength(ne) == 1)
162  es = cons(cons(car(car(s)),ne),es);
163  else
164  es = cons(cons(car(car(s)),
165  cons(cons(rintern("or"),reverse(ne)),
166  NIL)),es);
167  }
168 
169  return reverse(es);
170 }
171 
172 static LISP expand_fp(const EST_String p,LISP fp)
173 {
174  // Find all fp's that have this p as their surface char
175  LISP m=NIL,f;
176  EST_Regex rg(EST_String("^")+p+"/.*");
177 
178  for (f=fp; f != NIL; f=cdr(f))
179  {
180  EST_String ss = get_c_string(car(f));
181  if ((p == ss) || (ss.matches(rg)))
182  m = cons(car(f),m);
183  }
184  return m;
185 }
186 
187 static LISP find_feasible_pairs(LISP rules)
188 {
189  // Find the set of pairs that have rules associated with them
190  // This effectively defines the transducer alphabet.
191  LISP fp = NIL;
192  LISP r;
193 
194  for (r=rules; r != NIL; r=cdr(r))
195  {
196  if (siod_member_str(get_c_string(siod_nth(0,car(r))),fp) == NIL)
197  fp = cons(siod_nth(0,car(r)),fp);
198  }
199  return fp;
200 }
201 
202 static int surface_coercion(LISP rt)
203 {
204  return (streq("<=",get_c_string(rt)));
205 }
206 
207 static int context_restriction(LISP rt)
208 {
209  return (streq("=>",get_c_string(rt)));
210 }
211 
212 static int composite(LISP rt)
213 {
214  return (streq("<=>",get_c_string(rt)));
215 }
216 
217 static LISP inline_sets(LISP l, LISP sets)
218 {
219  // Replace any set name with the regex equivalent
220  LISP s;
221  if (l == NIL)
222  return NIL;
223  else if (consp(l))
224  return cons(inline_sets(car(l),sets),inline_sets(cdr(l),sets));
225  else if ((s=siod_assoc_str(get_c_string(l),sets)) != NIL)
226  return car(cdr(s));
227  else
228  return l;
229 }
230 
231 void EST_WFST::kkrule_compile(LISP inalpha, LISP outalpha, LISP fp,
232  LISP rule,LISP sets)
233 {
234  // Build a WFST to transduce this particular rule
235  // Accepts any other combination of feasible pairs too
236  LISP leftcontext = inline_sets(siod_nth(2,rule),sets);
237  LISP rulepair = siod_nth(0,rule);
238  LISP ruletype = siod_nth(1,rule);
239  LISP rightcontext = inline_sets(siod_nth(4,rule),sets);
240  LISP p;
241  int i;
242  int end_LC,end_RP,end_NOTRP,end_RC,err_state;
243 
244  // Initialize alphabets
245  init(inalpha,outalpha); // should be passed as discretes
246 
247  p_start_state = add_state(wfst_final); // empty WFST
248  // Add transitions for all pairs except rulepair
249  for (p=fp; p != NIL; p=cdr(p))
250  if ((!equal(rulepair,car(p))) ||
251  (surface_coercion(ruletype)))
252  build_wfst(p_start_state,p_start_state,car(p));
253 
254  // build for LC
255  if (leftcontext)
256  {
257  end_LC = add_state(wfst_final);
258  build_wfst(p_start_state,end_LC,leftcontext);
259  // for all states in LC mark final & add epsilon to p_start_state
260  for (i=end_LC; i < p_num_states; i++)
261  {
262  build_wfst(i,p_start_state,epsilon_label());
263  p_states[i]->set_type(wfst_final);
264  }
265  }
266  else // no LC
267  end_LC = p_start_state;
268 
269  // build for RP and RC from end_LC
270  if (composite(ruletype) || context_restriction(ruletype))
271  {
272  if (rightcontext)
273  {
274  end_RP = add_state(wfst_nonfinal);
275  build_wfst(end_LC,end_RP,rulepair);
276  // build for RC from end map to p_start_state
277  build_wfst(end_RP,p_start_state,rightcontext);
278  err_state = add_state(wfst_error);
279  for (i=end_RP; i < err_state; i++)
280  { // for everything other that the correct path go to err_state
281  // without this explicit error state the epsilon to start
282  // allows almost everything
283  if (transition(i,get_c_string(epsilon_label()))
284  != WFST_ERROR_STATE)
285  break; // not a state require extra transitions
286  for (p=fp; p != NIL; p=cdr(p))
287  if (transition(i,get_c_string(car(p))) == WFST_ERROR_STATE)
288  build_wfst(i,err_state,car(p));
289  build_wfst(i,p_start_state,epsilon_label());
290  p_states[i]->set_type(wfst_licence);
291  }
292  }
293  else // no RC, so end back at start
294  build_wfst(end_LC,p_start_state,rulepair);
295  }
296 
297  // Build for notRP and RC from end_LC
298  if (composite(ruletype) || surface_coercion(ruletype))
299  {
300  LISP abrp = all_but(rulepair,fp);
301  if (abrp)
302  {
303  if (rightcontext)
304  {
305  end_RC = add_state(wfst_error);
306  end_NOTRP = add_state(wfst_nonfinal);
307  build_wfst(end_LC,end_NOTRP,abrp);
308  // build for RC from end RP to error state
309  build_wfst(end_NOTRP,end_RC,rightcontext);
310  // for all states in RC except final one mark final & add
311  // epsilon to p_start_state
312  for (i=end_NOTRP; i < p_num_states; i++)
313  {
314  build_wfst(i,p_start_state,epsilon_label());
315  p_states[i]->set_type(wfst_final);
316  }
317  }
318  else // no RC,
319  {
320  end_RC = add_state(wfst_error);
321  build_wfst(end_LC,end_RC,abrp);
322  }
323  }
324  }
325 }
326 
327 static LISP all_but(LISP rulepair,LISP fp)
328 {
329  // Returns pairs that have the same surface symbol as rulepair
330  // but different lexical symbol
331  LISP r,notrp=NIL;
332  EST_String s,l,p,sr,lr,rr;
333 
334  p = get_c_string(rulepair);
335  if (p.contains("/"))
336  {
337  s = p.before("/");
338  l = p.after("/");
339  }
340  else
341  {
342  s = p;
343  l = p;
344  }
345 
346  for (r=fp; r != NIL; r = cdr(r))
347  {
348  rr = get_c_string(car(r));
349  if (rr.contains("/"))
350  {
351  sr = rr.before("/");
352  lr = rr.after("/");
353  }
354  else
355  {
356  sr = rr;
357  lr = rr;
358  }
359  if ((l != lr) && (s == sr))
360  notrp = cons(car(r),notrp);
361  }
362 
363  if (siod_llength(notrp) > 1)
364  notrp = cons(strintern("or"),notrp);
365  return notrp;
366 }
367 
368 void intersect(wfst_list &wl, EST_WFST &all)
369 {
370  // Intersect the wfst's in wl into all
371 
372  all.intersection(wl);
373 
374 }
375