Edinburgh Speech Tools  2.4-release
 All Classes Functions Variables Typedefs Enumerations Enumerator Friends Pages
ltscompile.cc
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1996 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Alan W Black */
34 /* Date : December 1997 */
35 /*-----------------------------------------------------------------------*/
36 /* */
37 /* A LTS rule compiler, where rules are contextual rewrite rules. Rules */
38 /* are of the for LC [ x ] RC => y where LC and RC are regexs on the */
39 /* tape only and x and y are simple strings on symbols. That is the */
40 /* standard form of LTS rules used in Festival. */
41 /* */
42 /*=======================================================================*/
43 #include <iostream>
44 #include "EST_cutils.h"
45 #include "EST_WFST.h"
46 
47 static LISP lts_find_feasible_pairs(LISP rules);
48 static LISP make_fp(LISP in, LISP out);
49 static LISP find_outs(LISP rule);
50 static LISP find_ins(LISP rule);
51 static LISP add_alpha(LISP n, LISP existing);
52 static LISP lts_find_alphabets(LISP rules);
53 static void ltsrule_compile(LISP inalpha, LISP outalpha,
54  LISP fp, LISP sets, LISP rule,
55  EST_WFST &a, EST_WFST &not_a);
56 static LISP analyse_rule(LISP rule);
57 static LISP expand_sets(LISP l, LISP fp, LISP sets);
58 static LISP expand_set(LISP p, LISP fp, LISP sets);
59 static LISP find_notMAP(LISP MAP,LISP fp);
60 
61 void ltscompile(LISP lts_rules, EST_WFST &all_wfst)
62 {
63  // Build a transducer from given LTS rules. Because the interpretation
64  // of these rules is normally ordered and the WFST is not, the
65  // complement of each cumulative WFST must be generated before
66  // adding the next rule
67  LISP r;
68  LISP fp; // feasible pairs, those pairs with rules (rather than IxO)
69  LISP inalpha,outalpha,alphas;
70  LISP sets=siod_nth(2,lts_rules);
71  LISP rules=siod_nth(3,lts_rules);
72  EST_WFST nots;
73 
74  alphas = lts_find_alphabets(lts_rules);
75  inalpha = car(alphas);
76  outalpha = cdr(alphas);
77 
78  fp = lts_find_feasible_pairs(rules);
79 
80  // set up an empty WFST, accepts nothing
81  all_wfst.build_from_regex(inalpha,outalpha,NIL);
82  // matches things not matched by rules, everything to begin with
83  nots.build_from_regex(inalpha,outalpha,
84  cons(rintern("*"),
85  cons(cons(rintern("or"),fp),NIL)));
86  nots.save("-");
87 
88  for (r=rules; r != NIL; r=cdr(r))
89  {
90  EST_WFST a, not_a,b,c,d;
91 
92  // all = all u (all' n r)
93  ltsrule_compile(inalpha,outalpha,fp,sets,car(r),a,not_a);
94  pprint(car(r));
95  a.save("-");
96  c.intersection(a,nots);
97  c.save("-");
98 
99  // Add for next rule
100  b.uunion(nots,not_a);
101  not_a.save("-");
102  b.save("-");
103  nots = b;
104 
105  d.uunion(all_wfst,c);
106  all_wfst = d;
107  all_wfst.save("-");
108  }
109 }
110 
111 static LISP lts_find_alphabets(LISP rules)
112 {
113  // Find the alphabets used in the rules
114  LISP r;
115  LISP in=NIL, out=NIL;
116 
117  for (r=siod_nth(3,rules); r != NIL; r=cdr(r))
118  {
119  in = add_alpha(find_ins(car(r)),in);
120  out = add_alpha(find_outs(car(r)),out);
121  }
122 
123  return cons(in,out);
124 }
125 
126 static LISP add_alpha(LISP n, LISP existing)
127 {
128  // Add values in n if not already in existing
129  LISP t;
130  LISP e=existing;
131 
132  for (t=n; t != NIL; t=cdr(t))
133  if (!siod_member_str(get_c_string(car(t)),e))
134  e = cons(car(t),e);
135 
136  return e;
137 }
138 
139 static LISP find_ins(LISP rule)
140 {
141  // find all symbols in [] in rule
142  LISP c;
143  int state=FALSE;
144  LISP ins = NIL;
145 
146  for (c=rule; c != NIL; c=cdr(c))
147  {
148  if (streq("[",get_c_string(car(c))))
149  state=TRUE;
150  else if (streq("]",get_c_string(car(c))))
151  break;
152  else if (state)
153  ins = cons(car(c),ins);
154  }
155  return reverse(ins);
156 }
157 
158 static LISP find_outs(LISP rule)
159 {
160  // find all symbols after = rule
161  LISP c;
162  int state=FALSE;
163  LISP outs = NIL;
164 
165  for (c=rule; c != NIL; c=cdr(c))
166  {
167  if (streq("=",get_c_string(car(c))))
168  state=TRUE;
169  else if (state)
170  outs = cons(car(c),outs);
171  }
172  return reverse(outs);
173 }
174 
175 static LISP lts_find_feasible_pairs(LISP rules)
176 {
177  // Find the set of pairs that have rules associated with them
178  // This effectively defines the transducer alphabet.
179  // We take the surface part in [] and the part after the = and
180  // linearly match them to form a set of pairs, padded with epsilon
181  // if necessary.
182  LISP fp = NIL;
183  LISP r;
184 
185  for (r=rules; r != NIL; r=cdr(r))
186  {
187  LISP in = find_ins(car(r));
188  LISP out = find_outs(car(r));
189 
190  LISP pairs = make_fp(in,out);
191 
192  for (LISP p=pairs; p != NIL; p=cdr(p))
193  {
194  if (!siod_member_str(get_c_string(car(p)),fp))
195  fp = cons(car(p),fp);
196  }
197  }
198  return fp;
199 }
200 
201 static LISP make_fp(LISP in, LISP out)
202 {
203  // Returns a list of pairs by matching each member of in to out
204  // padding the shorted one with _epsilon_ if necessary
205  LISP i,o;
206  LISP fp=NIL;
207  EST_String is,os;
208  int m;
209 
210  if (siod_llength(in) > siod_llength(out))
211  m = siod_llength(in);
212  else
213  m = siod_llength(out);
214 
215  for (i=in,o=out ; m > 0; --m,i=cdr(i),o=cdr(o))
216  {
217  if (i == NIL)
218  is = "__epsilon__";
219  else
220  is = get_c_string(car(i));
221  if (o == NIL)
222  os = "__epsilon__";
223  else
224  os = get_c_string(car(o));
225  fp = cons(strintern(is+"/"+os),fp);
226  }
227  return reverse(fp);
228 }
229 
230 static void ltsrule_compile(LISP inalpha, LISP outalpha,
231  LISP fp, LISP sets, LISP rule,
232  EST_WFST &a, EST_WFST &not_a)
233 {
234  // Return two regexs, one matching with rewrites and another
235  // that matches things this rule doesn't match.
236  LISP LC,MAP,RC,notMAP,r;
237 
238  r = analyse_rule(rule);
239  LC = siod_nth(0,r);
240  MAP = siod_nth(1,r);
241  RC = siod_nth(2,r);
242 
243  LC = expand_sets(LC,fp,sets);
244  RC = expand_sets(RC,fp,sets);
245  notMAP = find_notMAP(MAP,fp);
246 
247 
248  LISP kk = cons(LC,cons(MAP,cons(RC,NIL)));
249  cout << "kk rule" << endl;
250  pprint(kk);
251  a.kkrule_compile(inalpha,outalpha,fp,kk,NIL);
252 
253  // (or (* <fp>) (not <rule>)) ;; everything except the rule
254  LISP regex_r = cons(rintern("and"),append(LC,append(MAP,RC)));
255 // LISP nn = cons(rintern("or"),
256 // cons(cons(rintern("*"),cons(cons(rintern("or"),fp),NIL)),
257 // cons(cons(rintern("not"),cons(regex_r,NIL)),
258 // NIL)));
259  LISP nn = cons(rintern("not"),cons(regex_r,NIL));
260  not_a.build_from_regex(inalpha,outalpha,nn);
261 
262 }
263 
264 static LISP analyse_rule(LISP rule)
265 {
266  // return the left context, map and right context;
267  LISP LC=NIL, RC=NIL, in=NIL, out=NIL;
268  LISP l;
269  int state=0;
270 
271  for (l=rule; l != NIL; l=cdr(l))
272  {
273  if ((state==0) && (!streq("[",get_c_string(car(l)))))
274  LC = cons(car(l),LC);
275  else if ((state==0) && (streq("[",get_c_string(car(l)))))
276  state = 1;
277  else if ((state==1) && (!streq("]",get_c_string(car(l)))))
278  in = cons(car(l),in);
279  else if ((state==1) && (streq("]",get_c_string(car(l)))))
280  state = 2;
281  else if ((state==2) && (!streq("=",get_c_string(car(l)))))
282  RC = cons(car(l),RC);
283  else if ((state==2) && (streq("=",get_c_string(car(l)))))
284  state = 3;
285  else if (state == 3)
286  {
287  out = l;
288  break;
289  }
290  }
291 
292  return cons(reverse(LC),
293  cons(make_fp(reverse(in),out),
294  cons(reverse(RC),NIL)));
295 
296 }
297 
298 static LISP expand_sets(LISP l, LISP fp, LISP sets)
299 {
300  // Expand sets in l and fix regex characters
301  LISP r,es=NIL;
302 
303  for (r=l; r != NIL; r=cdr(r))
304  {
305  LISP s = expand_set(car(r),fp,sets);
306  if (cdr(r) && (streq("*",get_c_string(car(cdr(r))))))
307  {
308  es = cons(cons(rintern("*"),s),es);
309  r=cdr(r);
310  }
311  else if (cdr(r) && (streq("+",get_c_string(car(cdr(r))))))
312  {
313  es = cons(cons(rintern("+"),s),es);
314  r=cdr(r);
315  }
316  else
317  es = cons(cons(rintern("and"),s),es);
318  }
319  return reverse(es);
320 }
321 
322 static LISP expand_set(LISP p, LISP fp, LISP sets)
323 {
324  // expand p with respect to sets and feasible pairs
325  LISP set = siod_assoc_str(get_c_string(p),sets);
326  LISP s,f;
327  LISP r=NIL;
328 
329  if (set == NIL)
330  set = cons(p,NIL);
331 
332  for (s=set; s != NIL; s=cdr(s))
333  {
334  for (f=fp; f != NIL; f=cdr(f))
335  {
336  EST_String ss = get_c_string(car(s));
337  EST_String sf = get_c_string(car(f));
338 
339  if (sf.contains(ss+"/"))
340  r = cons(car(f),r);
341  }
342  }
343 
344  return reverse(r);
345 }
346 
347 static LISP find_notMAP(LISP MAP,LISP fp)
348 {
349  // Returns REGEX that matches everything except MAP, this doesn't
350  // try all possible epsilons though
351  LISP r,notrp=NIL,m,np;
352  EST_String s,l,p,sr,lr,rr;
353 
354  for (m=MAP; m != NIL; m=cdr(m))
355  {
356  p = get_c_string(car(m));
357  if (p.contains("/"))
358  {
359  s = p.before("/");
360  l = p.after("/");
361  }
362  else
363  {
364  s = p;
365  l = p;
366  }
367 
368  for (np=NIL,r=fp; r != NIL; r = cdr(r))
369  {
370  rr = get_c_string(car(r));
371  if (rr.contains("/"))
372  {
373  sr = rr.before("/");
374  lr = rr.after("/");
375  }
376  else
377  {
378  sr = rr;
379  lr = rr;
380  }
381  if ((s == sr) && (l != lr))
382  np = cons(car(r),np);
383  }
384  notrp = cons(cons(rintern("or"),np),notrp);
385  }
386 
387  return reverse(notrp);
388 }
389