Edinburgh Speech Tools  2.4-release
 All Classes Functions Variables Typedefs Enumerations Enumerator Friends Pages
apml.cc
1  /************************************************************************/
2  /* */
3  /* Centre for Speech Technology Research */
4  /* University of Edinburgh, UK */
5  /* Copyright (c) 2002 */
6  /* All Rights Reserved. */
7  /* */
8  /* Permission is hereby granted, free of charge, to use and distribute */
9  /* this software and its documentation without restriction, including */
10  /* without limitation the rights to use, copy, modify, merge, publish, */
11  /* distribute, sublicense, and/or sell copies of this work, and to */
12  /* permit persons to whom this work is furnished to do so, subject to */
13  /* the following conditions: */
14  /* 1. The code must retain the above copyright notice, this list of */
15  /* conditions and the following disclaimer. */
16  /* 2. Any modifications must be clearly marked as such. */
17  /* 3. Original authors' names are not deleted. */
18  /* 4. The authors' names are not used to endorse or promote products */
19  /* derived from this software without specific prior written */
20  /* permission. */
21  /* */
22  /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23  /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24  /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25  /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26  /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27  /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28  /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29  /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30  /* THIS SOFTWARE. */
31  /* */
32  /*************************************************************************/
33  /* */
34  /* Author: Rob Clark (robert@cstr.ed.ac.uk) */
35  /* -------------------------------------------------------------------- */
36  /* Code to read APML format XML as utterances. */
37  /* */
38  /*************************************************************************/
39 
40 #include <cstdlib>
41 #include <cstdio>
42 #include "EST_THash.h"
43 #include "EST_error.h"
44 #include "apml.h"
45 #include "rxp/XML_Parser.h"
46 
47 static EST_Regex simpleIDRegex(".*#id(w\\([0-9]+\\))");
48 static EST_Regex rangeIDRegex(".*#id(w\\([0-9]+\\)).*id(w\\([0-9]+\\))");
49 static EST_Regex RXpunc("[\\.,\\?\\!\"]+");
50 
51 class Parse_State
52  {
53 public:
54  int depth;
55  int maxid;
56  EST_Utterance *utt;
57  EST_Relation *tokens;
58  EST_Relation *perf;
59  EST_Relation *com;
60  EST_Relation *semstruct;
61  EST_Relation *emphasis;
62  EST_Relation *boundary;
63  EST_Relation *pause;
64  EST_Item *parent;
65  EST_Item *pending;
66  EST_Item *last_token;
67  };
68 
69 class Apml_Parser_Class : public XML_Parser_Class
70 {
71 protected:
72  virtual void document_open(XML_Parser_Class &c,
73  XML_Parser &p,
74  void *data);
75  virtual void document_close(XML_Parser_Class &c,
76  XML_Parser &p,
77  void *data);
78 
79  virtual void element_open(XML_Parser_Class &c,
80  XML_Parser &p,
81  void *data,
82  const char *name,
83  XML_Attribute_List &attributes);
84  virtual void element(XML_Parser_Class &c,
85  XML_Parser &p,
86  void *data,
87  const char *name,
88  XML_Attribute_List &attributes);
89  virtual void element_close(XML_Parser_Class &c,
90  XML_Parser &p,
91  void *data,
92  const char *name);
93 
94  virtual void pcdata(XML_Parser_Class &c,
95  XML_Parser &p,
96  void *data,
97  const char *chars);
98  virtual void cdata(XML_Parser_Class &c,
99  XML_Parser &p,
100  void *data,
101  const char *chars);
102 
103  virtual void processing(XML_Parser_Class &c,
104  XML_Parser &p,
105  void *data,
106  const char *instruction);
107  virtual void error(XML_Parser_Class &c,
108  XML_Parser &p,
109  void *data);
110 };
111 
112 static void print_attributes(XML_Attribute_List &attributes)
113 {
115 
116  for(them.begin(attributes); them ; them++)
117  printf(" %s='%s'",
118  (const char *)them->k,
119  (const char *)them->v);
120 }
121 
122 EST_read_status apml_read(FILE *file,
123  const EST_String &name,
124  EST_Utterance &u,
125  int &max_id)
126 {
127  (void)max_id;
128  (void)print_attributes; // just to shut -Wall up.
129  Apml_Parser_Class pclass;
130  Parse_State state;
131 
132  u.clear();
133 
134  state.utt=&u;
135 
136  XML_Parser *parser = pclass.make_parser(file, name, &state);
137  parser->track_context(TRUE);
138 
139  CATCH_ERRORS()
140  return read_format_error;
141 
142  parser->go();
143 
144  END_CATCH_ERRORS();
145 
146  return read_ok;
147 }
148 
149 
150 
151 /** Now we define the callbacks.
152  */
153 
154 void Apml_Parser_Class::document_open(XML_Parser_Class &c,
155  XML_Parser &p,
156  void *data)
157 {
158  (void)c; (void)p;
159  Parse_State *state = (Parse_State *)data;
160 
161  state->maxid=0;
162 
163  state->depth=1;
164  state->parent=NULL;
165  state->pending=NULL;
166  state->last_token=NULL;
167 
168  // create relations:
169  state->perf = state->utt->create_relation("Perfomative");
170  state->com = state->utt->create_relation("Communicative");
171  state->tokens = state->utt->create_relation("Token");
172  state->semstruct = state->utt->create_relation("SemStructure");
173  state->emphasis = state->utt->create_relation("Emphasis");
174  state->boundary = state->utt->create_relation("Boundary");
175  state->pause = state->utt->create_relation("Pause");
176 
177 
178 }
179 
180 void Apml_Parser_Class::document_close(XML_Parser_Class &c,
181  XML_Parser &p,
182  void *data)
183 {
184  (void)c; (void)p; (void)data;
185 }
186 
187 
188 void Apml_Parser_Class::element_open(XML_Parser_Class &c,
189  XML_Parser &p,
190  void *data,
191  const char *name,
192  XML_Attribute_List &attributes)
193 {
194  (void)c; (void)p; (void)attributes;
195  Parse_State *state = (Parse_State *)data;
196 
197  //cout << " In element_open: " << name << "\n";
198 
199  if (strcmp(name, "turnallocation")==0)
200  {
201  // currently ignore
202  return;
203  }
204 
205  if (strcmp(name, "apml")==0)
206  return; // ignore
207 
208  state->depth++;
209 
210  if( strcmp(name, "performative")==0
211  || strcmp(name, "rheme")==0
212  || strcmp(name, "theme")==0
213  || strcmp(name, "emphasis")==0
214  || strcmp(name, "boundary")==0
215  || strcmp(name, "pause")==0)
216  {
217 
218  // create new item content
219  EST_Item_Content *cont = new EST_Item_Content();
220  cont->set_name(name);
221 
223  for(them.begin(attributes); them ; them++)
224  {
225  EST_String k = them->k;
226  EST_String v = them->v;
227  cont->f.set(k,v);
228  }
229 
230  EST_Item *item;
231 
232  if( strcmp(name, "emphasis")==0 )
233  {
234  item = state->emphasis->append();
235  state->pending = item;
236  }
237  else if(strcmp(name, "boundary")==0 )
238  {
239  item = state->boundary->append();
240  if(state->last_token)
241  item->append_daughter(state->last_token);
242  }
243  else if(strcmp(name, "pause")==0 )
244  {
245  item = state->pause->append();
246  if(state->last_token)
247  item->append_daughter(state->last_token);
248  }
249  else
250  {
251  if (state->parent == NULL)
252  item = state->semstruct->append();
253  else
254  item = state->parent->append_daughter();
255  state->parent=item;
256  }
257 
258  item->set_contents(cont);
259 
260 
261  }
262  else
263  EST_warning("APML Parser: unknown element %s", name);
264 }
265 
266 
267 void Apml_Parser_Class::element(XML_Parser_Class &c,
268  XML_Parser &p,
269  void *data,
270  const char *name,
271  XML_Attribute_List &attributes)
272 {
273  (void)c; (void)p; (void)attributes;
274 
275  element_open(c, p, data, name, attributes);
276  element_close(c, p, data, name);
277 }
278 
279 
280 void Apml_Parser_Class::element_close(XML_Parser_Class &c,
281  XML_Parser &p,
282  void *data,
283  const char *name)
284 {
285  (void)c; (void)p; (void)name;
286  Parse_State *state = (Parse_State *)data;
287 
288  if ( strcmp(name, "emphasis")==0
289  || strcmp(name, "boundary")==0
290  || strcmp(name, "pause")==0 )
291  {
292  state->depth--;
293  state->pending=NULL;
294  }
295 
296 
297  if (strcmp(name, "performative")==0
298  || strcmp(name, "theme")==0
299  || strcmp(name, "rheme")==0)
300  {
301  state->depth--;
302  state->pending = NULL;
303  state->parent=state->parent->up();
304  }
305 }
306 
307 
308 void Apml_Parser_Class::pcdata(XML_Parser_Class &c,
309  XML_Parser &p,
310  void *data,
311  const char *chars)
312 {
313  (void)c;
314 
315  Parse_State *state = (Parse_State *)data;
316  EST_String strings[255];
317 
318  split(chars,strings,255,RXwhite);
319 
320  // for(int cc=0 ; cc < 20 ; ++cc)
321  // cout << cc << ": \"" << strings[cc] << "\" (" << strings[cc].length() << ")\n";
322 
323  int s=0;
324 
325  while( s < 1 || strings[s].length() > 0 )
326  {
327  if(strings[s].length() > 0 )
328  {
329  // Just Punctuation
330  if(strings[s].matches(RXpunc))
331  {
332  state->last_token->set("punc",strings[s]);
333  }
334  // Text and possibly punc
335  else
336  {
337  EST_Item_Content *cont = new EST_Item_Content();
338  EST_Item *item;
339 
340  if (state->parent == NULL)
341  item = state->semstruct->append();
342  else
343  item = state->parent->append_daughter();
344  item->set_contents(cont);
345 
346  // strip pre-punc here.
347  int i = strings[s].index(RXpunc);
348  EST_String ps = strings[s].at(RXpunc);
349  EST_String intermediate;
350  if( ps.length() > 0 && i == 0)
351  {
352  cout << "Got pre punc: " << ps << endl;
353  intermediate = strings[s].after(RXpunc);
354  // cont->set_name(strings[s].before(RXpunc));
355  item->set("prepunctuation",ps);
356  }
357  else
358  {
359  intermediate = strings[s];
360  item->set("prepunctuation","");
361  }
362  // now strip punc
363  ps = intermediate.at(RXpunc);
364  if( ps.length() > 0 )
365  {
366  cout << "Got punc: " << ps << endl;
367  cont->set_name(intermediate.before(RXpunc));
368  item->set("punc",ps);
369  }
370  else
371  {
372  cont->set_name(intermediate);
373  item->set("punc","");
374  }
375 
376  state->tokens->append(item);
377  state->last_token = item;
378 
379  if(state->pending)
380  {
381  state->pending->append_daughter(item);
382  }
383 
384  // if (state->parent != NULL && p.context(0) == "w")
385  // state->parent->set(EST_String("token"), chars);
386 
387  //cout << " got token: " << item->name() << "\n";
388  }
389  }
390  ++s;
391  }
392 }
393 
394 
395 void Apml_Parser_Class::cdata(XML_Parser_Class &c,
396  XML_Parser &p,
397  void *data,
398  const char *chars)
399 {
400  (void)c; (void)p; (void)data; (void)chars;
401  // Parse_State *state = (Parse_State *)data;
402 
403  // printf("APML XML Parser [cdata[%s]] %d\n", chars, state->depth);
404 }
405 
406 
407 void Apml_Parser_Class::processing(XML_Parser_Class &c,
408  XML_Parser &p,
409  void *data,
410  const char *instruction)
411 {
412  (void)c; (void)p;
413  Parse_State *state = (Parse_State *)data;
414 
415  printf("APML XML Parser [proc[%s]] %d\n", instruction, state->depth);
416 }
417 
418 
420  XML_Parser &p,
421  void *data)
422 {
423  (void)c; (void)p; (void)data;
424  // Parse_State *state = (Parse_State *)data;
425 
426  EST_error("APML Parser %s", get_error(p));
427 
428  est_error_throw();
429 }
430 
431 
432 
433 
434 
435 
436