Edinburgh Speech Tools  2.4-release
 All Classes Functions Variables Typedefs Enumerations Enumerator Friends Pages
solexml.cc
1  /************************************************************************/
2  /* */
3  /* Centre for Speech Technology Research */
4  /* University of Edinburgh, UK */
5  /* Copyright (c) 1996,1997 */
6  /* All Rights Reserved. */
7  /* */
8  /* Permission is hereby granted, free of charge, to use and distribute */
9  /* this software and its documentation without restriction, including */
10  /* without limitation the rights to use, copy, modify, merge, publish, */
11  /* distribute, sublicense, and/or sell copies of this work, and to */
12  /* permit persons to whom this work is furnished to do so, subject to */
13  /* the following conditions: */
14  /* 1. The code must retain the above copyright notice, this list of */
15  /* conditions and the following disclaimer. */
16  /* 2. Any modifications must be clearly marked as such. */
17  /* 3. Original authors' names are not deleted. */
18  /* 4. The authors' names are not used to endorse or promote products */
19  /* derived from this software without specific prior written */
20  /* permission. */
21  /* */
22  /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23  /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24  /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25  /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26  /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27  /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28  /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29  /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30  /* THIS SOFTWARE. */
31  /* */
32  /*************************************************************************/
33  /* */
34  /* Author: Richard Caley (rjc@cstr.ed.ac.uk) */
35  /* -------------------------------------------------------------------- */
36  /* Code to read SOLE format XML as utterances. */
37  /* */
38  /*************************************************************************/
39 
40 #include <cstdlib>
41 #include <cstdio>
42 #include "EST_THash.h"
43 #include "EST_error.h"
44 #include "solexml.h"
45 #include "rxp/XML_Parser.h"
46 
// Patterns that extract_ids() applies to the value of an "href" attribute.
// NOTE(review): extract_ids() reads the digit runs through starts[1]/ends[1]
// (plus starts[2]/ends[2] for the range form), so the escaped parentheses
// look like the capturing groups and the bare ones like literal characters;
// confirm against the EST_Regex syntax.
// Reference to a single word, presumably of the shape "...#id(w12)".
47 static EST_Regex simpleIDRegex(".*#id(w\\([0-9]+\\))");
// Reference to a run of words; extract_ids() expands it to every number
// from the first captured value to the second, inclusive.
48 static EST_Regex rangeIDRegex(".*#id(w\\([0-9]+\\)).*id(w\\([0-9]+\\))");
49 
50 class Parse_State
51  {
52 public:
53  int depth;
54  EST_String relName;
55  EST_Utterance *utt;
56  EST_Relation *rel;
57  EST_Item *parent;
58  EST_Item *current;
59 
61 
62  Parse_State() : contents(100) {}
63  };
64 
65 class Sole_Parser_Class : public XML_Parser_Class
66 {
67 protected:
68  virtual void document_open(XML_Parser_Class &c,
69  XML_Parser &p,
70  void *data);
71  virtual void document_close(XML_Parser_Class &c,
72  XML_Parser &p,
73  void *data);
74 
75  virtual void element_open(XML_Parser_Class &c,
76  XML_Parser &p,
77  void *data,
78  const char *name,
79  XML_Attribute_List &attributes);
80  virtual void element(XML_Parser_Class &c,
81  XML_Parser &p,
82  void *data,
83  const char *name,
84  XML_Attribute_List &attributes);
85  virtual void element_close(XML_Parser_Class &c,
86  XML_Parser &p,
87  void *data,
88  const char *name);
89 
90  virtual void pcdata(XML_Parser_Class &c,
91  XML_Parser &p,
92  void *data,
93  const char *chars);
94  virtual void cdata(XML_Parser_Class &c,
95  XML_Parser &p,
96  void *data,
97  const char *chars);
98 
99  virtual void processing(XML_Parser_Class &c,
100  XML_Parser &p,
101  void *data,
102  const char *instruction);
103  virtual void error(XML_Parser_Class &c,
104  XML_Parser &p,
105  void *data);
106 };
107 
108 static void print_attributes(XML_Attribute_List &attributes)
109 {
111 
112  for(them.begin(attributes); them ; them++)
113  printf(" %s='%s'",
114  (const char *)them->k,
115  (const char *)them->v);
116 }
117 
118 EST_read_status solexml_read(FILE *file,
119  const EST_String &name,
120  EST_Utterance &u,
121  int &max_id)
122 {
123  (void)max_id;
124  (void)print_attributes; // just to shut -Wall up.
125  Sole_Parser_Class pclass;
126  Parse_State state;
127 
128  u.clear();
129 
130  state.utt=&u;
131 
132  XML_Parser *parser = pclass.make_parser(file, name, &state);
133  parser->track_context(TRUE);
134 
135  CATCH_ERRORS()
136  return read_format_error;
137 
138  parser->go();
139 
140  END_CATCH_ERRORS();
141 
142  return read_ok;
143 }
144 
145 static void ensure_relation(Parse_State *state)
146 {
147  if (state->rel==NULL)
148  {
149  state->rel = state->utt->create_relation(state->relName);
150  }
151 }
152 
153 static EST_Item_Content *get_contents(Parse_State *state, EST_String id)
154 {
155  EST_Item_Content *c = state->contents.val(id);
156  if (c==NULL)
157  {
158  c = new EST_Item_Content();
159  state->contents.add_item(id, c);
160  }
161 
162  return c;
163 }
164 
165 static void extract_ids(XML_Attribute_List &attributes,
167 {
168  EST_String val;
169  static int count;
170  if (attributes.present("id"))
171  {
172  val = attributes.val("id");
173  ids.append(val);
174  }
175  else if (attributes.present("href"))
176  {
177  val = attributes.val("href");
178  int starts[EST_Regex_max_subexpressions];
179  int ends[EST_Regex_max_subexpressions];
180 
181  if (val.matches(simpleIDRegex, 0, starts, ends))
182  {
183  EST_String n = val.at(starts[1], ends[1]-starts[1]);
184 
185  ids.append("w" + n);
186  }
187  else if (val.matches(rangeIDRegex, 0, starts, ends))
188  {
189  int n1 = atoi(val.at(starts[1], ends[1]-starts[1]));
190  int n2 = atoi(val.at(starts[2], ends[2]-starts[2]));
191 
192  for(int i=n1; i<=n2; i++)
193  {
194  char buf[100];
195  sprintf(buf, "w%d", i);
196 
197  ids.append(buf);
198  }
199 
200  }
201  else
202  EST_warning("element with bad ID or HREF '%s'", (const char *)val);
203  }
204  else
205  {
206  char buf[100];
207  sprintf(buf, "n%d", ++count);
208 
209  ids.append(buf);
210  return;
211  }
212 
213 }
214 
215 
216 /** Now we define the callbacks.
217  */
218 
219 void Sole_Parser_Class::document_open(XML_Parser_Class &c,
220  XML_Parser &p,
221  void *data)
222 {
223  (void)c; (void)p;
224  Parse_State *state = (Parse_State *)data;
225 
226  state->depth=1;
227  state->rel=NULL;
228  state->parent=NULL;
229  state->current=NULL;
230 }
231 
232 void Sole_Parser_Class::document_close(XML_Parser_Class &c,
233  XML_Parser &p,
234  void *data)
235 {
236  (void)c; (void)p; (void)data;
237 }
238 
239 
240 void Sole_Parser_Class::element_open(XML_Parser_Class &c,
241  XML_Parser &p,
242  void *data,
243  const char *name,
244  XML_Attribute_List &attributes)
245 {
246  (void)c; (void)p; (void)attributes;
247  Parse_State *state = (Parse_State *)data;
248 
249  state->depth++;
250 
251  if (strcmp(name, "solexml")==0)
252  {
253  state->relName=attributes.val("relation");
254  printf("start solexml relation=%s\n", (const char *)state->relName);
255  return;
256  }
257  else if (strcmp(name, "text-elem")==0)
258  {
259  // ignore these
260  return;
261  }
262 
263  ensure_relation(state);
264 
265  if (strcmp(name, "anaphora-elem")==0
266  || strcmp(name, "wordlist")==0
267  || strcmp(name, "w")==0)
268  {
270  extract_ids(attributes, ids);
271 
272  EST_Litem *idp = ids.head();
273  bool first=TRUE;
274  for(; idp!= NULL; idp = idp->next())
275  {
276  EST_String id = ids(idp);
277  if (id==EST_String::Empty)
278  XML_Parser_Class::error(c, p, data, EST_String("Element With No Id"));
279 
280  if (first)
281  first=FALSE;
282  else
283  {
284  state->current = state->parent;
285  state->parent=state->parent->up();
286  }
287 
288 
289  EST_Item_Content *cont = get_contents(state, id);
290 
291  cont->set_name(id);
292 
294  for(them.begin(attributes); them ; them++)
295  {
296  EST_String k = them->k;
297  EST_String v = them->v;
298  cont->f.set(k,v);
299  }
300 
301  EST_Item *item;
302 
303  if (state->current == NULL)
304  if (state->parent == NULL)
305  item = state->rel->append();
306  else
307  item = state->parent->insert_below();
308  else
309  item = state->current->insert_after();
310 
311  item->set_contents(cont);
312 
313  state->current=NULL;
314  state->parent=item;
315  }
316  }
317  else
318  EST_warning("SOLE XML Parser: unknown element %s", name);
319 }
320 
321 
322 void Sole_Parser_Class::element(XML_Parser_Class &c,
323  XML_Parser &p,
324  void *data,
325  const char *name,
326  XML_Attribute_List &attributes)
327 {
328  (void)c; (void)p; (void)attributes;
329  Parse_State *state = (Parse_State *)data;
330 
331  if (strcmp(name, "language")==0)
332  {
333  state->utt->f.set("language", attributes.val("name"));
334  return;
335  }
336 
337  element_open(c, p, data, name, attributes);
338  element_close(c, p, data, name);
339 }
340 
341 
342 void Sole_Parser_Class::element_close(XML_Parser_Class &c,
343  XML_Parser &p,
344  void *data,
345  const char *name)
346 {
347  (void)c; (void)p; (void)name;
348  Parse_State *state = (Parse_State *)data;
349 
350  if (strcmp(name, "anaphora-elem")==0
351  || strcmp(name, "wordlist")==0
352  || strcmp(name, "w")==0)
353  {
354  state->depth--;
355  state->current = state->parent;
356  state->parent=state->parent->up();;
357  }
358 }
359 
360 
361 void Sole_Parser_Class::pcdata(XML_Parser_Class &c,
362  XML_Parser &p,
363  void *data,
364  const char *chars)
365 {
366  (void)c;
367 
368  Parse_State *state = (Parse_State *)data;
369 
370  if (state->parent != NULL && p.context(0) == "w")
371  state->parent->set(EST_String("word"), chars);
372 
373  // printf("SOLE XML Parser [pcdata[%s]] %d\n", chars, state->depth);
374 }
375 
376 
377 void Sole_Parser_Class::cdata(XML_Parser_Class &c,
378  XML_Parser &p,
379  void *data,
380  const char *chars)
381 {
382  (void)c; (void)p; (void)data; (void)chars;
383  // Parse_State *state = (Parse_State *)data;
384 
385  // printf("SOLE XML Parser [cdata[%s]] %d\n", chars, state->depth);
386 }
387 
388 
389 void Sole_Parser_Class::processing(XML_Parser_Class &c,
390  XML_Parser &p,
391  void *data,
392  const char *instruction)
393 {
394  (void)c; (void)p;
395  Parse_State *state = (Parse_State *)data;
396 
397  printf("SOLE XML Parser [proc[%s]] %d\n", instruction, state->depth);
398 }
399 
400 
402  XML_Parser &p,
403  void *data)
404 {
405  (void)c; (void)p; (void)data;
406  // Parse_State *state = (Parse_State *)data;
407 
408  EST_error("SOLE XML Parser %s", get_error(p));
409 
410  est_error_throw();
411 }