Edinburgh Speech Tools  2.4-release
 All Classes Functions Variables Typedefs Enumerations Enumerator Friends Pages
genxml.cc
1  /************************************************************************/
2  /* */
3  /* Centre for Speech Technology Research */
4  /* University of Edinburgh, UK */
5  /* Copyright (c) 1996,1997 */
6  /* All Rights Reserved. */
7  /* */
8  /* Permission is hereby granted, free of charge, to use and distribute */
9  /* this software and its documentation without restriction, including */
10  /* without limitation the rights to use, copy, modify, merge, publish, */
11  /* distribute, sublicense, and/or sell copies of this work, and to */
12  /* permit persons to whom this work is furnished to do so, subject to */
13  /* the following conditions: */
14  /* 1. The code must retain the above copyright notice, this list of */
15  /* conditions and the following disclaimer. */
16  /* 2. Any modifications must be clearly marked as such. */
17  /* 3. Original authors' names are not deleted. */
18  /* 4. The authors' names are not used to endorse or promote products */
19  /* derived from this software without specific prior written */
20  /* permission. */
21  /* */
22  /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23  /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24  /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25  /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26  /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27  /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28  /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29  /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30  /* THIS SOFTWARE. */
31  /* */
32  /*************************************************************************/
33  /* */
34  /* Author: Richard Caley (rjc@cstr.ed.ac.uk) */
35  /* -------------------------------------------------------------------- */
36  /* Code to read utterances marked up in XML according to a DTD with */
37  /* certain conventions indicating the mapping from XML to Utterance. */
38  /* */
39  /*************************************************************************/
40 
41 #include <cstdlib>
42 #include <cstdio>
43 #include <cctype>
44 #include "EST_TDeque.h"
45 #include "EST_THash.h"
46 #include "EST_error.h"
47 #include "genxml.h"
48 #include "rxp/XML_Parser.h"
49 
50 #include "ling_class_init.h"
51 
// Turn the unquoted ESTLIBDIRC path supplied by the build into the string
// literal ESTLIBDIR.  Two macro levels are required: the `#` operator does
// not macro-expand its operand, so a single level would yield the text
// "ESTLIBDIRC" instead of the path it stands for.
#if defined(ESTLIBDIRC)
# define EST_GENXML_STRINGIZE_RAW(X) #X
# define __STRINGIZE(X) EST_GENXML_STRINGIZE_RAW(X)
# define ESTLIBDIR __STRINGIZE(ESTLIBDIRC)
#endif
56 
57 
58 static EST_Regex simpleIDRegex("[^#]*#id(\\([-a-z0-9]+\\))");
59 static EST_Regex rangeIDRegex("[^#]*#id(\\([a-z]*\\)\\([0-9]*\\)\\(-\\([0-9]+\\)\\)*).*id(\\([a-z]*\\)\\([0-9]*\\)\\(-\\([0-9]+\\)\\)*)");
60 static EST_Regex featureDefRegex("\\([^:]*\\):\\(.*\\)");
61 
62 // Separator between feature names in attributes.
63 
64 static EST_String feat_sep(",");
65 
66 // I'd like to get rid of this. It is a maximum for the number of features
67 // which can be named in an attribute, say for copying to the utterance.
68 
69 #define MAX_FEATS (50)
70 
71 // Parse state.
72 
73 class GenXML_Parse_State
74  {
75 public:
76  int depth;
77  int open_depth;
78  int rel_start_depth;
79  EST_TDeque<int> depth_stack;
80  EST_String relName;
81  bool linear;
82  EST_Utterance *utt;
83  EST_Relation *rel;
84  EST_Item *parent;
85  EST_Item *current;
86  EST_String contentAttr;
87 
88  // used to force a given ID on a node.
89  EST_String id;
90 
92 
93 
94  GenXML_Parse_State() : contents(100) {}
95  };
96 
97 class GenXML_Parser_Class : public XML_Parser_Class
98 {
99 protected:
100  virtual void document_open(XML_Parser_Class &c,
101  XML_Parser &p,
102  void *data);
103  virtual void document_close(XML_Parser_Class &c,
104  XML_Parser &p,
105  void *data);
106 
107  virtual void element_open(XML_Parser_Class &c,
108  XML_Parser &p,
109  void *data,
110  const char *name,
111  XML_Attribute_List &attributes);
112  virtual void element(XML_Parser_Class &c,
113  XML_Parser &p,
114  void *data,
115  const char *name,
116  XML_Attribute_List &attributes);
117  virtual void element_close(XML_Parser_Class &c,
118  XML_Parser &p,
119  void *data,
120  const char *name);
121 
122  virtual void pcdata(XML_Parser_Class &c,
123  XML_Parser &p,
124  void *data,
125  const char *chars);
126  virtual void cdata(XML_Parser_Class &c,
127  XML_Parser &p,
128  void *data,
129  const char *chars);
130 
131  virtual void processing(XML_Parser_Class &c,
132  XML_Parser &p,
133  void *data,
134  const char *instruction);
135  virtual void error(XML_Parser_Class &c,
136  XML_Parser &p,
137  void *data);
138 };
139 
140 static void print_attributes(XML_Attribute_List &attributes);
141 
142 XML_Parser_Class *EST_GenXML::pclass;
143 
144 
145 void EST_GenXML::class_init(void)
146 {
147  ling_class_init::use();
148 
149  pclass = new GenXML_Parser_Class();
150 #ifdef DEBUGGING
151  printf("Register estlib in genxml %s\n", ESTLIBDIR "/\\1.dtd");
152 #endif
153 
154  pclass->register_id("//CSTR EST//DTD \\(.*\\)//[A-Z]*",
155  ESTLIBDIR "/\\1.dtd");
156  pclass->register_id("//CSTR EST//ENTITIES \\(.*\\)//[A-Z]*",
157  ESTLIBDIR "/\\1.ent");
158 }
159 
160 void EST_GenXML::register_id(const EST_String pattern,
161  const EST_String result)
162 {
163  EST_GenXML::pclass->register_id(pattern, result);
164 }
165 
166 void EST_GenXML::registered_ids(EST_StrList &list)
167 {
168  EST_GenXML::pclass->registered_ids(list);
169 }
170 
171 InputSource EST_GenXML::try_and_open(Entity ent)
172 {
173  return EST_GenXML::pclass->try_and_open(ent);
174 }
175 
176 
177 EST_read_status EST_GenXML::read_xml(FILE *file,
178  const EST_String &name,
179  EST_Utterance &u,
180  int &max_id)
181 {
182  (void)max_id;
183  (void)print_attributes; // just to shut -Wall up.
184  GenXML_Parse_State state;
185 
186  u.clear();
187 
188  state.utt=&u;
189 
190  XML_Parser *parser = EST_GenXML::pclass->make_parser(file, name, &state);
191  parser->track_context(TRUE);
192 
193  CATCH_ERRORS()
194  return read_format_error;
195 
196  parser->go();
197 
198  END_CATCH_ERRORS();
199 
200  return read_ok;
201 }
202 
203 static void ensure_relation(GenXML_Parse_State *state, EST_String name)
204 {
205  if (state->rel!=NULL && name == state->relName)
206  return;
207 
208  state->rel = state->utt->create_relation(state->relName=name);
209 }
210 
211 static EST_Item_Content *get_contents(GenXML_Parse_State *state, EST_String id)
212 {
213  EST_Item_Content *c = state->contents.val(id);
214 
215  if (c==NULL)
216  {
217  c = new EST_Item_Content();
218  state->contents.add_item(id, c);
219  c->f.set("id", id);
220  }
221  else
222  {
223  if (c->relations.present(state->relName))
224  return NULL;
225  }
226 
227  return c;
228 }
229 
230 static EST_String make_new_id(const char *root)
231 {
232  char buf[100];
233  static int count=0;
234 
235  sprintf(buf, "%s%d", root, ++count);
236  return buf;
237 }
238 
239 
240 static void extract_ids(XML_Attribute_List &attributes,
242 {
243  EST_String val;
244  if (attributes.present("id"))
245  {
246  val = attributes.val("id");
247 #if defined(EST_DEBUGGING)
248  fprintf(stderr, "ID %s\n", (const char *)val);
249 #endif
250  ids.append(val);
251  }
252  else if (attributes.present("href"))
253  {
254  val = attributes.val("href");
255  int starts[EST_Regex_max_subexpressions];
256  int ends[EST_Regex_max_subexpressions];
257 
258  if (val.matches(simpleIDRegex, 0, starts, ends))
259  {
260  EST_String n = val.at(starts[1], ends[1]-starts[1]);
261 #if defined(EST_DEBUGGING)
262  fprintf(stderr, "SIMPLE %s\n", (const char *)n);
263 #endif
264  ids.append(n);
265  }
266  else if (val.matches(rangeIDRegex, 0, starts, ends))
267  {
268  EST_String prefix1 = val.at(starts[1], ends[1]-starts[1]);
269  int n1 = atoi(val.at(starts[2], ends[2]-starts[2]));
270  EST_String postfix1 = val.at(starts[4], ends[4]-starts[4]);
271  EST_String prefix2 = val.at(starts[5], ends[5]-starts[5]);
272  int n2 = atoi(val.at(starts[6], ends[6]-starts[6]));
273  EST_String postfix2 = val.at(starts[8], ends[8]-starts[8]);
274 
275 #if defined(EST_DEBUGGING)
276  fprintf(stderr, "RANGE '%s' %d - '%s' // '%s' %d - '%s'\n",
277  (const char *)prefix1,
278  n1,
279  (const char *)postfix1,
280  (const char *)prefix2,
281  n2,
282  (const char *)postfix2
283  );
284 #endif
285 
286  if (prefix1==prefix2)
287  prefix2="";
288 
289  char buf[100];
290  if (n1==n2)
291  {
292  int c;
293  if (postfix1.length()==0)
294  {
295  sprintf(buf, "%s%s%d",
296  (const char *)prefix1,
297  (const char *)prefix2,
298  n1
299  );
300  ids.append(buf);
301  c=1;
302  }
303  else
304  c=atoi(postfix1);
305 
306  if (postfix2.length()>0)
307  for (; c<=atoi(postfix2); c++)
308  {
309  sprintf(buf, "%s%s%d-%d",
310  (const char *)prefix1,
311  (const char *)prefix2,
312  n1,
313  c
314  );
315  ids.append(buf);
316  }
317  }
318  else
319  {
320  for(int i=n1; i<=n2; i++)
321  {
322  if (i==n2
323  && postfix2.length()>0)
324  {
325  sprintf(buf, "%s%s%d",
326  (const char *)prefix1,
327  (const char *)prefix2,
328  i
329  );
330  ids.append(buf);
331  for (int c=1; c<=atoi(postfix2); c++)
332  {
333  sprintf(buf, "%s%s%d-%d",
334  (const char *)prefix1,
335  (const char *)prefix2,
336  i,
337  c
338  );
339  ids.append(buf);
340  }
341  }
342  else
343  {
344  if ( postfix1.length()>0)
345  sprintf(buf, "%s%s%d-%s",
346  (const char *)prefix1,
347  (const char *)prefix2,
348  i,
349  (const char *)postfix1
350  );
351  else
352  sprintf(buf, "%s%s%d",
353  (const char *)prefix1,
354  (const char *)prefix2,
355  i
356  );
357 
358  ids.append(buf);
359  }
360  postfix1="";
361  }
362 
363  }
364  }
365  else
366  EST_warning("element with bad ID or HREF '%s'", (const char *)val);
367  }
368  else
369  ids.append(make_new_id("n"));
370 
371  // cout << ids << "\n";
372 }
373 
374 /* For debugging.
375  */
376 static void print_attributes(XML_Attribute_List &attributes)
377 {
379 
380  for(them.begin(attributes); them ; them++)
381  printf(" %s='%s'",
382  (const char *)them->k,
383  (const char *)them->v);
384 }
385 
386 /** Now we define the callbacks.
387  */
388 
389 void GenXML_Parser_Class::document_open(XML_Parser_Class &c,
390  XML_Parser &p,
391  void *data)
392 {
393  (void)c; (void)p;
394  GenXML_Parse_State *state = (GenXML_Parse_State *)data;
395 
396  state->depth=1;
397  state->open_depth=-1;
398  state->rel_start_depth=-1;
399  state->depth_stack.clear();
400  state->rel=NULL;
401  state->parent=NULL;
402  state->current=NULL;
403  state->id="";
404 }
405 
406 void GenXML_Parser_Class::document_close(XML_Parser_Class &c,
407  XML_Parser &p,
408  void *data)
409 {
410  (void)c; (void)p; (void)data;
411 }
412 
413 static void proccess_features(EST_String name,
414  EST_String defs,
415  XML_Attribute_List &attributes,
416  EST_Features &f)
417 {
418  EST_String names[MAX_FEATS];
419  int starts[EST_Regex_max_subexpressions];
420  int ends[EST_Regex_max_subexpressions];
421 
422  int n = split(defs, names, MAX_FEATS, feat_sep);
423  for(int i=0; i<n; i++)
424  {
425  EST_String def = names[i];
426  EST_String feat;
427  EST_String attr;
428 
429  if (def.matches(featureDefRegex, 0, starts, ends))
430  {
431  feat = def.at(starts[1], ends[1]-starts[1]);
432  attr = def.at(starts[2], ends[2]-starts[2]);
433  }
434  else
435  {
436  attr=def;
437  feat=EST_String::cat(name, "_", attr);
438  }
439 
440  EST_String fval = attributes.val(attr);
441 
442 #ifdef DEBUGGING
443  printf("on %s got %s(%s)=%s\n", name,
444  (const char *)feat,
445  (const char *)attr,
446  (const char *)fval);
447 #endif
448  if (fval != EST_String::Empty)
449  f.set(feat, fval);
450  }
451 }
452 
453 void GenXML_Parser_Class::element_open(XML_Parser_Class &c,
454  XML_Parser &p,
455  void *data,
456  const char *name,
457  XML_Attribute_List &attributes)
458 {
459  (void)c; (void)p; (void)attributes; (void)name;
460  GenXML_Parse_State *state = (GenXML_Parse_State *)data;
461 
462  state->depth++;
463 
464  EST_String val, ig;
465 
466  // Features to copy to utterance
467  if (state->utt != NULL
468  && (val=attributes.val("estUttFeats")) != EST_String::Empty)
469  proccess_features(name, val, attributes, state->utt->f);
470 
471  // Features to copy to relation
472  if (state->rel != NULL
473  && (val=attributes.val("estRelFeats")) != EST_String::Empty)
474  proccess_features(name, val, attributes, state->rel->f);
475 
476 
477  if ((val=attributes.val("estRelationElementAttr")) != EST_String::Empty)
478  {
479  // All nodes inside this element are in the given relation
480  EST_String relName = attributes.val(val);
481 
482  if (relName == EST_String::Empty)
483  {
484  relName = "UNNAMED";
485  EST_warning("%s\nNo feature '%s' to name relation\n", get_error(p), (const char *)val);
486  }
487 
488  EST_String relationType = attributes.val("estRelationTypeAttr");
489 
490  ensure_relation(state, relName);
491  state->rel_start_depth=state->depth;
492  state->linear=(attributes.val(relationType) == "linear"||
493  attributes.val(relationType) == "list");
494 #ifdef DEBUGGING
495  printf("start of relation depth=%d name=%s type=%s\n", state->depth, (const char *)relName, state->linear?"linear":"tree");
496 #endif
497  }
498  else if ((state->rel_start_depth >= 0 &&
499  (ig=attributes.val("estRelationIgnore")) == EST_String::Empty)
500  || (val=attributes.val("estRelationNode")) != EST_String::Empty)
501  {
502  // This node defines an Item in a relation.
503 #ifdef DEBUGGING
504  printf("push depth=%d name=%s ig=%s\n", state->depth, name, (const char *)ig);
505 #endif
506  if (val != EST_String::Empty)
507  ensure_relation(state, val);
508 
509  state->depth_stack.push(state->open_depth);
510  state->open_depth=state->depth;
511 
513 
514  if (state->id == EST_String::Empty)
515  {
516  extract_ids(attributes, ids);
517  }
518  else
519  ids.append(state->id);
520 
521  switch (ids.length())
522  {
523  case 0:
524  XML_Parser_Class::error(c, p, data, EST_String("Element With No Id"));
525  break;
526  case 1:
527  {
528  EST_String id = ids.first();
529 
530  if (id==EST_String::Empty)
531  XML_Parser_Class::error(c, p, data, EST_String("Element With No Id"));
532 
533  EST_Item_Content *cont = get_contents(state, id);
534 
535  if (!cont)
536  XML_Parser_Class::error(c, p, data, EST_String("Repeated Id ") + id);
537 
539  for(them.begin(attributes); them ; them++)
540  {
541  EST_String k = them->k;
542  EST_String v = them->v;
543  cont->f.set(k,v);
544  }
545 
546  cont->f.set("id", id);
547 
548  EST_Item *item;
549 
550  if (state->linear)
551  if (state->current == NULL)
552  item = state->rel->append();
553  else
554  item = state->current->insert_after();
555  else if (state->current == NULL)
556  if (state->parent == NULL)
557  item = state->rel->append();
558  else
559  item = state->parent->append_daughter();
560  else
561  if (state->parent == NULL)
562  item = state->current->insert_after();
563  else
564  item = state->parent->append_daughter();
565 
566  item->set_contents(cont);
567 
568  state->current=NULL;
569  state->parent=item;
570  }
571  break;
572 
573  default:
574  {
575  bool embed = (attributes.val("estExpansion") == "embed");
576  if (embed)
577  {
578  state->id=make_new_id("e");
579  element_open(c, p, data, name, attributes);
580  state->id="";
581  }
582  EST_Litem *idp = ids.head();
583  bool first=TRUE;
584  for(; idp!= NULL; idp = idp->next())
585  {
586  EST_String id = ids(idp);
587  if (id==EST_String::Empty)
588  XML_Parser_Class::error(c, p, data, EST_String("Element With No Id"));
589 
590  if (!first)
591  element_close(c, p, data, name);
592  else
593  first=FALSE;
594 
595  state->id=id;
596  element_open(c, p, data, name, attributes);
597  state->id=EST_String::Empty;
598  }
599  if (embed)
600  {
601  element_close(c, p, data, name);
602  }
603  }
604  }
605 
606 
607  if (state->parent!=NULL)
608  state->contentAttr = attributes.val("estContentFeature");
609 
610 #ifdef DEBUGGING
611  printf("\t current=%s parent=%s contA=%s\n",
612  (const char *)state->current->name(),
613  (const char *)state->parent->name(),
614  (const char *)state->contentAttr);
615 #endif
616 
617  }
618  else
619  ; // Skip
620 
621 }
622 
623 
624 void GenXML_Parser_Class::element(XML_Parser_Class &c,
625  XML_Parser &p,
626  void *data,
627  const char *name,
628  XML_Attribute_List &attributes)
629 {
630  (void)c; (void)p; (void)attributes;
631  GenXML_Parse_State *state = (GenXML_Parse_State *)data;
632  (void)state;
633 
634  element_open(c, p, data, name, attributes);
635  element_close(c, p, data, name);
636 }
637 
638 
639 void GenXML_Parser_Class::element_close(XML_Parser_Class &c,
640  XML_Parser &p,
641  void *data,
642  const char *name)
643 {
644  (void)c; (void)p; (void)name;
645  GenXML_Parse_State *state = (GenXML_Parse_State *)data;
646 
647  EST_String val;
648 
649 
650  if (state->depth == state->rel_start_depth )
651  {
652 #ifdef DEBUGGING
653  printf("end of relation depth=%d name=%s\n", state->depth, name);
654 #endif
655  state->rel_start_depth=-1;
656  }
657 
658  if (
659  state->depth == state->open_depth)
660  {
661 #ifdef DEBUGGING
662  printf("pop depth=%d name=%s\n", state->depth, name);
663 #endif
664  state->current = state->parent;
665  state->parent=parent(state->parent);
666  state->open_depth = state->depth_stack.pop();
667 #ifdef DEBUGGING
668  printf("\t current=%s parent=%s\n",
669  (const char *)state->current->name(),
670  (const char *)state->parent->name());
671 #endif
672  }
673 
674 
675  state->depth--;
676 }
677 
678 
679 void GenXML_Parser_Class::pcdata(XML_Parser_Class &c,
680  XML_Parser &p,
681  void *data,
682  const char *chars)
683 {
684  (void)c;
685  (void)p;
686  GenXML_Parse_State *state = (GenXML_Parse_State *)data;
687 
688 
689  if ( state->parent != NULL && state->contentAttr != EST_String::Empty)
690  state->parent->set(state->contentAttr, chars);
691 
692 #ifdef DEBUGGING
693  printf("GEN XML Parser [pcdata[%s]] %d\n", chars, state->depth);
694 #endif
695 }
696 
697 
698 void GenXML_Parser_Class::cdata(XML_Parser_Class &c,
699  XML_Parser &p,
700  void *data,
701  const char *chars)
702 {
703  (void)c; (void)p; (void)data; (void)chars;
704  // GenXML_Parse_State *state = (GenXML_Parse_State *)data;
705 
706 #ifdef DEBUGGING
707  printf("GEN XML Parser [cdata[%s]] %d\n", chars, state->depth);
708 #endif
709 }
710 
711 
712 void GenXML_Parser_Class::processing(XML_Parser_Class &c,
713  XML_Parser &p,
714  void *data,
715  const char *instruction)
716 {
717  (void)c; (void)p; (void)instruction;
718  GenXML_Parse_State *state = (GenXML_Parse_State *)data;
719  (void)state;
720 
721 #ifdef DEBUGGING
722  printf("GEN XML Parser [proc[%s]] %d\n", instruction, state->depth);
723 #endif
724 }
725 
726 
728  XML_Parser &p,
729  void *data)
730 {
731  (void)c; (void)p; (void)data;
732  // GenXML_Parse_State *state = (GenXML_Parse_State *)data;
733 
734  EST_error("GEN XML Parser %s", get_error(p));
735 
736  est_error_throw();
737 }
738 
741 
742 #if defined(INSTANTIATE_TEMPLATES)
743 
744 #include "../base_class/EST_THash.cc"
745 
746 Instantiate_TStringHash_T(EST_Item_Content *, THash_String_ItemC_P)
747 
748 #endif