44 #include "EST_TDeque.h"
45 #include "EST_THash.h"
46 #include "EST_error.h"
48 #include "rxp/XML_Parser.h"
50 #include "ling_class_init.h"
52 #if defined(ESTLIBDIRC)
/* Turn the build-time library directory (ESTLIBDIRC, normally supplied as an
 * unquoted path via -DESTLIBDIRC=...) into the string literal ESTLIBDIR.
 *
 * The operand of '#' is never macro-expanded, so stringizing in a single
 * step would yield the literal text "ESTLIBDIRC" rather than the configured
 * path.  The extra level of indirection forces the argument to be expanded
 * first and only then stringized.
 *
 * NOTE(review): because the path is now macro-expanded, any path component
 * that happens to be a defined macro (e.g. `linux` / `unix` in GNU dialects)
 * would be substituted as well -- confirm against the build flags in use.
 */
# define EST_GENXML_STRINGIZE_TOKENS(X) #X
# define __STRINGIZE(X) EST_GENXML_STRINGIZE_TOKENS(X)
# define ESTLIBDIR __STRINGIZE(ESTLIBDIRC)
// Pattern for an href that refers to a single element: a run of non-'#'
// characters, then "#id(", one captured identifier made of lowercase
// letters, digits and '-', then ")".  Tried first against the "href"
// attribute value when extracting IDs.
// NOTE(review): assumes EST_Regex treats \( \) as capture groups and bare
// ( ) as literal parentheses -- confirm against the EST_Regex syntax.
58 static EST_Regex simpleIDRegex(
"[^#]*#id(\\([-a-z0-9]+\\))");
// Pattern for an href that refers to a range of elements, written as two
// id(...) references.  Each id is captured in pieces: a lowercase-letter
// prefix (groups 1 and 5), a run of digits (groups 2 and 6) and an optional
// "-digits" suffix whose digits land in groups 4 and 8.  The range branch of
// the ID extraction reads exactly those six groups as prefix1/n1/postfix1
// and prefix2/n2/postfix2.
59 static EST_Regex rangeIDRegex(
"[^#]*#id(\\([a-z]*\\)\\([0-9]*\\)\\(-\\([0-9]+\\)\\)*).*id(\\([a-z]*\\)\\([0-9]*\\)\\(-\\([0-9]+\\)\\)*)");
// Pattern for one feature definition of the form "<left>:<right>": group 1
// is the colon-free text before the ':' and group 2 is whatever follows it.
// The feature-processing code reads group 1 as the feature name and group 2
// as the attribute name.
60 static EST_Regex featureDefRegex(
"\\([^:]*\\):\\(.*\\)");
// Limit handed to split() when a feature-definition string is broken into
// individual definitions (see the split(defs, names, MAX_FEATS, feat_sep)
// call).  Presumably the maximum number of pieces split() will produce --
// TODO confirm against split()'s contract.
69 #define MAX_FEATS (50)
73 class GenXML_Parse_State
94 GenXML_Parse_State() : contents(100) {}
134 const char *instruction);
145 void EST_GenXML::class_init(
void)
147 ling_class_init::use();
149 pclass =
new GenXML_Parser_Class();
151 printf(
"Register estlib in genxml %s\n", ESTLIBDIR
"/\\1.dtd");
154 pclass->
register_id(
"//CSTR EST//DTD \\(.*\\)//[A-Z]*",
155 ESTLIBDIR
"/\\1.dtd");
156 pclass->
register_id(
"//CSTR EST//ENTITIES \\(.*\\)//[A-Z]*",
157 ESTLIBDIR
"/\\1.ent");
160 void EST_GenXML::register_id(
const EST_String pattern,
171 InputSource EST_GenXML::try_and_open(Entity ent)
177 EST_read_status EST_GenXML::read_xml(FILE *file,
183 (void)print_attributes;
184 GenXML_Parse_State state;
194 return read_format_error;
203 static
void ensure_relation(GenXML_Parse_State *state,
EST_String name)
205 if (state->rel!=NULL && name == state->relName)
208 state->rel = state->utt->create_relation(state->relName=name);
218 state->contents.add_item(
id, c);
223 if (c->relations.
present(state->relName))
230 static EST_String make_new_id(
const char *root)
235 sprintf(buf,
"%s%d", root, ++count);
246 val = attributes.
val(
"id");
247 #if defined(EST_DEBUGGING)
248 fprintf(stderr,
"ID %s\n", (
const char *)val);
252 else if (attributes.
present(
"href"))
254 val = attributes.
val(
"href");
255 int starts[EST_Regex_max_subexpressions];
256 int ends[EST_Regex_max_subexpressions];
258 if (val.
matches(simpleIDRegex, 0, starts, ends))
261 #if defined(EST_DEBUGGING)
262 fprintf(stderr,
"SIMPLE %s\n", (
const char *)n);
266 else if (val.
matches(rangeIDRegex, 0, starts, ends))
268 EST_String prefix1 = val.
at(starts[1], ends[1]-starts[1]);
269 int n1 = atoi(val.
at(starts[2], ends[2]-starts[2]));
270 EST_String postfix1 = val.
at(starts[4], ends[4]-starts[4]);
271 EST_String prefix2 = val.
at(starts[5], ends[5]-starts[5]);
272 int n2 = atoi(val.
at(starts[6], ends[6]-starts[6]));
273 EST_String postfix2 = val.
at(starts[8], ends[8]-starts[8]);
275 #if defined(EST_DEBUGGING)
276 fprintf(stderr,
"RANGE '%s' %d - '%s' // '%s' %d - '%s'\n",
277 (
const char *)prefix1,
279 (
const char *)postfix1,
280 (
const char *)prefix2,
282 (
const char *)postfix2
286 if (prefix1==prefix2)
293 if (postfix1.length()==0)
295 sprintf(buf,
"%s%s%d",
296 (
const char *)prefix1,
297 (
const char *)prefix2,
306 if (postfix2.length()>0)
307 for (; c<=atoi(postfix2); c++)
309 sprintf(buf,
"%s%s%d-%d",
310 (
const char *)prefix1,
311 (
const char *)prefix2,
320 for(
int i=n1; i<=n2; i++)
323 && postfix2.length()>0)
325 sprintf(buf,
"%s%s%d",
326 (
const char *)prefix1,
327 (
const char *)prefix2,
331 for (
int c=1; c<=atoi(postfix2); c++)
333 sprintf(buf,
"%s%s%d-%d",
334 (
const char *)prefix1,
335 (
const char *)prefix2,
344 if ( postfix1.length()>0)
345 sprintf(buf,
"%s%s%d-%s",
346 (
const char *)prefix1,
347 (
const char *)prefix2,
349 (
const char *)postfix1
352 sprintf(buf, "%s%s%d",
353 (const
char *)prefix1,
354 (const
char *)prefix2,
366 EST_warning("element with bad ID or HREF '%s'", (const
char *)val);
369 ids.append(make_new_id("n"));
380 for(them.
begin(attributes); them ; them++)
382 (
const char *)them->k,
383 (
const char *)them->v);
394 GenXML_Parse_State *state = (GenXML_Parse_State *)data;
397 state->open_depth=-1;
398 state->rel_start_depth=-1;
399 state->depth_stack.clear();
410 (void)c; (void)p; (void)data;
413 static void proccess_features(
EST_String name,
419 int starts[EST_Regex_max_subexpressions];
420 int ends[EST_Regex_max_subexpressions];
422 int n = split(defs, names, MAX_FEATS, feat_sep);
423 for(
int i=0; i<n; i++)
429 if (def.
matches(featureDefRegex, 0, starts, ends))
431 feat = def.
at(starts[1], ends[1]-starts[1]);
432 attr = def.
at(starts[2], ends[2]-starts[2]);
443 printf(
"on %s got %s(%s)=%s\n", name,
459 (void)c; (void)p; (void)attributes; (void)name;
460 GenXML_Parse_State *state = (GenXML_Parse_State *)data;
467 if (state->utt != NULL
469 proccess_features(name, val, attributes, state->utt->f);
472 if (state->rel != NULL
474 proccess_features(name, val, attributes, state->rel->f);
485 EST_warning(
"%s\nNo feature '%s' to name relation\n", get_error(p), (
const char *)val);
488 EST_String relationType = attributes.
val(
"estRelationTypeAttr");
490 ensure_relation(state, relName);
491 state->rel_start_depth=state->depth;
492 state->linear=(attributes.
val(relationType) ==
"linear"||
493 attributes.
val(relationType) ==
"list");
495 printf(
"start of relation depth=%d name=%s type=%s\n", state->depth, (
const char *)relName, state->linear?
"linear":
"tree");
498 else if ((state->rel_start_depth >= 0 &&
504 printf(
"push depth=%d name=%s ig=%s\n", state->depth, name, (
const char *)ig);
507 ensure_relation(state, val);
509 state->depth_stack.push(state->open_depth);
510 state->open_depth=state->depth;
516 extract_ids(attributes, ids);
521 switch (ids.length())
539 for(them.
begin(attributes); them ; them++)
546 cont->
f.
set(
"id",
id);
551 if (state->current == NULL)
552 item = state->rel->append();
554 item = state->current->insert_after();
555 else if (state->current == NULL)
556 if (state->parent == NULL)
557 item = state->rel->append();
559 item = state->parent->append_daughter();
561 if (state->parent == NULL)
562 item = state->current->insert_after();
564 item = state->parent->append_daughter();
566 item->set_contents(cont);
575 bool embed = (attributes.
val(
"estExpansion") ==
"embed");
578 state->id=make_new_id(
"e");
579 element_open(c, p, data, name, attributes);
584 for(; idp!= NULL; idp = idp->next())
591 element_close(c, p, data, name);
596 element_open(c, p, data, name, attributes);
601 element_close(c, p, data, name);
607 if (state->parent!=NULL)
608 state->contentAttr = attributes.
val(
"estContentFeature");
611 printf(
"\t current=%s parent=%s contA=%s\n",
612 (
const char *)state->current->name(),
613 (
const char *)state->parent->name(),
614 (
const char *)state->contentAttr);
630 (void)c; (void)p; (void)attributes;
631 GenXML_Parse_State *state = (GenXML_Parse_State *)data;
634 element_open(c, p, data, name, attributes);
635 element_close(c, p, data, name);
644 (void)c; (void)p; (void)name;
645 GenXML_Parse_State *state = (GenXML_Parse_State *)data;
650 if (state->depth == state->rel_start_depth )
653 printf(
"end of relation depth=%d name=%s\n", state->depth, name);
655 state->rel_start_depth=-1;
659 state->depth == state->open_depth)
662 printf(
"pop depth=%d name=%s\n", state->depth, name);
664 state->current = state->parent;
665 state->parent=parent(state->parent);
666 state->open_depth = state->depth_stack.pop();
668 printf(
"\t current=%s parent=%s\n",
669 (
const char *)state->current->name(),
670 (
const char *)state->parent->name());
686 GenXML_Parse_State *state = (GenXML_Parse_State *)data;
690 state->parent->set(state->contentAttr, chars);
693 printf(
"GEN XML Parser [pcdata[%s]] %d\n", chars, state->depth);
703 (void)c; (void)p; (void)data; (void)chars;
707 printf(
"GEN XML Parser [cdata[%s]] %d\n", chars, state->depth);
715 const char *instruction)
717 (void)c; (void)p; (void)instruction;
718 GenXML_Parse_State *state = (GenXML_Parse_State *)data;
722 printf(
"GEN XML Parser [proc[%s]] %d\n", instruction, state->depth);
731 (void)c; (void)p; (void)data;
734 EST_error(
"GEN XML Parser %s", get_error(p));
742 #if defined(INSTANTIATE_TEMPLATES)
744 #include "../base_class/EST_THash.cc"