42 #include "EST_THash.h"
43 #include "EST_error.h"
45 #include "rxp/XML_Parser.h"
// Pattern tried first against the value of an "href" attribute (see the
// val.matches(simpleIDRegex, ...) call later in this file).
// NOTE(review): presumably EST_Regex treats \( \) as a capturing group and a
// bare parenthesis as a literal, so this matches "...#id(w<digits>)" with the
// digits as sub-expression 1 -- confirm against the EST_Regex syntax.
47 static EST_Regex simpleIDRegex(
".*#id(w\\([0-9]+\\))");
// Pattern tried second against the same "href" value. When it matches, the
// caller converts sub-expressions 1 and 2 with atoi() and uses them as the
// inclusive bounds of a range of word numbers.
48 static EST_Regex rangeIDRegex(
".*#id(w\\([0-9]+\\)).*id(w\\([0-9]+\\))");
62 Parse_State() : contents(100) {}
102 const char *instruction);
112 for(them.
begin(attributes); them ; them++)
114 (
const char *)them->k,
115 (
const char *)them->v);
// Entry point: takes an open FILE* and returns an EST_read_status.
// NOTE(review): the remaining parameters and most of the body are not
// visible in this excerpt.
118 EST_read_status solexml_read(FILE *file,
// Casting print_attributes to void references it without calling it --
// presumably to silence an unused-function warning; confirm.
124 (void)print_attributes;
125 Sole_Parser_Class pclass;
// Build a parser over the input. &state is handed to make_parser; the
// handler code in this file casts its `data` argument to Parse_State *,
// presumably receiving this same object -- confirm.
132 XML_Parser *parser = pclass.make_parser(file, name, &state);
// NOTE(review): the condition guarding this failure return is not visible.
136 return read_format_error;
// Make sure the parse state has a relation: if state->rel is still NULL,
// create one in state->utt named by state->relName and store it in
// state->rel. Does nothing when a relation is already set.
145 static
void ensure_relation(Parse_State *state)
147 if (state->rel==NULL)
149 state->rel = state->utt->create_relation(state->relName);
159 state->contents.add_item(
id, c);
// NOTE(review): the enclosing function header is not visible in this
// excerpt; a later call `extract_ids(attributes, ids)` suggests this is its
// body -- confirm.
// Read the value of the "id" attribute.
172 val = attributes.
val(
"id");
// Otherwise, if an "href" attribute is present, derive IDs from its value.
175 else if (attributes.
present(
"href"))
177 val = attributes.
val(
"href");
// Sub-expression offset arrays passed to matches() and read back below.
178 int starts[EST_Regex_max_subexpressions];
179 int ends[EST_Regex_max_subexpressions];
// First try the single-ID pattern.
181 if (val.
matches(simpleIDRegex, 0, starts, ends))
// Then the range pattern: sub-expressions 1 and 2 are converted to integers.
// NOTE(review): atoi() reports no errors; the patterns appear to restrict
// these substrings to digits, but overflow is not detected.
187 else if (val.
matches(rangeIDRegex, 0, starts, ends))
189 int n1 = atoi(val.
at(starts[1], ends[1]-starts[1]));
190 int n2 = atoi(val.
at(starts[2], ends[2]-starts[2]));
// Inclusive range: one "w<i>" name is formatted for each i from n1 to n2.
192 for(
int i=n1; i<=n2; i++)
// NOTE(review): buf's declaration is not visible here and sprintf is
// unbounded -- confirm buf is large enough for "w" plus any int.
195 sprintf(buf,
"w%d", i);
// Warn, quoting the attribute value that could not be interpreted.
202 EST_warning(
"element with bad ID or HREF '%s'", (
const char *)val);
// Formats "n<k>" from a pre-incremented counter.
// NOTE(review): the branch guarding this line is not visible; presumably it
// generates an ID when neither "id" nor "href" is present -- confirm.
207 sprintf(buf,
"n%d", ++count);
224 Parse_State *state = (Parse_State *)data;
236 (void)c; (void)p; (void)data;
// NOTE(review): the enclosing function header is not visible in this
// excerpt; a later call `element_open(c, p, data, name, attributes)`
// suggests this is its body -- confirm.
// NOTE(review): `attributes` is cast to void here yet is read below, so the
// cast is redundant for it.
246 (void)c; (void)p; (void)attributes;
247 Parse_State *state = (Parse_State *)data;
// "solexml" element: remember its "relation" attribute as the relation name
// and print it to stdout.
251 if (strcmp(name,
"solexml")==0)
253 state->relName=attributes.
val(
"relation");
254 printf(
"start solexml relation=%s\n", (
const char *)state->relName);
257 else if (strcmp(name,
"text-elem")==0)
// Create the relation on demand before the code below uses state->rel.
263 ensure_relation(state);
// The three element names handled by this branch.
265 if (strcmp(name,
"anaphora-elem")==0
266 || strcmp(name,
"wordlist")==0
267 || strcmp(name,
"w")==0)
// Collect the ID(s) named by this element's attributes into `ids`.
270 extract_ids(attributes, ids);
// Walk the collected IDs via next() until the end of the list.
274 for(; idp!= NULL; idp = idp->next())
// Step one level up: the parent becomes current, and its up() the new parent.
// NOTE(review): state->parent is dereferenced with no NULL check visible in
// this excerpt -- confirm a guard exists on the elided lines.
284 state->current = state->parent;
285 state->parent=state->parent->up();
// Iterate over every attribute of the element.
294 for(them.
begin(attributes); them ; them++)
// Choose where the new item goes. Only the tests and the three assignments
// are visible; the else keywords are elided. Presumably: no current and no
// parent -> append to the relation; no current but a parent -> insert below
// the parent; otherwise insert after current -- confirm.
303 if (state->current == NULL)
304 if (state->parent == NULL)
305 item = state->rel->append();
307 item = state->parent->insert_below();
309 item = state->current->insert_after();
// Attach the contents object `cont` to the new item.
311 item->set_contents(cont);
// Warn, naming the element that was not recognised.
318 EST_warning(
"SOLE XML Parser: unknown element %s", name);
// NOTE(review): the enclosing function header is not visible in this
// excerpt; the body looks like a handler for an element with no content,
// since it ends by calling both element_open and element_close -- confirm.
// NOTE(review): `attributes` is cast to void here yet is read below.
328 (void)c; (void)p; (void)attributes;
329 Parse_State *state = (Parse_State *)data;
// "language" element: copy its "name" attribute into the utterance feature
// "language".
331 if (strcmp(name,
"language")==0)
333 state->utt->f.set(
"language", attributes.
val(
"name"));
// Handle the element as an open immediately followed by a close.
// NOTE(review): whether these two calls run for every element or only in an
// else branch is not visible here.
337 element_open(c, p, data, name, attributes);
338 element_close(c, p, data, name);
// NOTE(review): the enclosing function header is not visible in this
// excerpt; an earlier call `element_close(c, p, data, name)` suggests this
// is its body -- confirm.
// NOTE(review): `name` is cast to void here yet is compared below, so the
// cast is redundant for it.
347 (void)c; (void)p; (void)name;
348 Parse_State *state = (Parse_State *)data;
// Only these three element names change the current/parent position.
350 if (strcmp(name,
"anaphora-elem")==0
351 || strcmp(name,
"wordlist")==0
352 || strcmp(name,
"w")==0)
// Step one level up: the parent becomes current, and its up() the new parent.
// NOTE(review): state->parent is dereferenced with no NULL check visible
// here. The second ';' on the last line is a harmless empty statement.
355 state->current = state->parent;
356 state->parent=state->parent->up();;
// NOTE(review): the enclosing function header is not visible in this
// excerpt; the body looks like the character-data handler -- confirm.
368 Parse_State *state = (Parse_State *)data;
// When a parent item exists and p.context(0) equals "w", store `chars` on
// the parent under the feature name "word".
// NOTE(review): context(0) presumably names the innermost open element.
// If context() returns a raw char pointer rather than a string class, this
// == would compare addresses, not text -- confirm the return type.
370 if (state->parent != NULL && p.
context(0) ==
"w")
371 state->parent->set(
EST_String(
"word"), chars);
382 (void)c; (void)p; (void)data; (void)chars;
392 const char *instruction)
395 Parse_State *state = (Parse_State *)data;
397 printf(
"SOLE XML Parser [proc[%s]] %d\n", instruction, state->depth);
405 (void)c; (void)p; (void)data;
408 EST_error(
"SOLE XML Parser %s", get_error(p));