Edinburgh Speech Tools  2.4-release
 All Classes Functions Variables Typedefs Enumerations Enumerator Friends Pages
xmlparser.c
1 /*************************************************************************/
2 /* */
3 /* Copyright (c) 1997-98 Richard Tobin, Language Technology Group, HCRC, */
4 /* University of Edinburgh. */
5 /* */
6 /* THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND, */
7 /* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */
8 /* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
9 /* IN NO EVENT SHALL THE AUTHOR OR THE UNIVERSITY OF EDINBURGH BE LIABLE */
10 /* FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF */
11 /* CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION */
12 /* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
13 /* */
14 /*************************************************************************/
15 /* $Id: xmlparser.c,v 1.3 2004/05/04 00:00:17 awb Exp $ */
16 
17 #ifndef lint
18 static char vcid[] = "$Id: xmlparser.c,v 1.3 2004/05/04 00:00:17 awb Exp $";
19 #endif /* lint */
20 
21 /*
22  * XML (and nSGML) parser.
23  * Author: Richard Tobin.
24  */
25 
26 #include <stdarg.h>
27 #include <stdlib.h>
28 
29 #ifdef FOR_LT
30 
31 #include "lt-memory.h"
32 #include "nsllib.h"
33 
34 #define Malloc salloc
35 #define Realloc srealloc
36 #define Free sfree
37 
38 #else
39 
40 #include "system.h"
41 
42 #endif
43 
44 #include "charset.h"
45 #include "string16.h"
46 #include "ctype16.h"
47 #include "dtd.h"
48 #include "input.h"
49 #include "stdio16.h"
50 #include "xmlparser.h"
51 
52 static int transcribe(Parser p, int back, int count);
53 static void pop_while_at_eoe(Parser p);
54 static void maybe_uppercase(Parser p, Char *s);
55 static void maybe_uppercase_name(Parser p);
56 static int str_maybecase_cmp8(Parser p, const char8 *a, const char8 *b);
57 static int is_ascii_alpha(int c);
58 static int is_ascii_digit(int c);
59 static int parse_external_id(Parser p, int required,
60  char8 **publicid, char8 **systemid,
61  int preq, int sreq);
62 static int parse_conditional(Parser p);
63 static int parse_notation_decl(Parser p);
64 static int parse_entity_decl(Parser p, Entity ent, int line, int chpos);
65 static int parse_attlist_decl(Parser p);
66 static int parse_element_decl(Parser p);
67 static ContentParticle parse_cp(Parser p);
68 static ContentParticle parse_choice_or_seq(Parser p);
69 static ContentParticle parse_choice_or_seq_1(Parser p, int nchildren,char sep);
70 static int check_content_decl(Parser p, ContentParticle cp);
71 static int check_content_decl_1(Parser p, ContentParticle cp);
72 static Char *stringify_cp(ContentParticle cp);
73 static void print_cp(ContentParticle cp, FILE16 *f);
74 static int size_cp(ContentParticle cp);
75 void FreeContentParticle(ContentParticle cp);
76 static int parse_reference(Parser p, int pe, int expand, int allow_external);
77 static int parse_character_reference(Parser p, int expand);
78 static const char8 *escape(int c);
79 static int parse_name(Parser p, const char8 *where);
80 static int parse_nmtoken(Parser p, const char8 *where);
81 static int looking_at(Parser p, const char8 *string);
82 static void clear_xbit(XBit xbit);
83 static int expect(Parser p, int expected, const char8 *where);
84 static int expect_dtd_whitespace(Parser p, const char8 *where);
85 static void skip_whitespace(InputSource s);
86 static int skip_dtd_whitespace(Parser p, int allow_pe);
87 static int parse_cdata(Parser p);
88 static int process_nsl_decl(Parser p);
89 static int process_xml_decl(Parser p);
90 static int parse_dtd(Parser p);
91 static int read_markupdecls(Parser p);
92 static int error(Parser p, const char8 *format, ...);
93 static void warn(Parser p, const char8 *format, ...);
94 static void verror(XBit bit, const char8 *format, va_list args);
95 enum literal_type {LT_cdata_attr, LT_tok_attr, LT_plain, LT_entity};
96 static int parse_string(Parser p, const char8 *where, enum literal_type type);
97 static int parse_pi(Parser p);
98 static int parse_comment(Parser p, int skip);
99 static int parse_pcdata(Parser p);
100 static int parse_starttag(Parser p);
101 static int parse_attribute(Parser p);
102 static int parse_endtag(Parser p);
103 static int parse_markup(Parser p);
104 static int parse(Parser p);
105 static int parse_markupdecl(Parser p);
106 
/*
 * Error-propagation helpers: make the enclosing function return -1
 * (require) or 0 (require0) when the expression is negative.
 * The argument is parenthesised so that low-precedence expressions such
 * as "cond ? 0 : -1" are compared as a whole.  The "if() {} else" shape
 * lets an invocation be followed by ';' and used inside an unbraced
 * if/else without capturing the caller's else.
 */
#define require(x)  if((x) >= 0) {} else return -1
#define require0(x) if((x) >= 0) {} else return 0

/* Forget a buffer without freeing it (ownership has been handed on):
   zeroes both buf and its companion bufsize field. */
#define Consume(buf) (buf = 0, buf##size = 0)

/* Make sure buf has room for sz Chars plus a terminator, growing it with
   Realloc if necessary.  Needs a Parser "p" in scope and an enclosing
   function that returns int; on allocation failure it returns the result
   of error(). */
#define ExpandBuf(buf, sz) \
    if(buf##size >= (sz)+1) {} else if((buf = Realloc(buf, (buf##size = (sz) + 1) * sizeof(Char)))) {} else return error(p, "System error")

/* Set n to a freshly allocated, zero-terminated copy of the name most
   recently parsed (p->name, p->namelen); on allocation failure return
   the result of error() from the enclosing function. */
#define CopyName(n) if((n = Malloc((p->namelen + 1)*sizeof(Char)))) {memcpy(n, p->name, p->namelen * sizeof(Char)); n[p->namelen] = 0;} else return error(p, "System error");

/* As CopyName, but for functions that signal failure by returning 0. */
#define CopyName0(n) if((n = Malloc((p->namelen + 1)*sizeof(Char)))) {memcpy(n, p->name, p->namelen * sizeof(Char)); n[p->namelen] = 0;} else {error(p, "System error"); return 0;}
117 
118 const char8 *XBitTypeName[XBIT_enum_count] = {
119  "dtd",
120  "start",
121  "empty",
122  "end",
123  "eof",
124  "pcdata",
125  "pi",
126  "comment",
127  "cdsect",
128  "xml",
129  "error",
130  "warning",
131  "none"
132 };
133 
134 static Entity xml_builtin_entity;
135 static Entity xml_predefined_entities;
136 
137 int ParserInit(void)
138 {
139  static int initialised = 0;
140  Entity e, f;
141  int i;
142  static const Char lt[] = {'l','t',0}, ltval[] = {'&','#','6','0',';',0};
143  static const Char gt[] = {'g','t',0}, gtval[] = {'&','#','6','2',';',0};
144  static const Char amp[] = {'a','m','p',0},
145  ampval[] = {'&','#','3','8',';',0};
146  static const Char apos[] = {'a','p','o','s',0}, aposval[] = {'\'',0};
147  static const Char quot[] = {'q','u','o','t',0}, quotval[] = {'"',0};
148  static const Char *builtins[5][2] = {
149  {lt, ltval}, {gt, gtval}, {amp, ampval},
150  {apos, aposval}, {quot, quotval}
151  };
152  (void)vcid;
153 
154  if(initialised)
155  return 0;
156  initialised = 1;
157 
158  init_charset();
159  init_ctype16();
160  init_stdio16();
161 
162  for(i=0, f=0; i<5; i++, f=e)
163  {
164  e = NewInternalEntity(builtins[i][0], builtins[i][1],
165  xml_builtin_entity, 0, 0, 0);
166  if(!e)
167  return -1;
168  e->next = f;
169  }
170 
171  xml_predefined_entities = e;
172 
173  return 0;
174 }
175 
176 static void skip_whitespace(InputSource s)
177 {
178  int c;
179 
180  while((c = get(s)) != XEOE && is_xml_whitespace(c))
181  ;
182  unget(s);
183 }
184 
185 /*
186  * Skip whitespace and (optionally) the start and end of PEs. Return 1 if
187  * there actually *was* some whitespace or a PE start/end, -1 if
188  * an error occurred, 0 otherwise.
189  */
190 
191 static int skip_dtd_whitespace(Parser p, int allow_pe)
192 {
193  int c;
194  int got_some = 0;
195  InputSource s = p->source;
196 
197  while(1)
198  {
199  c = get(s);
200 
201  if(c == XEOE)
202  {
203  got_some = 1;
204  if(s->parent)
205  {
206  if(!allow_pe)
207  return error(p,
208  "PE end not allowed here in internal subset");
209  if(s->entity->type == ET_external)
210  p->external_pe_depth--;
211  ParserPop(p);
212  s = p->source;
213  }
214  else
215  {
216  unget(s); /* leave the final EOE waiting to be read */
217  return got_some;
218  }
219  }
220  else if(is_xml_whitespace(c))
221  {
222  got_some = 1;
223  }
224  else if(c == '%')
225  {
226  /* this complication is needed for <!ENTITY % ...
227  otherwise we could just assume it was a PE reference. */
228 
229  c = get(s); unget(s);
230  if(c != XEOE && is_xml_namestart(c))
231  {
232  if(!allow_pe)
233  {
234  unget(s); /* For error position */
235  return error(p,
236  "PE ref not allowed here in internal subset");
237  }
238  require(parse_reference(p, 1, 1, 1));
239  s = p->source;
240  if(s->entity->type == ET_external)
241  p->external_pe_depth++;
242  got_some = 1;
243  }
244  else
245  {
246  unget(s);
247  return got_some;
248  }
249  }
250  else
251  {
252  unget(s);
253  return got_some;
254  }
255  }
256 }
257 
258 static int expect(Parser p, int expected, const char8 *where)
259 {
260  int c;
261  InputSource s = p->source;
262 
263  c = get(s);
264  if(c != expected)
265  {
266  unget(s); /* For error position */
267  return error(p, "Expected %s %s, but got %s",
268  escape(expected), where, escape(c));
269  }
270 
271  return 0;
272 }
273 
274 /*
275  * Expects whitespace or the start or end of a PE.
276  */
277 
278 static int expect_dtd_whitespace(Parser p, const char8 *where)
279 {
280  int r = skip_dtd_whitespace(p, p->external_pe_depth > 0);
281 
282  if(r < 0)
283  return -1;
284 
285  if(r == 0)
286  return error(p, "Expected whitespace %s", where);
287 
288  return 0;
289 }
290 
291 static void clear_xbit(XBit xbit)
292 {
293  xbit->type = XBIT_none;
294  xbit->s1 = xbit->s2 = 0;
295  xbit->S1 = xbit->S2 = 0;
296  xbit->attributes = 0;
297  xbit->element_definition = 0;
298 }
299 
300 void FreeXBit(XBit xbit)
301 {
302  Attribute a, b;
303 
304  if(xbit->S1) Free(xbit->S1);
305  if(xbit->S2) Free(xbit->S2);
306  if(xbit->type != XBIT_error && xbit->type != XBIT_warning && xbit->s1)
307  Free(xbit->s1);
308  if(xbit->s2) Free(xbit->s2);
309  for(a = xbit->attributes; a; a = b)
310  {
311  b = a->next;
312  if(a->value) Free(a->value);
313  Free(a);
314  }
315  clear_xbit(xbit);
316 }
317 
318 /*
319  * Returns 1 if the input matches string (and consume the input).
320  * Otherwise returns 0 and leaves the input stream where it was.
321  * Case-sensitivity depends on the CaseInsensitive flag.
322  * A space character at end of string matches any (non-zero) amount of
323  * whitespace; space are treated literally elsewhere.
324  * Never reads beyond an end-of-line, except to consume
325  * extra whitespace when the last character of string is a space.
326  * Never reads beyond end-of-entity.
327  */
328 
329 static int looking_at(Parser p, const char8 *string)
330 {
331  InputSource s = p->source;
332  int c, d;
333  int save = s->next;
334 
335  for(c = *string++; c; c = *string++)
336  {
337  if(at_eol(s))
338  goto fail; /* We would go over a line end */
339 
340  d = get(s);
341 
342  if(c == ' ' && *string == 0)
343  {
344  if(d == XEOE || !is_xml_whitespace(d))
345  goto fail;
346  skip_whitespace(s);
347  }
348  else
349  if((ParserGetFlag(p, CaseInsensitive) &&
350  Toupper(d) != Toupper(c)) ||
351  (!ParserGetFlag(p, CaseInsensitive) && d != c))
352  goto fail;
353  }
354 
355  return 1;
356 
357 fail:
358  s->next = save;
359  return 0;
360 }
361 
362 static int parse_name(Parser p, const char8 *where)
363 {
364  InputSource s = p->source;
365  int c, i;
366 
367  c = get(s);
368  if(c == XEOE || !is_xml_namestart(c))
369  {
370  unget(s); /* For error position */
371  error(p, "Expected name, but got %s %s", escape(c), where);
372  return -1;
373  }
374  i = 1;
375 
376  while(c = get(s), (c != XEOE && is_xml_namechar(c)))
377  i++;
378  unget(s);
379 
380  p->name = s->line + s->next - i;
381  p->namelen = i;
382 
383  return 0;
384 }
385 
386 static int parse_nmtoken(Parser p, const char8 *where)
387 {
388  InputSource s = p->source;
389  int c, i=0;
390 
391  while(c = get(s), (c !=XEOE && is_xml_namechar(c)))
392  i++;
393  unget(s);
394 
395  if(i == 0)
396  return error(p, "Expected nmtoken value, but got %s %s",
397  escape(c), where);
398 
399  p->name = s->line + s->next - i;
400  p->namelen = i;
401 
402  return 0;
403 }
404 
405 /* Escape a character for printing n an error message.
406  NB returns 5 static storage buffers in rotation. */
407 
408 static const char8 *escape(int c)
409 {
410  static char8 buf[5][15];
411  static int bufnum=-1;
412 
413 #if CHAR_SIZE == 8
414  if(c != XEOE)
415  c &= 0xff;
416 #endif
417 
418  bufnum = (bufnum + 1) % 5;
419 
420  if(c == XEOE)
421  return "<EOE>";
422  else if(c >= 33 && c <= 126)
423  sprintf(buf[bufnum], "%c", c);
424  else if(c == ' ')
425  sprintf(buf[bufnum], "<space>");
426  else
427  sprintf(buf[bufnum], "<0x%x>", c);
428 
429  return buf[bufnum];
430 }
431 
432 Parser NewParser(void)
433 {
434  Parser p;
435 
436  if(ParserInit() == -1)
437  return 0;
438 
439  p = Malloc(sizeof(*p));
440  if(!p)
441  return 0;
442  p->state = PS_prolog1;
443  p->document_entity = 0; /* Set at first ParserPush */
444  p->have_dtd = 0;
445  p->standalone = SDD_unspecified;
446  p->flags = 0;
447  p->source = 0;
448  clear_xbit(&p->xbit);
449 #ifndef FOR_LT
450  p->xbit.nchildren = 0; /* These three should never be changed */
451  p->xbit.children = 0;
452  p->xbit.parent = 0;
453 #endif
454  p->pbufsize = p->pbufnext = 0;
455  p->pbuf = 0;
456  p->peeked = 0;
457  p->dtd = NewDtd();
458  p->dtd_callback = p->warning_callback = 0;
459  p->entity_opener = 0;
460  p->callback_arg = 0;
461  p->external_pe_depth = 0;
462 
463  p->element_stack = 0;
464  p->element_stack_alloc = 0;
465  p->element_depth = 0;
466 
467  ParserSetFlag(p, XMLPiEnd, 1);
468  ParserSetFlag(p, XMLEmptyTagEnd, 1);
469  ParserSetFlag(p, XMLPredefinedEntities, 1);
470  ParserSetFlag(p, XMLExternalIDs, 1);
471  ParserSetFlag(p, XMLMiscWFErrors, 1);
472  ParserSetFlag(p, ErrorOnUnquotedAttributeValues, 1);
473  ParserSetFlag(p, XMLLessThan, 1);
474  ParserSetFlag(p, IgnoreEntities, 0);
475  ParserSetFlag(p, ExpandGeneralEntities, 1);
476  ParserSetFlag(p, ExpandCharacterEntities, 1);
477  ParserSetFlag(p, NormaliseAttributeValues, 1);
478  ParserSetFlag(p, WarnOnUndefinedElements, 1);
479  ParserSetFlag(p, WarnOnUndefinedAttributes, 1);
480  ParserSetFlag(p, WarnOnRedefinitions, 1);
481  ParserSetFlag(p, TrustSDD, 1);
482  ParserSetFlag(p, ReturnComments, 1);
483  ParserSetFlag(p, CheckEndTagsMatch, 1);
484 
485  return p;
486 }
487 
488 void FreeParser(Parser p)
489 {
490  while (p->source)
491  ParserPop(p); /* Will close file */
492 
493  Free(p->pbuf);
494  Free(p->element_stack);
495  Free(p);
496 }
497 
498 InputSource ParserRootSource(Parser p)
499 {
500  InputSource s;
501 
502  for(s=p->source; s && s->parent; s = s->parent)
503  ;
504 
505  return s;
506 }
507 
508 Entity ParserRootEntity(Parser p)
509 {
510  return ParserRootSource(p)->entity;
511 }
512 
513 void ParserSetCallbackArg(Parser p, void *arg)
514 {
515  p->callback_arg = arg;
516 }
517 
518 void ParserSetDtdCallback(Parser p, CallbackProc cb)
519 {
520  p->dtd_callback = cb;
521 }
522 
523 void ParserSetWarningCallback(Parser p, CallbackProc cb)
524 {
525  p->warning_callback = cb;
526 }
527 
528 void ParserSetEntityOpener(Parser p, EntityOpenerProc opener)
529 {
530  p->entity_opener = opener;
531 }
532 
533 #ifndef FOR_LT
534 
535 XBit ReadXTree(Parser p)
536 {
537  XBit bit, tree, child;
538  XBit *children;
539 
540  bit = ReadXBit(p);
541 
542  switch(bit->type)
543  {
544  case XBIT_error:
545  return bit;
546 
547  case XBIT_start:
548  if(!(tree = Malloc(sizeof(*tree))))
549  {
550  error(p, "System error");
551  return &p->xbit;
552  }
553  *tree = *bit;
554  while(1)
555  {
556  child = ReadXTree(p);
557  switch(child->type)
558  {
559  case XBIT_error:
560  FreeXTree(tree);
561  return child;
562 
563  case XBIT_eof:
564  FreeXTree(tree);
565  {
566  error(p, "EOF in element");
567  return &p->xbit;
568  }
569 
570  case XBIT_end:
571  if(child->element_definition != tree->element_definition)
572  {
573  const Char *name1 = tree->element_definition->name,
574  *name2 = child->element_definition->name;
575  FreeXTree(tree);
576  FreeXTree(child);
577  error(p, "Mismatched end tag: expected </%S>, got </%S>",
578  name1, name2);
579  return &p->xbit;
580  }
581  FreeXTree(child);
582  return tree;
583 
584  default:
585  children = Realloc(tree->children,
586  (tree->nchildren + 1) * sizeof(XBit));
587  if(!children)
588  {
589  FreeXTree(tree);
590  FreeXTree(child);
591  error(p, "System error");
592  return &p->xbit;
593  }
594  child->parent = tree;
595  children[tree->nchildren] = child;
596  tree->nchildren++;
597  tree->children = children;
598  break;
599  }
600  }
601 
602  default:
603  if(!(tree = Malloc(sizeof(*tree))))
604  {
605  error(p, "System error");
606  return &p->xbit;
607  }
608  *tree = *bit;
609  return tree;
610  }
611 }
612 
613 void FreeXTree(XBit tree)
614 {
615  int i;
616 
617  for(i=0; i<tree->nchildren; i++)
618  FreeXTree(tree->children[i]);
619 
620  Free(tree->children);
621 
622  FreeXBit(tree);
623 
624  if(tree->type == XBIT_error)
625  /* error "trees" are always in the Parser structure, not malloced */
626  return;
627 
628  Free(tree);
629 }
630 
631 #endif /* (not) FOR_LT */
632 
633 XBit ReadXBit(Parser p)
634 {
635  if(p->peeked)
636  p->peeked = 0;
637  else
638  parse(p);
639 
640  return &p->xbit;
641 }
642 
643 XBit PeekXBit(Parser p)
644 {
645  if(p->peeked)
646  error(p, "Attempt to peek twice");
647  else
648  {
649  parse(p);
650  p->peeked = 1;
651  }
652 
653  return &p->xbit;
654 }
655 
656 int ParserPush(Parser p, InputSource source)
657 {
658  if(!p->source && !p->document_entity)
659  p->document_entity = source->entity;
660 
661  source->parent = p->source;
662  p->source = source;
663 
664  if(source->entity->type == ET_internal)
665  return 0;
666 
667  /* Look at first few bytes of external entities to guess encoding,
668  then look for an XMLDecl or TextDecl. */
669 
670  if(source->entity->encoding == CE_unknown) /* we might already know */
671  determine_character_encoding(source);
672 
673 #if CHAR_SIZE == 8
674  if(!EncodingIsAsciiSuperset(source->entity->encoding))
675  return error(p, "Unsupported character encoding %s",
676  CharacterEncodingName[source->entity->encoding]);
677 #else
678  if(source->entity->encoding == CE_unknown)
679  return error(p, "Unknown character encoding");
680 #endif
681 
682  get(source); unget(source); /* To get the first line read */
683 
684  source->entity->ml_decl = ML_unspecified;
685  if(looking_at(p, "<?NSL "))
686  return process_nsl_decl(p);
687  if(looking_at(p, "<?xml "))
688  {
689  require(process_xml_decl(p));
690  if(source->entity == p->document_entity &&
691  !source->entity->version_decl)
692  return error(p, "XML declaration in document entity lacked "
693  "version number");
694  if(source->entity != p->document_entity &&
695  source->entity->standalone_decl != SDD_unspecified)
696  return error(p, "Standalone attribute not allowed except in "
697  "document entity");
698  return 0;
699  }
700  else if(!ParserGetFlag(p, XMLStrictWFErrors) && looking_at(p, "<?XML "))
701  {
702  warn(p, "Found <?XML instead of <?xml; switching to case-"
703  "insensitive mode");
704  ParserSetFlag(p, CaseInsensitive, 1);
705  return process_xml_decl(p);
706  }
707  else
708  return 0;
709 }
710 
711 void ParserPop(Parser p)
712 {
713  InputSource source;
714 
715  source = p->source;
716  Fclose(source->file16);
717  p->source = source->parent;
718 
719  if(source->entity->type == ET_external)
720  Free(source->line);
721  Free(source);
722 }
723 
724 /* Returns true if the source is at EOE. If so, the EOE will have been read. */
725 
726 static int at_eoe(InputSource s)
727 {
728  if(!at_eol(s))
729  return 0;
730  if(s->seen_eoe || get_with_fill(s) == XEOE)
731  return 1;
732  unget(s);
733  return 0;
734 }
735 
736 /* Pops any sources that are at EOE. Leaves source buffer with at least
737  one character in it (except at EOF, where it leaves the EOE unread). */
738 
739 static void pop_while_at_eoe(Parser p)
740 {
741  while(1)
742  {
743  InputSource s = p->source;
744 
745  if(!at_eoe(s))
746  return;
747  if(!s->parent)
748  {
749  unget(s);
750  return;
751  }
752  ParserPop(p);
753  }
754 }
755 
756 void ParserSetFlag(Parser p, ParserFlag flag, int value)
757 {
758  if(value)
759  p->flags |= (1 << flag);
760  else
761  p->flags &= ~(1 << flag);
762 
763  if(flag == XMLPredefinedEntities)
764  {
765  if(value)
766  p->dtd->predefined_entities = xml_predefined_entities;
767  else
768  p->dtd->predefined_entities = 0;
769  }
770 }
771 
772 void ParserPerror(Parser p, XBit bit)
773 {
774  int linenum, charnum;
775  InputSource s;
776 
777  Fprintf(Stderr, "%s: %s\n",
778  bit->type == XBIT_error ? "Error" : "Warning",
779  bit->error_message);
780 
781 
782  for(s=p->source; s; s=s->parent)
783  {
784  if(s->entity->name)
785  Fprintf(Stderr, " in entity \"%S\"", s->entity->name);
786  else
787  Fprintf(Stderr, " in unnamed entity");
788 
789  switch(SourceLineAndChar(s, &linenum, &charnum))
790  {
791  case 1:
792  Fprintf(Stderr, " at line %d char %d of", linenum+1, charnum+1);
793  break;
794  case 0:
795  Fprintf(Stderr, " defined at line %d char %d of",
796  linenum+1, charnum+1);
797  break;
798  case -1:
799  Fprintf(Stderr, " defined in");
800  break;
801  }
802 
803  Fprintf(Stderr, " %s\n", EntityDescription(s->entity));
804  }
805 }
806 
807 
808 static int parse(Parser p)
809 {
810  int c;
811  InputSource s;
812 
813  if(p->state == PS_end || p->state == PS_error)
814  {
815  /* After an error or EOF, just keep returning EOF */
816  p->xbit.type = XBIT_eof;
817  return 0;
818  }
819 
820  clear_xbit(&p->xbit);
821 
822  if(p->state <= PS_prolog2 || p->state == PS_epilog)
823  skip_whitespace(p->source);
824 
825 restart:
826  pop_while_at_eoe(p);
827  s = p->source;
828  SourcePosition(s, &p->xbit.entity, &p->xbit.byte_offset);
829 
830  switch(c = get(s))
831  {
832  case XEOE:
833  if(p->state != PS_epilog)
834  return error(p, "Document ends too soon");
835  p->state = PS_end;
836  p->xbit.type = XBIT_eof;
837  return 0;
838  case '<':
839  return parse_markup(p);
840  case '&':
841  if(ParserGetFlag(p, IgnoreEntities))
842  goto pcdata;
843  if(p->state <= PS_prolog2)
844  return error(p, "Entity reference not allowed in prolog");
845  if(looking_at(p, "#"))
846  {
847  /* a character reference - go back and parse as pcdata */
848  unget(s);
849  goto pcdata;
850  }
851  if(ParserGetFlag(p, ExpandGeneralEntities))
852  {
853  /* an entity reference - push it and start again */
854  require(parse_reference(p, 0, 1, 1));
855  goto restart;
856  }
857  /* not expanding general entities, so treat as pcdata */
858  goto pcdata;
859  default:
860  pcdata:
861  unget(s);
862  return parse_pcdata(p);
863  }
864 }
865 
866 /* Called after reading '<' */
867 
868 static int parse_markup(Parser p)
869 {
870  InputSource s = p->source;
871  int c = get(s);
872 
873  switch(c)
874  {
875  case '!':
876  if(looking_at(p, "--"))
877  {
878  if(ParserGetFlag(p, ReturnComments))
879  return parse_comment(p, 0);
880  else
881  {
882  require(parse_comment(p, 1));
883  return parse(p);
884  }
885  }
886  else if(looking_at(p, "DOCTYPE "))
887  return parse_dtd(p);
888  else if(looking_at(p, "[CDATA["))
889  return parse_cdata(p);
890  else
891  return error(p, "Syntax error after <!");
892 
893  case '/':
894  return parse_endtag(p);
895 
896  case '?':
897  return parse_pi(p);
898 
899  default:
900  unget(s);
901  if(!ParserGetFlag(p, XMLLessThan) &&
902  (c == XEOE || !is_xml_namestart(c)))
903  {
904  /* In nSGML, recognise < as stago only if followed by namestart */
905 
906  unget(s); /* put back the < */
907  return parse_pcdata(p);
908  }
909  return parse_starttag(p);
910  }
911 }
912 
913 static int parse_endtag(Parser p)
914 {
915  ElementDefinition def;
916  Entity ent;
917 
918  p->xbit.type = XBIT_end;
919  require(parse_name(p, "after </"));
920  maybe_uppercase_name(p);
921 
922  if(ParserGetFlag(p, CheckEndTagsMatch))
923  {
924  if(p->element_depth <= 0)
925  return error(p, "End tag </%.*S> outside of any element",
926  p->namelen, p->name);
927 
928  ent = p->element_stack[--p->element_depth].entity;
929  def = p->element_stack[p->element_depth].definition;
930 
931  if(p->namelen == def->namelen &&
932  memcmp(p->name, def->name, p->namelen * sizeof(Char)) == 0)
933  p->xbit.element_definition = def;
934  else
935  return error(p, "Mismatched end tag: expected </%S>, got </%.*S>",
936  def->name, p->namelen, p->name);
937 
938  if(ent != p->source->entity)
939  return error(p, "Element ends in different entity from that "
940  "in which it starts");
941 
942  if(p->element_depth == 0)
943  p->state = PS_epilog;
944  }
945  else
946  {
947  p->xbit.element_definition = FindElementN(p->dtd, p->name, p->namelen);
948  if(!p->xbit.element_definition)
949  return error(p, "End tag for unknown element %.*S",
950  p->namelen, p->name);
951  }
952 
953  skip_whitespace(p->source);
954  return expect(p, '>', "after name in end tag");
955 }
956 
957 static int parse_starttag(Parser p)
958 {
959  int c;
960 
961  if(p->state == PS_epilog && !ParserGetFlag(p, AllowMultipleElements))
962  return error(p, "Document contains multiple elements");
963 
964  p->state = PS_body;
965 
966  require(parse_name(p, "after <"));
967  maybe_uppercase_name(p);
968 
969  p->xbit.element_definition = FindElementN(p->dtd, p->name, p->namelen);
970  if(!p->xbit.element_definition || p->xbit.element_definition->tentative)
971  {
972  if(p->have_dtd && ParserGetFlag(p, ErrorOnUndefinedElements))
973  return error(p, "Start tag for undeclared element %.*S",
974  p->namelen, p->name);
975  if(p->have_dtd && ParserGetFlag(p, WarnOnUndefinedElements))
976  warn(p, "Start tag for undeclared element %.*S; "
977  "declaring it to have content ANY",
978  p->namelen, p->name);
979  if(p->xbit.element_definition)
980  RedefineElement(p->xbit.element_definition, CT_any, 0);
981  else
982  {
983  if(!(p->xbit.element_definition =
984  DefineElementN(p->dtd, p->name, p->namelen, CT_any, 0)))
985  return error(p, "System error");
986  }
987  }
988 
989  while(1)
990  {
991  InputSource s = p->source;
992 
993  /* We could just do skip_whitespace here, but we will get a
994  better error message if we look a bit closer. */
995 
996  c = get(s);
997  if(c !=XEOE && is_xml_whitespace(c))
998  {
999  skip_whitespace(s);
1000  c = get(s);
1001  }
1002  else if(c != '>' &&
1003  !(ParserGetFlag(p, XMLEmptyTagEnd) && c == '/'))
1004  {
1005  unget(s); /* For error position */
1006  return error(p, "Expected whitespace or tag end in start tag");
1007  }
1008 
1009  if(c == '>')
1010  {
1011  p->xbit.type = XBIT_start;
1012  break;
1013  }
1014 
1015  if((ParserGetFlag(p, XMLEmptyTagEnd)) && c == '/')
1016  {
1017  require(expect(p, '>', "after / in start tag"));
1018  p->xbit.type = XBIT_empty;
1019  break;
1020  }
1021 
1022  unget(s);
1023 
1024  require(parse_attribute(p));
1025  }
1026 
1027  if(ParserGetFlag(p, CheckEndTagsMatch))
1028  {
1029  if(p->xbit.type == XBIT_start)
1030  {
1031  if(p->element_depth == p->element_stack_alloc)
1032  {
1033  p->element_stack_alloc =
1034  p->element_stack_alloc == 0 ? 20 :
1035  p->element_stack_alloc * 2;
1036  if(!(p->element_stack =
1037  Realloc(p->element_stack,
1038  (p->element_stack_alloc * sizeof(*p->element_stack)))))
1039  return error(p, "System error");
1040  }
1041  p->element_stack[p->element_depth].definition =
1042  p->xbit.element_definition;
1043  p->element_stack[p->element_depth++].entity = p->source->entity;
1044  }
1045  else
1046  if(p->element_depth == 0)
1047  p->state = PS_epilog;
1048  }
1049 
1050  if(ParserGetFlag(p, ReturnDefaultedAttributes))
1051  {
1052  AttributeDefinition d;
1053  Attribute a;
1054 
1055  for(d=NextAttributeDefinition(p->xbit.element_definition, 0);
1056  d;
1057  d=NextAttributeDefinition(p->xbit.element_definition, d))
1058  {
1059  if(!d->default_value)
1060  continue;
1061  for(a=p->xbit.attributes; a; a=a->next)
1062  if(a->definition == d)
1063  break;
1064  if(!a)
1065  {
1066  if(!(a = Malloc(sizeof(*a))))
1067  return error(p, "System error");
1068  a->definition = d;
1069  if(!(a->value = Strdup(d->default_value)))
1070  return error(p, "System error");
1071  a->quoted = 1;
1072  a->next = p->xbit.attributes;
1073  p->xbit.attributes = a;
1074  }
1075  }
1076  }
1077 
1078  return 0;
1079 }
1080 
1081 static int parse_attribute(Parser p)
1082 {
1083  InputSource s = p->source;
1084  AttributeDefinition def;
1085  struct attribute *a;
1086  int c;
1087 
1088  require(parse_name(p, "for attribute"));
1089  maybe_uppercase_name(p);
1090 
1091  def = FindAttributeN(p->xbit.element_definition, p->name, p->namelen);
1092  if(!def)
1093  {
1094  if(p->have_dtd && ParserGetFlag(p, ErrorOnUndefinedAttributes))
1095  return error(p, "Undeclared attribute %.*S for element %S",
1096  p->namelen, p->name, p->xbit.element_definition->name);
1097  if(p->have_dtd && ParserGetFlag(p, WarnOnUndefinedAttributes))
1098  warn(p, "Undeclared attribute %.*S for element %S; "
1099  "declaring it as CDATA #IMPLIED",
1100  p->namelen, p->name, p->xbit.element_definition->name);
1101  if(!(def = DefineAttributeN(p->xbit.element_definition,
1102  p->name, p->namelen,
1103  AT_cdata, 0, DT_implied, 0)))
1104  return error(p, "System error");
1105  }
1106 
1107  for(a = p->xbit.attributes; a; a = a->next)
1108  if(a->definition == def)
1109  return error(p, "Repeated attribute %.*S", p->namelen, p->name);
1110 
1111  if(!(a = Malloc(sizeof(*a))))
1112  return error(p, "System error");
1113 
1114  a->value = 0; /* in case of error */
1115  a->next = p->xbit.attributes;
1116  p->xbit.attributes = a;
1117  a->definition = def;
1118 
1119  skip_whitespace(s);
1120  require(expect(p, '=', "after attribute name"));
1121 
1122  skip_whitespace(s);
1123  c = get(s);
1124  unget(s);
1125  switch(c)
1126  {
1127  case '"':
1128  case '\'':
1129  a->quoted = 1;
1130  require(parse_string(p, "in attribute value",
1131  a->definition->type == AT_cdata ? LT_cdata_attr :
1132  LT_tok_attr));
1133  a->value = p->pbuf;
1134  Consume(p->pbuf);
1135  break;
1136  default:
1137  if(ParserGetFlag(p, ErrorOnUnquotedAttributeValues))
1138  return error(p, "Value of attribute is unquoted");
1139  a->quoted = 0;
1140  require(parse_nmtoken(p, "in unquoted attribute value"));
1141  CopyName(a->value);
1142  break;
1143  }
1144 
1145  return 0;
1146 }
1147 
1148 static int transcribe(Parser p, int back, int count)
1149 {
1150  ExpandBuf(p->pbuf, p->pbufnext + count);
1151  memcpy(p->pbuf + p->pbufnext,
1152  p->source->line + p->source->next - back,
1153  count * sizeof(Char));
1154  p->pbufnext += count;
1155  return 0;
1156 }
1157 
1158 /* Called after pushing back the first character of the pcdata */
1159 
1160 static int parse_pcdata(Parser p)
1161 {
1162  int count = 0;
1163  InputSource s;
1164  Char *buf;
1165  int next, buflen;
1166 
1167  if(p->state <= PS_prolog2)
1168  return error(p, "Character data not allowed in prolog");
1169  if(p->state == PS_epilog)
1170  return error(p, "Character data not allowed after body");
1171 
1172  s = p->source;
1173  buf = s->line;
1174  next = s->next;
1175  buflen = s->line_length;
1176 
1177  p->pbufnext = 0;
1178 
1179  while(1)
1180  {
1181  if(next == buflen)
1182  {
1183  s->next = next;
1184  if(count > 0)
1185  {
1186  require(transcribe(p, count, count));
1187  }
1188  count = 0;
1189  if(at_eoe(s))
1190  {
1191  if(!ParserGetFlag(p, MergePCData))
1192  goto done;
1193  else
1194  pop_while_at_eoe(p);
1195  }
1196  s = p->source;
1197  buf = s->line;
1198  next = s->next;
1199  buflen = s->line_length;
1200  if(next == buflen)
1201  goto done; /* must be EOF */
1202  }
1203 
1204  switch(buf[next++])
1205  {
1206  case '<':
1207  if(!ParserGetFlag(p, XMLLessThan))
1208  {
1209  /* In nSGML, don't recognise < as markup unless it looks ok */
1210  if(next == buflen)
1211  goto deflt;
1212  if(buf[next] != '!' && buf[next] != '/' && buf[next] != '?' &&
1213  !is_xml_namestart(buf[next]))
1214  goto deflt;
1215  }
1216  s->next = next;
1217  if(count > 0)
1218  {
1219  require(transcribe(p, count+1, count));
1220  }
1221  count = 0;
1222  if(!ParserGetFlag(p, ReturnComments) &&
1223  buflen >= next + 3 &&
1224  buf[next] == '!' && buf[next+1] == '-' && buf[next+2] == '-')
1225  {
1226  s->next = next + 3;
1227  require(parse_comment(p, 1));
1228  buflen = s->line_length;
1229  next = s->next;
1230  }
1231  else
1232  {
1233  s->next = next-1;
1234  goto done;
1235  }
1236  break;
1237  case '&':
1238  if(ParserGetFlag(p, IgnoreEntities))
1239  goto deflt;
1240  if(!ParserGetFlag(p, MergePCData) &&
1241  (p->pbufnext > 0 || count > 0))
1242  {
1243  /* We're returning references as separate bits, and we've
1244  come to one, and we've already got some data to return,
1245  so return what we've got and get the reference next time. */
1246 
1247  s->next = next-1;
1248  if(count > 0)
1249  {
1250  require(transcribe(p, count, count));
1251  }
1252  goto done;
1253  }
1254  if(buflen >= next+1 && buf[next] == '#')
1255  {
1256  /* It's a character reference */
1257 
1258  s->next = next+1;
1259  if(count > 0)
1260  {
1261  require(transcribe(p, count+2, count));
1262  }
1263  count = 0;
1264  require(parse_character_reference(p,
1265  ParserGetFlag(p, ExpandCharacterEntities)));
1266  next = s->next;
1267 
1268  if(!ParserGetFlag(p, MergePCData))
1269  goto done;
1270  }
1271  else
1272  {
1273  /* It's a general entity reference */
1274 
1275  s->next = next;
1276  if(count > 0)
1277  {
1278  require(transcribe(p, count+1, count));
1279  }
1280  count = 0;
1281  require(parse_reference(p, 0,
1282  ParserGetFlag(p, ExpandGeneralEntities),
1283  1));
1284  s = p->source;
1285  buf = s->line;
1286  buflen = s->line_length;
1287  next = s->next;
1288 
1289  if(!ParserGetFlag(p, MergePCData))
1290  goto done;
1291  }
1292  break;
1293  case ']':
1294  if(ParserGetFlag(p, XMLMiscWFErrors) &&
1295  buflen >= next + 2 &&
1296  buf[next] == ']' && buf[next+1] == '>')
1297  return error(p, "Illegal character sequence ']]>' in pcdata");
1298  /* fall through */
1299  default:
1300  deflt:
1301  count++;
1302  break;
1303  }
1304  }
1305 
1306  done:
1307  p->pbuf[p->pbufnext++] = 0;
1308  p->xbit.type = XBIT_pcdata;
1309  p->xbit.pcdata_chars = p->pbuf;
1310  Consume(p->pbuf);
1311 
1312  return 0;
1313 }
1314 
1315 /* Called after reading '<!--'. Won't go over an entity end. */
1316 
1317 static int parse_comment(Parser p, int skip)
1318 {
1319  InputSource s = p->source;
1320  int c, c1=0, c2=0;
1321  int count = 0;
1322 
1323  if(!skip)
1324  p->pbufnext = 0;
1325 
1326  while((c = get(s)) != XEOE)
1327  {
1328  count++;
1329  if(c1 == '-' && c2 == '-')
1330  {
1331  if(c == '>')
1332  break;
1333  unget(s); /* For error position */
1334  return error(p, "-- in comment");
1335  }
1336 
1337  if(at_eol(s))
1338  {
1339  if(!skip)
1340  {
1341  require(transcribe(p, count, count));
1342  }
1343  count = 0;
1344  }
1345  c2 = c1; c1 = c;
1346  }
1347 
1348  if(c == XEOE)
1349  return error(p, "EOE in comment");
1350 
1351  if(skip)
1352  return 0;
1353 
1354  require(transcribe(p, count, count-3));
1355  p->pbuf[p->pbufnext++] = 0;
1356  p->xbit.type = XBIT_comment;
1357  p->xbit.comment_chars = p->pbuf;
1358  Consume(p->pbuf);
1359 
1360  return 0;
1361 }
1362 
1363 static int parse_pi(Parser p)
1364 {
1365  InputSource s = p->source;
1366  int c, c1=0;
1367  int count = 0;
1368  Char xml[] = {'x', 'm', 'l', 0};
1369 
1370  require(parse_name(p, "after <?"));
1371  CopyName(p->xbit.pi_name);
1372 
1373  p->pbufnext = 0;
1374 
1375  if(Strcasecmp(p->xbit.pi_name, xml) == 0)
1376  {
1377  if(ParserGetFlag(p, XMLStrictWFErrors))
1378  return error(p, "Misplaced or wrong-case xml declaration");
1379  else
1380  warn(p, "Misplaced or wrong-case xml declaration; treating as PI");
1381  }
1382 
1383  /* Empty PI? */
1384 
1385  if(looking_at(p, ParserGetFlag(p, XMLPiEnd) ? "?>" : ">"))
1386  {
1387  ExpandBuf(p->pbuf, 0);
1388  goto done;
1389  }
1390 
1391  /* If non-empty, must be white space after name */
1392 
1393  c = get(s);
1394  if(c == XEOE || !is_xml_whitespace(c))
1395  return error(p, "Expected whitespace after PI name");
1396  skip_whitespace(s);
1397 
1398  while((c = get(s)) != XEOE)
1399  {
1400  count++;
1401  if(c == '>' &&
1402  (!ParserGetFlag(p, XMLPiEnd) || c1 == '?'))
1403  break;
1404  if(at_eol(s))
1405  {
1406  require(transcribe(p, count, count));
1407  count = 0;
1408  }
1409  c1 = c;
1410  }
1411 
1412  if(c == XEOE)
1413  return error(p, "EOE in PI");
1414 
1415  require(transcribe(p, count, count-(ParserGetFlag(p, XMLPiEnd) ? 2 : 1)));
1416 done:
1417  p->pbuf[p->pbufnext++] = 0;
1418  p->xbit.type = XBIT_pi;
1419  p->xbit.pi_chars = p->pbuf;
1420  Consume(p->pbuf);
1421 
1422  return 0;
1423 }
1424 
/*
 * Parse a quoted string (single or double quotes) into p->pbuf, leaving it
 * null-terminated there.
 *
 * `where' is text used in error messages.  `type' selects how the contents
 * are treated:
 *   - '\r', '\n' and '\t' are replaced by a space unless type is LT_plain
 *     or LT_entity, or NormaliseAttributeValues is off;
 *   - '%' starts a parameter entity reference only for LT_entity;
 *   - '&' starts a reference unless type is LT_plain or IgnoreEntities is
 *     set;
 *   - '<' is an error for LT_tok_attr / LT_cdata_attr under XMLMiscWFErrors;
 *   - for LT_tok_attr with NormaliseAttributeValues, leading and trailing
 *     spaces are removed and runs of spaces collapsed afterwards.
 *
 * `count' is the number of characters read from the current line that have
 * not yet been copied to pbuf.  The closing quote only counts when we are
 * back in the source the string started in.
 *
 * Returns 0 on success; errors are reported through error().
 */
static int parse_string(Parser p, const char8 *where, enum literal_type type)
{
    int c, quote;
    int count = 0;
    InputSource start_source, s;

    s = start_source = p->source;

    quote = get(s);
    if(quote != '\'' && quote != '"')
    {
        unget(s);               /* For error position */
        return error(p, "Expected quoted string %s, but got %s",
                     where, escape(quote));
    }

    p->pbufnext = 0;

    while(1)
    {
        switch(c = get(s))
        {
        case '\r':
        case '\n':
        case '\t':
            if(type == LT_plain || type == LT_entity ||
               !ParserGetFlag(p, NormaliseAttributeValues))
            {
                count++;
                break;
            }
            /* Flush what precedes the whitespace character (count+1 steps
               back over it), then store a single space in its place */
            if(count > 0)
            {
                require(transcribe(p, count+1, count));
            }
            count = 0;
            ExpandBuf(p->pbuf, p->pbufnext+1);
            p->pbuf[p->pbufnext++] = ' ';
            break;

        case '<':
            if((type == LT_tok_attr || type == LT_cdata_attr) &&
               ParserGetFlag(p, XMLMiscWFErrors))
                return error(p, "Illegal character '<' %s", where);
            count++;
            break;

        case XEOE:
            /* End of an entity: fine if it is one we entered inside the
               string, an error if it is the one the string started in */
            if(s == start_source)
            {
                return error(p, "Quoted string goes past entity end");
            }
            if(count > 0)
            {
                require(transcribe(p, count, count));
            }
            count = 0;
            ParserPop(p);
            s = p->source;
            break;

        case '%':
            if(type != LT_entity)
            {
                count++;
                break;
            }
            if(count > 0)
            {
                require(transcribe(p, count+1, count));
            }
            count = 0;
            if(p->external_pe_depth == 0)
            {
                unget(s);       /* For error position */
                return error(p, "PE ref not allowed here in internal subset");
            }
            require(parse_reference(p, 1, 1, 1));
            /* The reference may have pushed a new input source */
            s = p->source;
            break;

        case '&':
            if(ParserGetFlag(p, IgnoreEntities))
                goto deflt;
            if(type == LT_plain)
            {
                count++;
                break;
            }

            if(count > 0)
            {
                require(transcribe(p, count+1, count));
            }
            count = 0;
            if(looking_at(p, "#"))
                require(parse_character_reference(p,
                                ParserGetFlag(p, ExpandCharacterEntities)));
            else
            {
                /* General entities are not expanded inside entity values */
                require(parse_reference(p, 0,
                                        type != LT_entity &&
                                        ParserGetFlag(p, ExpandGeneralEntities),
                                        !ParserGetFlag(p, XMLMiscWFErrors)));
                s = p->source;
            }
            break;

        default:
        deflt:
            /* A matching quote ends the string only at the original level */
            if(c == quote && p->source == start_source)
                goto done;
            count++;
        }

        /* Copy pending characters before the source reads another line */
        if(at_eol(s) && count > 0)
        {
            require(transcribe(p, count, count));
            count = 0;
        }
    }

done:
    /* count+1 steps back over the closing quote, which is not copied */
    if(count > 0)
        require(transcribe(p, count+1, count));
    else
        ExpandBuf(p->pbuf, p->pbufnext+1);
    p->pbuf[p->pbufnext++] = 0;

    if(ParserGetFlag(p, NormaliseAttributeValues) && type == LT_tok_attr)
    {
        Char *old, *new;

        new = old = p->pbuf;

        /* Maybe skip leading whitespace */

        while(*old == ' ')
            old++;

        /* Translate whitespace to spaces, maybe compressing */

        for( ; *old; old++)
        {
            if(*old == ' ')
            {
                /* NB can't be at start because we skipped whitespace */
                if(type == LT_tok_attr && new[-1] == ' ')
                    ;
                else
                    *new++ = ' ';
            }
            else
                *new++ = *old;
        }

        /* Maybe trim trailing space (only one possible) */

        if(new > p->pbuf && new[-1] == ' ')
            new--;

        *new = 0;
    }

    return 0;
}
1591 
1592 static int parse_dtd(Parser p)
1593 {
1594  InputSource s = p->source;
1595  Entity parent = s->entity;
1596  Entity internal_part = 0, external_part = 0;
1597  Char *name;
1598  char8 *publicid = 0, *systemid = 0;
1599  struct xbit xbit;
1600 
1601  xbit = p->xbit; /* copy start position */
1602  xbit.type = XBIT_dtd;
1603 
1604  require(parse_name(p, "for name in dtd"));
1605  CopyName(name);
1606  maybe_uppercase(p, name);
1607 
1608  skip_whitespace(s);
1609 
1610  require(parse_external_id(p, 0, &publicid, &systemid,
1611  ParserGetFlag(p, XMLExternalIDs),
1612  ParserGetFlag(p, XMLExternalIDs)));
1613 
1614  if(systemid || publicid)
1615  {
1616  external_part = NewExternalEntity(0, publicid, systemid, 0, parent);
1617  if(!external_part)
1618  {
1619  Free(name);
1620  return error(p, "System error");
1621  }
1622  skip_whitespace(s);
1623  }
1624 
1625  if(looking_at(p, "["))
1626  {
1627  int line = s->line_number, cpos = s->next;
1628 
1629  require(read_markupdecls(p));
1630  skip_whitespace(s);
1631  internal_part = NewInternalEntity(0, p->pbuf, parent, line, cpos, 1);
1632  Consume(p->pbuf);
1633  if(!internal_part)
1634  {
1635  Free(name);
1636  FreeEntity(external_part);
1637  return error(p, "System error");
1638  }
1639  }
1640 
1641  require(expect(p, '>', "at end of dtd"));
1642 
1643  if(p->state == PS_prolog1)
1644  p->state = PS_prolog2;
1645  else
1646  {
1647  Free(name);
1648  FreeEntity(external_part);
1649  FreeEntity(internal_part);
1650 
1651  if(ParserGetFlag(p, XMLStrictWFErrors))
1652  return error(p, "Misplaced or repeated DOCTYPE declaration");
1653 
1654  warn(p, "Misplaced or repeated DOCTYPE declaration");
1655  /* Ignore it and return the next bit */
1656  return parse(p);
1657  }
1658 
1659  if(p->dtd->name)
1660  {
1661  Free(name);
1662  FreeEntity(external_part);
1663  FreeEntity(internal_part);
1664 
1665  /* This happens if we manually set the dtd */
1666  return parse(p);
1667  }
1668 
1669  p->dtd->name = name;
1670  p->dtd->internal_part = internal_part;
1671  p->dtd->external_part = external_part;
1672 
1673  if(ParserGetFlag(p, TrustSDD))
1674  {
1675  if(internal_part)
1676  {
1677  ParseDtd(p, internal_part);
1678  if(p->xbit.type == XBIT_error)
1679  return -1;
1680  }
1681  if(external_part && p->standalone != SDD_yes)
1682  {
1683  ParseDtd(p, external_part);
1684  if(p->xbit.type == XBIT_error)
1685  return -1;
1686  }
1687  }
1688 
1689  p->xbit = xbit;
1690  return 0;
1691 }
1692 
/*
 * Copy the text of an internal DTD subset into p->pbuf without parsing it.
 * Reads up to the ']' that brings the bracket depth (which starts at 1)
 * back to zero; that final ']' is consumed but not stored.  The buffer is
 * null-terminated on return.
 *
 * Brackets inside quoted strings and inside "--" ... "--" comment text are
 * not counted: those spans are skipped over (but still copied).
 *
 * `count' is the number of characters read from the current line that have
 * not yet been copied; they are transcribed at each end of line.
 *
 * Returns 0 on success; an entity end anywhere inside the subset is
 * reported through error().
 */
static int read_markupdecls(Parser p)
{
    InputSource s = p->source;
    int depth=1;
    int c, d, hyphens=0;
    int count = 0;

    p->pbufnext = 0;

    while(1)
    {
        c = get(s);
        if(c == XEOE)
            return error(p, "EOE in DTD");
        /* Track runs of consecutive hyphens to spot a comment opening */
        if(c == '-')
            hyphens++;
        else
            hyphens = 0;

        count++;

        switch(c)
        {
        case ']':
            if(--depth == 0)
            {
                count--;        /* We don't want the final ']' */
                require(transcribe(p, count+1, count));
                p->pbuf[p->pbufnext++] = 0;
                return 0;
            }
            break;

        case '[':
            depth++;
            break;

        case '"':
        case '\'':
            /* Skip to the matching quote character */
            while((d = get(s)) != XEOE)
            {
                count++;
                if(at_eol(s))
                {
                    require(transcribe(p, count, count));
                    count = 0;
                }
                if(d == c)
                    break;
            }
            if(d == XEOE)
                return error(p, "EOE in DTD");
            break;

        case '-':
            if(hyphens < 2)
                break;
            /* Seen "--": skip to the next "--" */
            hyphens = 0;
            while((d = get(s)) != XEOE)
            {
                count++;
                if(at_eol(s))
                {
                    require(transcribe(p, count, count));
                    count = 0;
                }
                if(d == '-')
                    hyphens++;
                else
                    hyphens = 0;
                if(hyphens == 2)
                    break;
            }
            if(d == XEOE)
                return error(p, "EOE in DTD");
            hyphens = 0;
            break;

        default:
            break;
        }

        /* Copy pending characters before the source reads another line */
        if(at_eol(s) && count > 0)
        {
            require(transcribe(p, count, count));
            count = 0;
        }
    }
}
1782 
1783 static int process_nsl_decl(Parser p)
1784 {
1785  InputSource s = p->source;
1786  int c, count = 0;
1787 
1788  s->entity->ml_decl = ML_nsl;
1789 
1790  /* The default character encoding for nSGML files is ascii-ash */
1791  if(s->entity->encoding == CE_UTF_8)
1792  s->entity->encoding = CE_unspecified_ascii_superset;
1793 
1794  /* Syntax is <?NSL DDB unquoted-filename 0> */
1795 
1796  if(!looking_at(p, "DDB "))
1797  return error(p, "Expected \"DDB\" in NSL declaration");
1798 
1799  while(c = get(s), !is_xml_whitespace(c))
1800  switch(c)
1801  {
1802  case XEOE:
1803  return error(p, "EOE in NSL declaration");
1804 
1805  case '>':
1806  return error(p, "Syntax error in NSL declaration");
1807 
1808  default:
1809  count++;
1810  }
1811 
1812  p->pbufnext = 0;
1813  require(transcribe(p, count+1, count));
1814  p->pbuf[p->pbufnext++] = 0;
1815 
1816  skip_whitespace(s);
1817  if(!looking_at(p, "0>"))
1818  return error(p, "Expected \"0>\" at end of NSL declaration");
1819 
1820  if(!(s->entity->ddb_filename = strdup8(Chartochar8(p->pbuf))))
1821  return error(p, "System error");
1822 
1823  return 0;
1824 }
1825 
1826 static int process_xml_decl(Parser p)
1827 {
1828  InputSource s = p->source;
1829  enum {None, V, E, S} which, last = None;
1830  Char *Value, *cp;
1831  char8 *value;
1832  CharacterEncoding enc = CE_unknown;
1833  Char c;
1834 
1835  s->entity->ml_decl = ML_xml;
1836 
1837  /* XXX Should save the string buffer because it may already be in use */
1838 
1839  while(!looking_at(p, "?>"))
1840  {
1841  if(looking_at(p, "version"))
1842  which = V;
1843  else if(looking_at(p, "encoding"))
1844  which = E;
1845  else if(looking_at(p, "standalone"))
1846  which = S;
1847  else
1848  return error(p, "Expected \"version\", \"encoding\" or "
1849  "\"standalone\" in XML declaration");
1850 
1851  if(which <= last)
1852  {
1853  if(ParserGetFlag(p, XMLStrictWFErrors))
1854  return error(p, "Repeated or misordered attributes "
1855  "in XML declaration");
1856  warn(p, "Repeated or misordered attributes in XML declaration");
1857  }
1858  last = which;
1859 
1860  skip_whitespace(s);
1861  require(expect(p, '=', "after attribute name in XML declaration"));
1862  skip_whitespace(s);
1863 
1864  require(parse_string(p, "for attribute value in XML declaration",
1865  LT_plain));
1866 
1867  maybe_uppercase(p, p->pbuf);
1868  Value = p->pbuf;
1869 
1870  if(which == E)
1871  {
1872  if(!is_ascii_alpha(Value[0]))
1873  return error(p, "Encoding name does not begin with letter");
1874  for(cp=Value+1; *cp; cp++)
1875  if(!is_ascii_alpha(*cp) && !is_ascii_digit(*cp) &&
1876  *cp != '.' && *cp != '_' && *cp != '-')
1877  return error(p, "Illegal character %s in encoding name",
1878  escape(*cp));
1879 
1880  value = Chartochar8(Value);
1881 
1882  enc = FindEncoding(value);
1883  if(enc == CE_unknown)
1884  return error(p, "Unknown declared encoding %s", value);
1885 
1886  if(EncodingsCompatible(p->source->entity->encoding, enc, &enc))
1887  {
1888 #if CHAR_SIZE == 8
1889  /* We ignore the declared encoding in 8-bit mode,
1890  and treat it as a random ascii superset. */
1891 #else
1892  p->source->entity->encoding = enc;
1893 #endif
1894  }
1895  else
1896  return error(p, "Declared encoding %s is incompatible with %s "
1897  "which was used to read it",
1898  CharacterEncodingName[enc],
1899  CharacterEncodingName[p->source->entity->encoding]);
1900 
1901  s->entity->encoding_decl = enc;
1902  }
1903 
1904  if(which == S)
1905  {
1906  value = Chartochar8(Value);
1907 
1908  if(str_maybecase_cmp8(p, value, "no") == 0)
1909  p->standalone = SDD_no;
1910  else if(str_maybecase_cmp8(p, value, "yes") == 0)
1911  p->standalone = SDD_yes;
1912  else
1913  return error(p, "Expected \"yes\" or \"no\" "
1914  "for standalone in XML declaration");
1915 
1916  s->entity->standalone_decl = p->standalone;
1917  }
1918 
1919  if(which == V)
1920  {
1921  for(cp=Value; *cp; cp++)
1922  if(!is_ascii_alpha(*cp) && !is_ascii_digit(*cp) &&
1923  *cp != '.' && *cp != '_' && *cp != '-' && *cp != ':')
1924  return error(p, "Illegal character %s in version number",
1925  escape(*cp));
1926 
1927  if(!s->entity->version_decl)
1928  if(!(s->entity->version_decl = strdup8(Chartochar8(Value))))
1929  return error(p, "System error");
1930  }
1931 
1932  c = get(s);
1933  if(c == '?')
1934  unget(s);
1935  else if(!is_xml_whitespace(c))
1936  return error(p, "Expected whitespace or \"?>\" after attribute "
1937  "in XML declaration");
1938  skip_whitespace(s);
1939  }
1940  return 0;
1941 }
1942 
1943 static int parse_cdata(Parser p)
1944 {
1945  InputSource s = p->source;
1946  int c, c1=0, c2=0;
1947  int count = 0;
1948 
1949  if(p->state <= PS_prolog2)
1950  return error(p, "Cdata section not allowed in prolog");
1951  if(p->state == PS_epilog)
1952  return error(p, "Cdata section not allowed after body");
1953 
1954  p->pbufnext = 0;
1955 
1956  while((c = get(s)) != XEOE)
1957  {
1958  count++;
1959  if(c == '>' && c1 == ']' && c2 == ']')
1960  break;
1961  if(at_eol(s))
1962  {
1963  require(transcribe(p, count, count));
1964  count = 0;
1965  }
1966  c2 = c1; c1 = c;
1967  }
1968 
1969  if(c == XEOE)
1970  return error(p, "EOE in CData section");
1971 
1972  require(transcribe(p, count, count-3));
1973  p->pbuf[p->pbufnext++] = 0;
1974  p->xbit.type = XBIT_cdsect;
1975  p->xbit.cdsect_chars = p->pbuf;
1976  Consume(p->pbuf);
1977 
1978  return 0;
1979 }
1980 
1981 XBit ParseDtd(Parser p, Entity e)
1982 {
1983  InputSource source, save;
1984 
1985  if(e->type == ET_external && p->entity_opener)
1986  source = p->entity_opener(e, p->callback_arg);
1987  else
1988  source = EntityOpen(e);
1989  if(!source)
1990  {
1991  error(p, "Couldn't open dtd entity %s", EntityDescription(e));
1992  return &p->xbit;
1993  }
1994 
1995  save = p->source;
1996  p->source = 0;
1997  if(ParserPush(p, source) == -1)
1998  return &p->xbit;
1999 
2000  p->have_dtd = 1;
2001 
2002  p->external_pe_depth = (source->entity->type == ET_external);
2003 
2004  while(parse_markupdecl(p) == 0)
2005  ;
2006 
2007  p->external_pe_depth = 0;
2008 
2009  /* don't restore after error, so user can call ParserPerror */
2010  if(p->xbit.type != XBIT_error)
2011  {
2012  ParserPop(p); /* to free the input source */
2013  p->source = save;
2014  }
2015 
2016  return &p->xbit;
2017 }
2018 
2019 /*
2020  * Returns 0 normally, -1 if error, 1 at EOF.
2021  */
2022 static int parse_markupdecl(Parser p)
2023 {
2024  InputSource s;
2025  int c;
2026  int cur_line, cur_char;
2027  Entity cur_ent;
2028 
2029  if(p->state == PS_error)
2030  return error(p, "Attempt to continue reading DTD after error");
2031 
2032  clear_xbit(&p->xbit);
2033 
2034  require(skip_dtd_whitespace(p, 1)); /* allow PE even in internal subset */
2035  s = p->source;
2036  SourcePosition(s, &p->xbit.entity, &p->xbit.byte_offset);
2037 
2038  cur_ent = s->entity;
2039  cur_line = s->line_number;
2040  cur_char = s->next;
2041 
2042  c = get(s);
2043  switch(c)
2044  {
2045  case XEOE:
2046  p->xbit.type = XBIT_none;
2047  return 1;
2048  case '<':
2049  if(looking_at(p, "!ELEMENT"))
2050  {
2051  require(expect_dtd_whitespace(p, "after ELEMENT"));
2052  return parse_element_decl(p);
2053  }
2054  else if(looking_at(p, "!ATTLIST"))
2055  {
2056  require(expect_dtd_whitespace(p, "after ATTLIST"));
2057  return parse_attlist_decl(p);
2058  }
2059  else if(looking_at(p, "!ENTITY"))
2060  {
2061  require(expect_dtd_whitespace(p, "after ENTITY"));
2062  return parse_entity_decl(p, cur_ent, cur_line, cur_char);
2063  }
2064  else if(looking_at(p, "!NOTATION"))
2065  {
2066  require(expect_dtd_whitespace(p, "after NOTATION"));
2067  return parse_notation_decl(p);
2068  }
2069  else if(looking_at(p, "!["))
2070  return parse_conditional(p);
2071  else if(looking_at(p, "?"))
2072  {
2073  require(parse_pi(p));
2074  if(p->dtd_callback)
2075  p->dtd_callback(&p->xbit, p->callback_arg);
2076  else
2077  FreeXBit(&p->xbit);
2078  return 0;
2079  }
2080  else if(looking_at(p, "!--"))
2081  {
2082  if(ParserGetFlag(p, ReturnComments))
2083  {
2084  require(parse_comment(p, 0));
2085  if(p->dtd_callback)
2086  p->dtd_callback(&p->xbit, p->callback_arg);
2087  else
2088  FreeXBit(&p->xbit);
2089  return 0;
2090  }
2091  else
2092  return parse_comment(p, 1);
2093  }
2094  else
2095  return error(p, "Syntax error after < in dtd");
2096  default:
2097  unget(s); /* For error position */
2098  return error(p, "Expected \"<\" in dtd, but got %s", escape(c));
2099  }
2100 }
2101 
2102 static int parse_reference(Parser p, int pe, int expand, int allow_external)
2103 {
2104  Entity e;
2105  InputSource s;
2106 
2107  require(parse_name(p, pe ? "for parameter entity" : "for entity"));
2108  require(expect(p, ';', "after entity name"));
2109 
2110  if(!expand)
2111  return transcribe(p, 1 + p->namelen + 1, 1 + p->namelen + 1);
2112 
2113  e = FindEntityN(p->dtd, p->name, p->namelen, pe);
2114  if(!e)
2115  {
2116  Char *buf;
2117  Char *q;
2118  int i;
2119 
2120  if(pe || ParserGetFlag(p, ErrorOnUndefinedEntities))
2121  return error(p, "Undefined%s entity %.*S",
2122  pe ? " parameter" : "" ,
2123  p->namelen > 50 ? 50 : p->namelen, p->name);
2124 
2125  warn(p, "Undefined%s entity %.*S",
2126  pe ? " parameter" : "",
2127  p->namelen > 50 ? 50 : p->namelen, p->name);
2128 
2129  /* Fake a definition for it */
2130 
2131  buf = Malloc((5 + p->namelen + 1 + 1) * sizeof(Char));
2132  if(!buf)
2133  return error(p, "System error");
2134  q = buf;
2135  *q++ = '&'; *q++ = '#'; *q++ = '3'; *q++ = '8'; *q++ = ';';
2136  for(i=0; i<p->namelen; i++)
2137  *q++ = p->name[i];
2138  *q++ = ';';
2139  *q++ = 0;
2140 
2141  if(!(e = NewInternalEntityN(p->name, p->namelen, buf, 0, 0, 0, 0)))
2142  return error(p, "System error");
2143  if(!DefineEntity(p->dtd, e, 0))
2144  return error(p, "System error");
2145  }
2146 
2147  if(!allow_external && e->type == ET_external)
2148  return error(p, "Illegal reference to external entity");
2149 
2150  for(s = p->source; s; s = s->parent)
2151  if(s->entity == e)
2152  return error(p, "Recursive reference to entity \"%S\"", e->name);
2153 
2154  if(e->type == ET_external && p->entity_opener)
2155  s = p->entity_opener(e, p->callback_arg);
2156  else
2157  s = EntityOpen(e);
2158  if(!s)
2159  return error(p, "Couldn't open entity %S, %s",
2160  e->name, EntityDescription(e));
2161 
2162  require(ParserPush(p, s));
2163 
2164  return 0;
2165 }
2166 
2167 static int parse_character_reference(Parser p, int expand)
2168 {
2169  InputSource s = p->source;
2170  int c, base = 10;
2171  int count = 0;
2172  unsigned int code = 0;
2173  Char *ch = s->line + s->next;
2174 
2175  if(looking_at(p, "x"))
2176  {
2177  ch++;
2178  base = 16;
2179  }
2180 
2181  while((c = get(s)) != ';')
2182  {
2183  if((c >= '0' && c <= '9') ||
2184  (base == 16 && ((c >= 'A' && c <= 'F') ||
2185  (c >= 'a' && c <= 'f'))))
2186  count++;
2187  else
2188  {
2189  unget(s); /* For error position */
2190  return error(p,
2191  "Illegal character %s in base-%d character reference",
2192  escape(c), base);
2193  }
2194  }
2195 
2196  if(!expand)
2197  return transcribe(p, 2 + (base == 16) + count + 1,
2198  2 + (base == 16) + count + 1);
2199 
2200  while(count-- > 0)
2201  {
2202  c = *ch++;
2203  if(c >= '0' && c <= '9')
2204  code = code * base + (c - '0');
2205  else if(c >= 'A' && c <= 'F')
2206  code = code * base + 10 + (c - 'A');
2207  else
2208  code = code * base + 10 + (c - 'a');
2209  }
2210 
2211 #if CHAR_SIZE == 8
2212  if(code > 255 || !is_xml_legal(code))
2213  {
2214  if(ParserGetFlag(p, ErrorOnBadCharacterEntities))
2215  return error(p, "0x%x is not a valid 8-bit XML character", code);
2216  else
2217  warn(p, "0x%x is not a valid 8-bit XML character; ignored", code);
2218  return 0;
2219  }
2220 #else
2221  if(!is_xml_legal(code))
2222  {
2223  if(ParserGetFlag(p, ErrorOnBadCharacterEntities))
2224  return error(p, "0x%x is not a valid UTF-16 XML character", code);
2225  else
2226  warn(p, "0x%x is not a valid UTF-16 XML character; ignored", code);
2227  return 0;
2228  }
2229 
2230  if(code >= 0x10000)
2231  {
2232  /* Use surrogates */
2233 
2234  ExpandBuf(p->pbuf, p->pbufnext+2);
2235  code -= 0x10000;
2236 
2237  p->pbuf[p->pbufnext++] = (code >> 10) + 0xd800;
2238  p->pbuf[p->pbufnext++] = (code & 0x3ff) + 0xdc00;
2239 
2240  return 0;
2241  }
2242 #endif
2243 
2244  ExpandBuf(p->pbuf, p->pbufnext+1);
2245  p->pbuf[p->pbufnext++] = code;
2246 
2247  return 0;
2248 }
2249 
2250 /* Called after reading '<!ELEMENT ' */
2251 
2252 static int parse_element_decl(Parser p)
2253 {
2254  Char *name;
2255  ContentType type;
2256  ElementDefinition def;
2257 #if 1
2258  ContentParticle cp;
2259 #else
2260  int c;
2261  Char pcdata[] = {'#','P','C','D','A','T','A',0};
2262 #endif
2263  Char *content = 0;
2264 
2265  require(parse_name(p, "for name in element declaration"));
2266  CopyName(name);
2267  maybe_uppercase(p, name);
2268 
2269  require(expect_dtd_whitespace(p, "after name in element declaration"));
2270 
2271  if(looking_at(p, "EMPTY"))
2272  {
2273  type = CT_empty;
2274  content = 0;
2275  }
2276  else if(looking_at(p, "ANY"))
2277  {
2278  type = CT_any;
2279  content = 0;
2280  }
2281  else
2282 #if 1
2283  if(looking_at(p, "("))
2284  {
2285  unget(p->source);
2286  if(!(cp = parse_cp(p)) ||
2287  check_content_decl(p, cp) < 0 ||
2288  !(content = stringify_cp(cp)))
2289  {
2290  FreeContentParticle(cp);
2291  Free(content);
2292  Free(name);
2293  return -1;
2294  }
2295 
2296  if(cp->type == CP_choice && cp->children[0]->type == CP_pcdata)
2297  type = CT_mixed;
2298  else
2299  type = CT_element;
2300  {
2301  }
2302  FreeContentParticle(cp); /* XXX */
2303  }
2304  else
2305  {
2306  Free(name);
2307  return error(p, "Expected \"EMPTY\", \"ANY\", or \"(\" after name in "
2308  "element declaration");
2309  }
2310 #else
2311  {
2312  /* Don't really parse here... maybe improve sometime */
2313 
2314  int count = 0;
2315 
2316  p->pbufnext = 0;
2317 
2318  while((c = get(p->source)) != '>')
2319  {
2320  switch(c)
2321  {
2322  case XEOE:
2323  if(count > 0)
2324  require(transcribe(p, count, count));
2325  if(!p->source->parent)
2326  return error(p, "EOE in element declaration");
2327  ParserPop(p);
2328  count = 0;
2329  break;
2330  case '%':
2331  if(count > 0)
2332  require(transcribe(p, count+1, count));
2333  if(p->external_pe_depth == 0)
2334  {
2335  unget(p->source); /* For error position */
2336  return error(p,
2337  "PE ref not allowed here in internal subset");
2338  }
2339  require(parse_reference(p, 1, 1, 1));
2340  count = 0;
2341  break;
2342  default:
2343  count++;
2344  if(at_eol(p->source))
2345  {
2346  require(transcribe(p, count, count));
2347  count = 0;
2348  }
2349  }
2350  }
2351 
2352  unget(p->source);
2353  require(transcribe(p, count, count));
2354  p->pbuf[p->pbufnext++] = 0;
2355 
2356  if(Strstr(p->pbuf, pcdata))
2357  type = CT_mixed;
2358  else
2359  type = CT_element;
2360 
2361  content = p->pbuf;
2362  Consume(p->pbuf);
2363  }
2364 #endif
2365  require(skip_dtd_whitespace(p, p->external_pe_depth > 0));
2366  require(expect(p, '>', "at end of element declaration"));
2367 
2368  if((def = FindElement(p->dtd, name)))
2369  {
2370  if(def->tentative)
2371  RedefineElement(def, type, content);
2372  else
2373  {
2374  Free(content);
2375  if(ParserGetFlag(p, WarnOnRedefinitions))
2376  warn(p, "Ignoring redeclaration of element %S", name);
2377  }
2378  }
2379  else
2380  if (!DefineElement(p->dtd, name, type, content)) {
2381  return error(p, "System error");
2382  };
2383 
2384  Free(name);
2385 
2386  return 0;
2387 }
2388 
2389 /* Content model parsing */
2390 
2391 static ContentParticle parse_cp(Parser p)
2392 {
2393  ContentParticle cp;
2394 
2395  if(looking_at(p, "("))
2396  {
2397  if(!(cp = parse_choice_or_seq(p)))
2398  return 0;
2399  }
2400  else if(looking_at(p, "#PCDATA"))
2401  {
2402  if(!(cp = Malloc(sizeof(*cp))))
2403  {
2404  error(p, "System error");
2405  return 0;
2406  }
2407 
2408  cp->type = CP_pcdata;
2409  }
2410  else
2411  {
2412  if(parse_name(p, "in content declaration") < 0)
2413  return 0;
2414 
2415  if(!(cp = Malloc(sizeof(*cp))))
2416  {
2417  error(p, "System error");
2418  return 0;
2419  }
2420 
2421  cp->type = CP_name;
2422  CopyName0(cp->name);
2423  }
2424 
2425  if(looking_at(p, "*"))
2426  cp->repetition = '*';
2427  else if(looking_at(p, "+"))
2428  cp->repetition = '+';
2429  else if(looking_at(p, "?"))
2430  cp->repetition = '?';
2431  else
2432  cp->repetition = 0;
2433 
2434  return cp;
2435 }
2436 
2437 /* Called after '(' */
2438 
2439 static ContentParticle parse_choice_or_seq(Parser p)
2440 {
2441  ContentParticle cp, cp1;
2442 
2443 
2444  require0(skip_dtd_whitespace(p, p->external_pe_depth > 0));
2445 
2446  if(!(cp1 = parse_cp(p)))
2447  return 0;
2448 
2449  require0(skip_dtd_whitespace(p, p->external_pe_depth > 0));
2450 
2451  if(!(cp = parse_choice_or_seq_1(p, 1, 0)))
2452  FreeContentParticle(cp1);
2453  else
2454  cp->children[0] = cp1;
2455 
2456  return cp;
2457 }
2458 
2459 /* Called before '|', ',', or ')' */
2460 
2461 static ContentParticle parse_choice_or_seq_1(Parser p, int nchildren, char sep)
2462 {
2463  ContentParticle cp = 0, cp1;
2464  int nsep = get(p->source);
2465 
2466  if(nsep == ')')
2467  {
2468  /* We've reached the end */
2469 
2470  if(!(cp = Malloc(sizeof(*cp))) ||
2471  !(cp->children = Malloc(nchildren * sizeof(cp))))
2472  {
2473  Free(cp);
2474  error(p, "System error");
2475  return 0;
2476  }
2477 
2478  /* The standard does not specify whether '(foo)' is a choice or a
2479  sequence. We make it a choice so that (#PCDATA) comes out as
2480  a choice, like other mixed models. */
2481 
2482  cp->type = sep == ',' ? CP_seq : CP_choice;
2483  cp->nchildren = nchildren;
2484 
2485  return cp;
2486  }
2487 
2488  if(nsep != '|' && nsep != ',')
2489  {
2490  error(p, "Expected | or , or ) in content declaration, got %s",
2491  escape(nsep));
2492  return 0;
2493  }
2494 
2495  if(sep && nsep != sep)
2496  {
2497  error(p, "Content particle contains both | and ,");
2498  return 0;
2499  }
2500 
2501  require0(skip_dtd_whitespace(p, p->external_pe_depth > 0));
2502 
2503  if(!(cp1 = parse_cp(p)))
2504  return 0;
2505 
2506  require0(skip_dtd_whitespace(p, p->external_pe_depth > 0));
2507 
2508  if(!(cp = parse_choice_or_seq_1(p, nchildren+1, (char)nsep)))
2509  FreeContentParticle(cp1);
2510  else
2511  cp->children[nchildren] = cp1;
2512 
2513  return cp;
2514 }
2515 
2516 /* Check content particle matches Mixed or children */
2517 
2518 static int check_content_decl(Parser p, ContentParticle cp)
2519 {
2520  int i;
2521 
2522  if(cp->type == CP_choice && cp->children[0]->type == CP_pcdata)
2523  {
2524  for(i=1; i<cp->nchildren; i++)
2525  if(cp->children[i]->type != CP_name)
2526  return error(p, "Invalid mixed content declaration");
2527 
2528  if(cp->repetition != '*' &&
2529  !(cp->nchildren == 1 && cp->repetition == 0))
2530  return error(p, "Invalid mixed content declaration");
2531 
2532  return 0;
2533  }
2534  else
2535  return check_content_decl_1(p, cp);
2536 }
2537 
2538 static int check_content_decl_1(Parser p, ContentParticle cp)
2539 {
2540  int i;
2541 
2542  switch(cp->type)
2543  {
2544  case CP_pcdata:
2545  return error(p, "Misplaced #PCDATA in content declaration");
2546  case CP_seq:
2547  case CP_choice:
2548  for(i=0; i<cp->nchildren; i++)
2549  if(check_content_decl_1(p, cp->children[i]) < 0)
2550  return -1;
2551  return 0;
2552  default:
2553  return 0;
2554  }
2555 }
2556 
2557 /* Reconstruct the content model as a string */
2558 
2559 static Char *stringify_cp(ContentParticle cp)
2560 {
2561  int size = size_cp(cp);
2562  Char *s;
2563  FILE16 *f;
2564 
2565  if(!(s = Malloc((size+1) * sizeof(Char))) ||
2566  !(f = MakeFILE16FromString(s, (size + 1) * sizeof(Char), "w")))
2567  {
2568  Free(s);
2569  return 0;
2570  }
2571 
2572  print_cp(cp, f);
2573  s[size] = 0;
2574 
2575  Fclose(f);
2576 
2577  return s;
2578 }
2579 
2580 static void print_cp(ContentParticle cp, FILE16 *f)
2581 {
2582  int i;
2583 
2584  switch(cp->type)
2585  {
2586  case CP_pcdata:
2587  Fprintf(f, "#PCDATA");
2588  break;
2589  case CP_name:
2590  Fprintf(f, "%S", cp->name);
2591  break;
2592  case CP_seq:
2593  case CP_choice:
2594  Fprintf(f, "(");
2595  for(i=0; i<cp->nchildren; i++)
2596  {
2597  if(i != 0)
2598  Fprintf(f, cp->type == CP_seq ? "," : "|");
2599  print_cp(cp->children[i], f);
2600  }
2601  Fprintf(f, ")");
2602  break;
2603  }
2604 
2605  if(cp->repetition)
2606  Fprintf(f, "%c", cp->repetition);
2607 }
2608 
2609 static int size_cp(ContentParticle cp)
2610 {
2611  int i, s;
2612 
2613  switch(cp->type)
2614  {
2615  case CP_pcdata:
2616  s = 7;
2617  break;
2618  case CP_name:
2619  s = Strlen(cp->name);
2620  break;
2621  default:
2622  s = 2;
2623  for(i=0; i<cp->nchildren; i++)
2624  {
2625  if(i != 0)
2626  s++;
2627  s += size_cp(cp->children[i]);
2628  }
2629  break;
2630  }
2631 
2632  if(cp->repetition)
2633  s++;
2634 
2635  return s;
2636 }
2637 
2638 void FreeContentParticle(ContentParticle cp)
2639 {
2640  int i;
2641 
2642  if(!cp)
2643  return;
2644 
2645  switch(cp->type)
2646  {
2647  case CP_pcdata:
2648  break;
2649  case CP_name:
2650  Free(cp->name);
2651  break;
2652  case CP_seq:
2653  case CP_choice:
2654  for(i=0; i<cp->nchildren; i++)
2655  FreeContentParticle(cp->children[i]);
2656  Free(cp->children);
2657  break;
2658  }
2659 
2660  Free(cp);
2661 }
2662 
2663 /* Called after reading '<!ATTLIST ' */
2664 
2665 static int parse_attlist_decl(Parser p)
2666 {
2667  Char *name;
2668  ElementDefinition element;
2669  AttributeType type;
2670  DefaultType default_type;
2671  Char **allowed_values, *t;
2672  Char *default_value;
2673  int nvalues, i;
2674 
2675  require(parse_name(p, "for name in attlist declaration"));
2676  CopyName(name);
2677  maybe_uppercase(p, name);
2678 
2679  if(!(element = FindElement(p->dtd, name)))
2680  {
2681  if(!(element = TentativelyDefineElement(p->dtd, name)))
2682  return error(p, "System error");
2683  }
2684  Free(name);
2685 
2686  require(expect_dtd_whitespace(p,
2687  "after element name in attlist declaration"));
2688 
2689  while(!looking_at(p, ">"))
2690  {
2691  require(skip_dtd_whitespace(p, p->external_pe_depth > 0));
2692  require(parse_name(p, "for attribute in attlist declaration"));
2693  CopyName(name);
2694  maybe_uppercase(p, name);
2695 
2696  require(expect_dtd_whitespace(p, "after name in attlist declaration"));
2697 
2698  if(looking_at(p, "CDATA"))
2699  type = AT_cdata;
2700  else if(looking_at(p, "IDREFS"))
2701  type = AT_idrefs;
2702  else if(looking_at(p, "IDREF"))
2703  type = AT_idref;
2704  else if(looking_at(p, "ID"))
2705  type = AT_id;
2706  else if(looking_at(p, "ENTITIES"))
2707  type = AT_entities;
2708  else if(looking_at(p, "ENTITY"))
2709  type = AT_entity;
2710  else if(looking_at(p, "NMTOKENS"))
2711  type = AT_nmtokens;
2712  else if(looking_at(p, "NMTOKEN"))
2713  type = AT_nmtoken;
2714  else if(looking_at(p, "NOTATION"))
2715  type = AT_notation;
2716  else
2717  type = AT_enumeration;
2718 
2719  if(type != AT_enumeration)
2720  {
2721  require(expect_dtd_whitespace(p, "after attribute type"));
2722  }
2723 
2724  if(type == AT_notation || type == AT_enumeration)
2725  {
2726  require(expect(p, '(',
2727  "or keyword for type in attlist declaration"));
2728 
2729  nvalues = 0;
2730  p->pbufnext = 0;
2731  do
2732  {
2733  require(skip_dtd_whitespace(p, p->external_pe_depth > 0));
2734  if(type == AT_notation)
2735  require(parse_name(p,
2736  "for notation value in attlist declaration"));
2737  else
2738  require(parse_nmtoken(p,
2739  "for enumerated value in attlist declaration"));
2740  maybe_uppercase_name(p);
2741  ExpandBuf(p->pbuf, p->pbufnext + p->namelen + 1);
2742  memcpy(p->pbuf+p->pbufnext,
2743  p->name,
2744  p->namelen * sizeof(Char));
2745  p->pbuf[p->pbufnext + p->namelen] = 0;
2746  p->pbufnext += (p->namelen + 1);
2747  nvalues++;
2748  require(skip_dtd_whitespace(p, p->external_pe_depth > 0));
2749  }
2750  while(looking_at(p, "|"));
2751 
2752  require(expect(p, ')',
2753  "at end of enumerated value list in attlist declaration"));
2754  require(expect_dtd_whitespace(p, "after enumerated value list "
2755  "in attlist declaration"));
2756 
2757  allowed_values = Malloc((nvalues+1)*sizeof(Char *));
2758  if(!allowed_values)
2759  return error(p, "System error");
2760  for(i=0, t=p->pbuf; i<nvalues; i++)
2761  {
2762  allowed_values[i] = t;
2763  while(*t++)
2764  ;
2765  }
2766  allowed_values[nvalues] = 0;
2767 
2768  Consume(p->pbuf);
2769  }
2770  else
2771  allowed_values = 0;
2772 
2773  if(looking_at(p, "#REQUIRED"))
2774  default_type = DT_required;
2775  else if(looking_at(p, "#IMPLIED"))
2776  default_type = DT_implied;
2777  else if(looking_at(p, "#FIXED"))
2778  {
2779  default_type = DT_fixed;
2780  require(expect_dtd_whitespace(p, "after #FIXED"));
2781  }
2782  else
2783  default_type = DT_none;
2784 
2785  if(default_type == DT_fixed || default_type == DT_none)
2786  {
2787  require(parse_string(p,
2788  "for default value in attlist declaration",
2789  type == AT_cdata ? LT_cdata_attr :
2790  LT_tok_attr));
2791  default_value = p->pbuf;
2792  Consume(p->pbuf);
2793  if(type != AT_cdata && type != AT_entity && type != AT_entities)
2794  maybe_uppercase(p, default_value);
2795  }
2796  else
2797  default_value = 0;
2798 
2799  require(skip_dtd_whitespace(p, p->external_pe_depth > 0));
2800 
2801  if(FindAttribute(element, name))
2802  {
2803  if(ParserGetFlag(p, WarnOnRedefinitions))
2804  warn(p, "Ignoring redeclaration of attribute %S", name);
2805  if(allowed_values)
2806  {
2807  Free(allowed_values[0]);
2808  Free(allowed_values);
2809  }
2810  if(default_value)
2811  Free(default_value);
2812  }
2813  else
2814  if(!DefineAttribute(element, name, type, allowed_values,
2815  default_type, default_value))
2816  return error(p, "System error");
2817 
2818  Free(name);
2819  }
2820 
2821  return 0;
2822 }
2823 
2824 /* Used for external dtd part, entity definitions and notation definitions. */
2825 /* NB PE references are not allowed here (why not?) */
2826 
2827 static int parse_external_id(Parser p, int required,
2828  char8 **publicid, char8 **systemid,
2829  int preq, int sreq)
2830 {
2831  InputSource s = p->source;
2832  int c;
2833  Char *cp;
2834 
2835  *publicid = 0;
2836  *systemid = 0;
2837 
2838  if(looking_at(p, "SYSTEM"))
2839  {
2840  if(!sreq)
2841  {
2842  skip_whitespace(s);
2843  c = get(s); unget(s);
2844  if(c != '"' && c != '\'')
2845  return 0;
2846  }
2847  else
2848  require(expect_dtd_whitespace(p, "after SYSTEM"));
2849 
2850  require(parse_string(p, "for system ID", LT_plain));
2851  if(!(*systemid = strdup8(Chartochar8(p->pbuf))))
2852  return error(p, "System error");
2853  }
2854  else if(looking_at(p, "PUBLIC"))
2855  {
2856  if(!preq && !sreq)
2857  {
2858  skip_whitespace(s);
2859  c = get(s); unget(s);
2860  if(c != '"' && c != '\'')
2861  return 0;
2862  }
2863  else
2864  require(expect_dtd_whitespace(p, "after PUBLIC"));
2865 
2866  require(parse_string(p, "for public ID", LT_plain));
2867 
2868  for(cp=p->pbuf; *cp; cp++)
2869  if(!is_ascii_alpha(*cp) && !is_ascii_digit(*cp) &&
2870  strchr8("-'()+,./:=?;!*#@$_% \r\n", *cp) == 0)
2871  return error(p, "Illegal character %s in public id",
2872  escape(*cp));
2873 
2874  if(!(*publicid = strdup8(Chartochar8(p->pbuf))))
2875  return error(p, "System error");
2876 
2877  if(!sreq)
2878  {
2879  skip_whitespace(s);
2880  c = get(s); unget(s);
2881  if(c != '"' && c != '\'')
2882  return 0;
2883  }
2884  else
2885  require(expect_dtd_whitespace(p, "after public id"));
2886 
2887  require(parse_string(p, "for system ID", LT_plain));
2888  if(!(*systemid = strdup8(Chartochar8(p->pbuf))))
2889  return error(p, "System error");
2890  }
2891  else if(required)
2892  return error(p, "Missing or invalid external ID");
2893 
2894  return 0;
2895 }
2896 
2897 /* Called after reading '<!ENTITY ' */
2898 
2899 static int parse_entity_decl(Parser p, Entity ent, int line, int chpos)
2900 {
2901  Entity e, old;
2902  int pe, t;
2903  Char *name;
2904 
2905  pe = looking_at(p, "%"); /* If it were a PE ref, we would
2906  already have pushed it */
2907 
2908  require(skip_dtd_whitespace(p, p->external_pe_depth > 0));
2909  require(parse_name(p, "for name in entity declaration"));
2910  CopyName(name);
2911 
2912  require(expect_dtd_whitespace(p, "after name in entity declaration"));
2913 
2914  if(looking_at(p, "'") || looking_at(p, "\""))
2915  {
2916  Char *value;
2917 
2918  unget(p->source);
2919  require(parse_string(p, "for value in entity declaration", LT_entity));
2920  value = p->pbuf;
2921  Consume(p->pbuf);
2922 
2923  if(!(e = NewInternalEntity(name, value, ent, line, chpos, 0)))
2924  return error(p, "System error");
2925  }
2926  else
2927  {
2928  char8 *publicid, *systemid;
2929  NotationDefinition notation = 0;
2930 
2931  require(parse_external_id(p, 1, &publicid, &systemid, 1, 1));
2932 
2933  require((t = skip_dtd_whitespace(p, p->external_pe_depth > 0)));
2934  if(looking_at(p, "NDATA"))
2935  {
2936  if(t == 0)
2937  return error(p, "Whitespace missing before NDATA");
2938  if(pe)
2939  return error(p, "NDATA not allowed for parameter entity");
2940  require(expect_dtd_whitespace(p, "after NDATA"));
2941  require(parse_name(p, "for notation name in entity declaration"));
2942  maybe_uppercase_name(p);
2943  notation = FindNotationN(p->dtd, p->name, p->namelen);
2944  if(!notation)
2945  {
2946  notation =
2947  TentativelyDefineNotationN(p->dtd, p->name, p->namelen);
2948  if(!notation)
2949  return error(p, "System error");
2950  }
2951  }
2952 
2953  if(!(e = NewExternalEntity(name, publicid, systemid, notation, ent)))
2954  return error(p, "System error");
2955  }
2956 
2957  Free(name);
2958 
2959  require(skip_dtd_whitespace(p, p->external_pe_depth > 0));
2960  require(expect(p, '>', "at end of entity declaration"));
2961 
2962  if((old = FindEntity(p->dtd, e->name, pe)) &&
2963  old->parent != xml_builtin_entity)
2964  {
2965  if(ParserGetFlag(p, WarnOnRedefinitions))
2966  warn(p, "Ignoring redefinition of%s entity %S",
2967  pe ? " parameter" : "", e->name);
2968  }
2969  else
2970  if(!DefineEntity(p->dtd, e, pe))
2971  return error(p, "System error");
2972 
2973  return 0;
2974 }
2975 
2976 /* Called after reading '<!NOTATION ' */
2977 
2978 static int parse_notation_decl(Parser p)
2979 {
2980  Char *name;
2981  char8 *publicid, *systemid;
2982  NotationDefinition def;
2983 
2984  require(parse_name(p, "for name in notation declaration"));
2985  CopyName(name);
2986  maybe_uppercase(p, name);
2987 
2988  require(expect_dtd_whitespace(p, "after name in notation declaration"));
2989 
2990  require(parse_external_id(p, 1, &publicid, &systemid, 1, 0));
2991 
2992  require(skip_dtd_whitespace(p, p->external_pe_depth > 0));
2993  require(expect(p, '>', "at end of notation declaration"));
2994 
2995  if((def = FindNotation(p->dtd, name)))
2996  {
2997  if(def->tentative)
2998  RedefineNotation(def, publicid, systemid);
2999  else
3000  if(ParserGetFlag(p, WarnOnRedefinitions))
3001  {
3002  warn(p, "Ignoring redefinition of notation %S", name);
3003  if(publicid) Free(publicid);
3004  if(systemid) Free(systemid);
3005  }
3006  }
3007  else
3008  {
3009  if(!DefineNotation(p->dtd, name, publicid, systemid))
3010  return error(p, "System error");
3011  }
3012 
3013  Free(name);
3014 
3015  return 0;
3016 }
3017 
3018 static int parse_conditional(Parser p)
3019 {
3020  int depth=1;
3021 
3022  if(p->external_pe_depth == 0)
3023  return error(p, "Conditional section not allowed in internal subset");
3024 
3025  require(skip_dtd_whitespace(p, p->external_pe_depth > 0));
3026  if(looking_at(p, "INCLUDE"))
3027  {
3028  require(skip_dtd_whitespace(p, p->external_pe_depth > 0));
3029  require(expect(p, '[', "at start of conditional section"));
3030  require(skip_dtd_whitespace(p, p->external_pe_depth > 0));
3031  while(!looking_at(p, "]"))
3032  {
3033  switch(parse_markupdecl(p))
3034  {
3035  case 1:
3036  return error(p, "EOF in conditional section");
3037  case -1:
3038  return -1;
3039  }
3040  require(skip_dtd_whitespace(p, p->external_pe_depth > 0));
3041  }
3042 
3043  if(!looking_at(p, "]>"))
3044  return error(p, "]> required after ] in conditional section");
3045  }
3046  else if(looking_at(p, "IGNORE"))
3047  {
3048  /* Easy, because ]]> not even allowed in strings! */
3049 
3050  require(skip_dtd_whitespace(p, p->external_pe_depth > 0));
3051  require(expect(p, '[', "at start of conditional section"));
3052 
3053  while(depth > 0)
3054  {
3055  switch(get(p->source))
3056  {
3057  case XEOE:
3058  if(p->source->parent)
3059  ParserPop(p);
3060  else
3061  return error(p, "EOE in ignored conditional section");
3062  break;
3063  case '<':
3064  if(looking_at(p, "!["))
3065  depth++;
3066  break;
3067  case ']':
3068  if(looking_at(p, "]>"))
3069  depth--;
3070  }
3071  }
3072  }
3073  else
3074  return error(p, "INCLUDE or IGNORE required in conditional section");
3075 
3076  return 0;
3077 }
3078 
3079 static void maybe_uppercase(Parser p, Char *s)
3080 {
3081  if(ParserGetFlag(p, CaseInsensitive))
3082  while(*s)
3083  {
3084  *s = Toupper(*s);
3085  s++;
3086  }
3087 }
3088 
3089 static void maybe_uppercase_name(Parser p)
3090 {
3091  int i;
3092 
3093  if(ParserGetFlag(p, CaseInsensitive))
3094  for(i=0; i<p->namelen; i++)
3095  p->name[i] = Toupper(p->name[i]);
3096 }
3097 
3098 static int str_maybecase_cmp8(Parser p, const char8 *a, const char8 *b)
3099 {
3100  return
3101  ParserGetFlag(p, CaseInsensitive) ? strcasecmp8(a, b) : strcmp8(a, b);
3102 }
3103 
/*
 * Return 1 if c lies in 'a'..'z' or 'A'..'Z', and 0 otherwise.
 * Only the character value is examined; no locale is consulted.
 */
static int is_ascii_alpha(int c)
{
    int is_lower = ('a' <= c && c <= 'z');
    int is_upper = ('A' <= c && c <= 'Z');

    return is_lower || is_upper;
}
3108 
/*
 * Return 1 if c lies in '0'..'9', and 0 otherwise.
 * Only the character value is examined; no locale is consulted.
 */
static int is_ascii_digit(int c)
{
    if(c < '0')
        return 0;

    return c <= '9';
}
3113 
3114 /* Error handling */
3115 
3116 static void verror(XBit bit, const char8 *format, va_list args)
3117 {
3118  /* yuk, but we don't want to fail if we can't allocate */
3119  static char8 message[400];
3120 
3121  /* Print message before freeing xbit, so we can print data from it */
3122  Vsprintf(message, CE_ISO_8859_1, format, args);
3123 
3124  FreeXBit(bit);
3125  bit->type = XBIT_error;
3126  bit->error_message = message;
3127 }
3128 
3129 static int error(Parser p, const char8 *format, ...)
3130 {
3131  va_list args;
3132 
3133  va_start(args, format);
3134  verror(&p->xbit, format, args);
3135 
3136  p->state = PS_error;
3137 
3138  return -1;
3139 }
3140 
3141 static void warn(Parser p, const char8 *format, ...)
3142 {
3143  va_list args;
3144  static struct xbit bit;
3145 
3146  va_start(args, format);
3147  verror(&bit, format, args);
3148 
3149  bit.type = XBIT_warning;
3150 
3151  if(p->warning_callback)
3152  p->warning_callback(&bit, p->callback_arg);
3153  else
3154  ParserPerror(p, &bit);
3155 }
3156