Edinburgh Speech Tools  2.4-release
 All Classes Functions Variables Typedefs Enumerations Enumerator Friends Pages
input.c
1 /*************************************************************************/
2 /* */
3 /* Copyright (c) 1997-98 Richard Tobin, Language Technology Group, HCRC, */
4 /* University of Edinburgh. */
5 /* */
6 /* THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND, */
7 /* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */
8 /* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
9 /* IN NO EVENT SHALL THE AUTHOR OR THE UNIVERSITY OF EDINBURGH BE LIABLE */
10 /* FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF */
11 /* CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION */
12 /* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
13 /* */
14 /*************************************************************************/
15 /*
16  * This code is in a distressed state due to hackery for windoze.
17  * See comment in url.c.
18  */
19 
20 #include <stdio.h>
21 #include <stdlib.h>
22 #include <assert.h>
23 
24 #ifdef FOR_LT
25 
26 #include "lt-memory.h"
27 #include "nsllib.h"
28 
29 #define ERR(m) LT_ERROR(NECHAR,m)
30 #define ERR1(m,x) LT_ERROR1(NECHAR,m,x)
31 #define ERR2(m,x,y) LT_ERROR2(NECHAR,m,x,y)
32 #define ERR3(m,x,y,z) LT_ERROR3(NECHAR,m,x,y,z)
33 
34 #define Malloc salloc
35 #define Realloc srealloc
36 #define Free sfree
37 
38 #else
39 
40 #include "system.h"
41 #define ERR(m) fprintf(stderr,m)
42 #define ERR1(m,x) fprintf(stderr,m,x)
43 #define ERR2(m,x,y) fprintf(stderr,m,x,y)
44 #define ERR3(m,x,y,z) fprintf(stderr,m,x,y,z)
45 
46 #endif
47 
48 #include "charset.h"
49 #include "string16.h"
50 #include "dtd.h"
51 #include "input.h"
52 #include "url.h"
53 #include "ctype16.h"
54 
55 static int get_translated_line1(InputSource s);
56 
57 InputSource SourceFromStream(const char8 *description, FILE *file)
58 {
59  Entity e;
60 
61  e = NewExternalEntity(0, 0, description, 0, 0);
62  if(!strchr8(description, '/'))
63  EntitySetBaseURL(e, default_base_url());
64 
65  return NewInputSource(e, MakeFILE16FromFILE(file, "r"));
66 }
67 
68 InputSource EntityOpen(Entity e)
69 {
70  FILE16 *f16;
71 
72  if(e->type == ET_external)
73  {
74  const char8 *url = EntityURL(e);
75 
76  if(!url || !(f16 = url_open(url, 0, "r", 0)))
77  return 0;
78  }
79  else
80  {
81  f16 = MakeFILE16FromString((char *)e->text, -1, "r");
82  }
83 
84  return NewInputSource(e, f16);
85 }
86 
87 
88 InputSource NewInputSource(Entity e, FILE16 *f16)
89 {
90  InputSource source;
91 
92  if(!(source = Malloc(sizeof(*source))))
93  return 0;
94 
95  source->line = 0;
96  source->line_alloc = 0;
97  source->line_length = 0;
98  source->next = 0;
99  source->seen_eoe = 0;
100 
101  source->entity = e;
102 
103  source->file16 = f16;
104 
105  source->bytes_consumed = 0;
106  source->bytes_before_current_line = 0;
107  source->line_end_was_cr = 0;
108  source->line_number = 0;
109  source->not_read_yet = 1;
110 
111  source->nextin = source->insize = 0;
112 
113  source->parent = 0;
114 
115  return source;
116 }
117 
118 int SourceLineAndChar(InputSource s, int *linenum, int *charnum)
119 {
120  Entity e = s->entity, f = e->parent;
121 
122  if(e->type == ET_external)
123  {
124  *linenum = s->line_number;
125  *charnum = s->next;
126  return 1;
127  }
128 
129  if(f && f->type == ET_external)
130  {
131  if(e->matches_parent_text)
132  {
133  *linenum = e->line_offset + s->line_number;
134  *charnum = (s->line_number == 0 ? e->line1_char_offset : 0) +
135  s->next;
136  return 1;
137  }
138  else
139  {
140  *linenum = e->line_offset;
141  *charnum = e->line1_char_offset;
142  return 0;
143  }
144  }
145 
146  if(f && f->matches_parent_text)
147  {
148  *linenum = f->line_offset + e->line_offset;
149  *charnum = (e->line_offset == 0 ? f->line1_char_offset : 0) +
150  e->line1_char_offset;
151  return 0;
152  }
153 
154  return -1;
155 }
156 
157 void SourcePosition(InputSource s, Entity *entity, int *byte_offset)
158 {
159  *entity = s->entity;
160  *byte_offset = SourceTell(s);
161 }
162 
163 int SourceTell(InputSource s)
164 {
165 #if CHAR_SIZE == 8
166  return s->bytes_before_current_line + s->next;
167 #else
168  switch(s->entity->encoding)
169  {
170  case CE_ISO_10646_UCS_2B:
171  case CE_UTF_16B:
172  case CE_ISO_10646_UCS_2L:
173  case CE_UTF_16L:
174  return s->bytes_before_current_line + 2 * s->next;
175  case CE_ISO_8859_1:
176  case CE_ISO_8859_2:
177  case CE_ISO_8859_3:
178  case CE_ISO_8859_4:
179  case CE_ISO_8859_5:
180  case CE_ISO_8859_6:
181  case CE_ISO_8859_7:
182  case CE_ISO_8859_8:
183  case CE_ISO_8859_9:
184  case CE_unspecified_ascii_superset:
185  return s->bytes_before_current_line + s->next;
186  case CE_UTF_8:
187  if(s->complicated_utf8_line)
188  {
189  /* examine earlier chars in line to see how many bytes they used */
190  int i, c, n=0;
191  for(i = 0; i < s->next; i++)
192  {
193  c = s->line[i];
194  if(c <= 0x7f)
195  n += 1;
196  else if(c <= 0x7ff)
197  n += 2;
198  else if(c >= 0xd800 && c <= 0xdfff)
199  /* One of a surrogate pair, count 2 each */
200  n += 2;
201  else if(c <= 0xffff)
202  n += 3;
203  else if(c <= 0x1ffff)
204  n += 4;
205  else if(c <= 0x3ffffff)
206  n += 5;
207  else
208  n += 6;
209 
210  }
211  return s->bytes_before_current_line + n;
212  }
213  else
214  return s->bytes_before_current_line + s->next;
215  default:
216  return -1;
217  }
218 #endif
219 }
220 
221 int SourceSeek(InputSource s, int offset)
222 {
223  s->line_length = 0;
224  s->next = 0;
225  s->seen_eoe = 0;
226  s->bytes_consumed = s->bytes_before_current_line = offset;
227  s->nextin = s->insize = 0;
228  /* XXX line number will be wrong! */
229  s->line_number = -999999;
230  return Fseek(s->file16, offset, SEEK_SET);
231 }
232 
233 static int get_translated_line(InputSource s)
234 {
235  /* This is a hack, pending some reorganisation */
236 
237  struct _FILE16 {
238  void *handle;
239  int handle2, handle3;
240  /* we don't need the rest here */
241  };
242 
243  Entity e = s->entity;
244  Char *p;
245  struct _FILE16 *f16 = (struct _FILE16 *)s->file16;
246 
247 
248  if(e->type == ET_external)
249  return get_translated_line1(s);
250 
251  if(!*(Char *)((char *)f16->handle + f16->handle2))
252  {
253  s->line_length = 0;
254  return 0;
255  }
256 
257  s->line = (Char *)((char *)f16->handle + f16->handle2);
258  for(p=s->line; *p && *p != '\n'; p++)
259  ;
260  if(*p)
261  p++;
262  f16->handle2 = (char *)p - (char *)f16->handle;
263  s->line_length = p - s->line;
264 
265  s->bytes_before_current_line = f16->handle2;
266 
267  return 0;
268 }
269 
270 static int get_translated_line1(InputSource s)
271 {
272  unsigned int c; /* can't use Char, it might be >0x10000 */
273  unsigned char *inbuf = s->inbuf;
274  int nextin = s->nextin, insize = s->insize;
275  int startin = s->nextin;
276  Char *outbuf = s->line;
277  int outsize = s->line_alloc;
278  int nextout = 0;
279  int remaining = 0;
280  int ignore_linefeed = s->line_end_was_cr;
281 
282 #if CHAR_SIZE == 16
283 
284  int *to_unicode = 0; /* initialize to shut gcc up */
285  CharacterEncoding enc = s->entity->encoding;
286  int more, i;
287  s->complicated_utf8_line = 0;
288 
289  if(enc >= CE_ISO_8859_2 && enc <= CE_ISO_8859_9)
290  to_unicode = iso_to_unicode[enc - CE_ISO_8859_2];
291 
292 #endif
293 
294  s->line_end_was_cr = 0;
295  s->bytes_before_current_line = s->bytes_consumed;
296 
297  while(1)
298  {
299  /* There are never more characters than bytes in the input */
300  if(outsize < nextout + (insize - nextin))
301  {
302  outsize = nextout + (insize - nextin);
303  outbuf = Realloc(outbuf, outsize * sizeof(Char));
304  }
305 
306  while(nextin < insize)
307  {
308 #if CHAR_SIZE == 8
309  c = inbuf[nextin++];
310 #else
311  switch(enc)
312  {
313  case CE_ISO_10646_UCS_2B:
314  case CE_UTF_16B:
315  if(nextin+2 > insize)
316  goto more_bytes;
317  c = (inbuf[nextin] << 8) + inbuf[nextin+1];
318  nextin += 2;
319  break;
320  case CE_ISO_10646_UCS_2L:
321  case CE_UTF_16L:
322  if(nextin+2 > insize)
323  goto more_bytes;
324  c = (inbuf[nextin+1] << 8) + inbuf[nextin];
325  nextin += 2;
326  break;
327  case CE_ISO_8859_1:
328  case CE_unspecified_ascii_superset:
329  c = inbuf[nextin++];
330  break;
331  case CE_ISO_8859_2:
332  case CE_ISO_8859_3:
333  case CE_ISO_8859_4:
334  case CE_ISO_8859_5:
335  case CE_ISO_8859_6:
336  case CE_ISO_8859_7:
337  case CE_ISO_8859_8:
338  case CE_ISO_8859_9:
339  c = to_unicode[inbuf[nextin++]];
340  if(c == (unsigned int)-1)
341  ERR3("Illegal %s character <0x%x> "
342  "at file offset %d\n",
343  CharacterEncodingName[enc], inbuf[nextin-1],
344  s->bytes_consumed + nextin - 1 - startin);
345  break;
346  case CE_UTF_8:
347  c = inbuf[nextin++];
348  if(c <= 0x7f)
349  break;
350  if(c <= 0xc0 || c >= 0xfe)
351  {
352  ERR2("Illegal UTF-8 start byte <0x%x> "
353  "at file offset %d\n",
354  c, s->bytes_consumed + nextin - 1 - startin);
355  return -1;
356  }
357  if(c <= 0xdf)
358  {
359  c &= 0x1f;
360  more = 1;
361  }
362  else if(c <= 0xef)
363  {
364  c &= 0x0f;
365  more = 2;
366  }
367  else if(c <= 0xf7)
368  {
369  c &= 0x07;
370  more = 3;
371  }
372  else if(c <= 0xfb)
373  {
374  c &= 0x03;
375  more = 4;
376  }
377  else
378  {
379  c &= 0x01;
380  more = 5;
381  }
382  if(nextin+more > insize)
383  {
384  nextin--;
385  goto more_bytes;
386  }
387  s->complicated_utf8_line = 1;
388  for(i=0; i<more; i++)
389  c = (c << 6) + (inbuf[nextin++] & 0x3f);
390  break;
391  default:
392  ERR("read from entity with unsupported encoding!\n");
393  return -1;
394  }
395 
396  if(c > 0x110000 || (c < 0x10000 && !is_xml_legal(c)))
397  if(!(enc == CE_UTF_16L || enc == CE_UTF_16B) ||
398  c < 0xd800 || c > 0xdfff)
399  /* We treat the surrogates as legal because we didn't
400  combine them when translating from UTF-16. XXX */
401  {
402  ERR2("Error: illegal character <0x%x> "
403  "immediately before file offset %d\n",
404  c, s->bytes_consumed + nextin - startin);
405  return -1;
406  }
407 #endif
408  if(c == '\n' && ignore_linefeed)
409  {
410  /* Ignore lf at start of line if last line ended with cr */
411  ignore_linefeed = 0;
412  s->bytes_before_current_line += (nextin - startin);
413  }
414  else
415  {
416  ignore_linefeed = 0;
417  if(c == '\r')
418  {
419  s->line_end_was_cr = 1;
420  c = '\n';
421  }
422 
423 #if CHAR_SIZE == 16
424  if(c >= 0x10000)
425  {
426  /* Use surrogates */
427  outbuf[nextout++] = ((c - 0x10000) >> 10) + 0xd800;
428  outbuf[nextout++] = ((c - 0x10000) & 0x3ff) + 0xdc00;
429  }
430  else
431  outbuf[nextout++] = c;
432 #else
433  outbuf[nextout++] = c;
434 #endif
435 
436  if(c == '\n')
437  {
438  s->nextin = nextin;
439  s->insize = insize;
440  s->bytes_consumed += (nextin - startin);
441  s->line = outbuf;
442  s->line_alloc = outsize;
443  s->line_length = nextout;
444  return 0;
445  }
446  }
447  }
448 
449 #if CHAR_SIZE == 16
450  more_bytes:
451  /* Copy down any partial character */
452 
453  remaining = insize - nextin;
454  for(i=0; i<remaining; i++)
455  inbuf[i] = inbuf[nextin + i];
456 #endif
457 
458  /* Get another block */
459 
460  s->bytes_consumed += (nextin - startin);
461 
462  insize = Readu(s->file16,
463  inbuf+insize-nextin, sizeof(s->inbuf)-remaining);
464  nextin = startin = 0;
465 
466  if(insize <= 0)
467  {
468  s->nextin = nextin;
469  s->insize = 0;
470  s->line = outbuf;
471  s->line_alloc = outsize;
472  s->line_length = nextout;
473  return insize;
474  }
475 
476  insize += remaining;
477  }
478 }
479 
480 void determine_character_encoding(InputSource s)
481 {
482  Entity e = s->entity;
483  int nread;
484  unsigned char *b = (unsigned char *)s->inbuf;
485 
486  b[0] = b[1] = b[2] = b[3] = 0;
487 
488  while(s->insize < 4)
489  {
490  nread = Readu(s->file16, s->inbuf + s->insize, 4 - s->insize);
491  if(nread == -1)
492  return;
493  if(nread == 0)
494  break;
495  s->insize += nread;
496  }
497 
498 #if 0
499  if(b[0] == 0 && b[1] == 0 && b[2] == 0 && b[3] == '<')
500  e->encoding = CE_ISO_10646_UCS_4B;
501  else if(b[0] == '<' && b[1] == 0 && b[2] == 0 && b[3] == 0)
502  e->encoding = CE_ISO_10646_UCS_4L;
503  else
504 #endif
505  if(b[0] == 0xfe && b[1] == 0xff)
506  {
507  e->encoding = CE_UTF_16B;
508  s->nextin = 2;
509  }
510  else if(b[0] == 0 && b[1] == '<' && b[2] == 0 && b[3] == '?')
511  e->encoding = CE_UTF_16B;
512  else if(b[0] == 0xff && b[1] == 0xfe)
513  {
514  e->encoding = CE_UTF_16L;
515  s->nextin = 2;
516  }
517  else if(b[0] == '<' && b[1] == 0 && b[2] == '?' && b[3] == 0)
518  e->encoding = CE_UTF_16L;
519  else
520  {
521 #if CHAR_SIZE == 8
522  e->encoding = CE_unspecified_ascii_superset;
523 #else
524  e->encoding = CE_UTF_8;
525 #endif
526  }
527 }
528 
529 int get_with_fill(InputSource s)
530 {
531  assert(!s->seen_eoe);
532 
533  if(get_translated_line(s) != 0)
534  {
535  /* It would be nice to pass this up to the parser, but we don't
536  know anything about parsers here! */
537  ERR1("I/O error on stream <%s>, ignore further errors\n",
538  EntityDescription(s->entity));
539 
540  /* Restore old line and return EOE (is this the best thing to do?) */
541  s->line_length = s->next;
542  s->seen_eoe = 1;
543  return XEOE;
544  }
545 
546  if(s->line_length == 0)
547  {
548  /* Restore old line */
549  s->line_length = s->next;
550  s->seen_eoe = 1;
551  return XEOE;
552  }
553 
554  s->next = 0;
555 
556  if(s->not_read_yet)
557  s->not_read_yet = 0;
558  else
559  s->line_number++;
560 
561  return s->line[s->next++];
562 }