Edinburgh Speech Tools  2.4-release
 All Classes Functions Variables Typedefs Enumerations Enumerator Friends Pages
url.c
1 /*************************************************************************/
2 /* */
3 /* Copyright (c) 1997-98 Richard Tobin, Language Technology Group, HCRC, */
4 /* University of Edinburgh. */
5 /* */
6 /* THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND, */
7 /* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */
8 /* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
9 /* IN NO EVENT SHALL THE AUTHOR OR THE UNIVERSITY OF EDINBURGH BE LIABLE */
10 /* FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF */
11 /* CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION */
12 /* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
13 /* */
14 /*************************************************************************/
15 #ifdef FOR_LT
16 
17 #include "lt-defs.h"
18 #include "lt-memory.h"
19 #include "lt-errmsg.h"
20 #include "lt-comment.h"
21 #include "lt-safe.h"
22 #include "nsl-err.h"
23 
24 #define Strerror() strErr()
25 #define Malloc salloc
26 #define Realloc srealloc
27 #define Free sfree
28 #define fopen stdsfopen
29 
30 #else
31 
32 #include "system.h"
33 
34 #define LT_ERROR(err, format) fprintf(stderr, format)
35 #define LT_ERROR1(err, format, arg) fprintf(stderr, format, arg)
36 #define LT_ERROR2(err, format, arg1, arg2) fprintf(stderr, format, arg1, arg2)
37 #define LT_ERROR3(err, format, arg1, arg2, arg3) fprintf(stderr, format, arg1, arg2, arg3)
38 #define WARN(err, format) fprintf(stderr, format)
39 #define WARN1(err, format, arg) fprintf(stderr, format, arg)
40 
41 #define Strerror() strerror(errno)
42 
43 #ifdef MAXPATHLEN
44 #define CWDBS MAXPATHLEN+1
45 #else
46 #define CWDBS 1025
47 #endif
48 
49 #define GETWD(buf) getcwd(buf,CWDBS)
50 
51 #endif /* FOR_LT */
52 
53 #include <stdio.h>
54 #include <ctype.h>
55 #include <stdlib.h>
56 #include <assert.h>
57 #include <errno.h>
58 #include <string.h> /* that's where strerror is. really. */
59 #include <sys/types.h>
60 
61 #ifdef WIN32
62 #include <direct.h>
63 #endif
64 
65 #ifdef SOCKETS_IMPLEMENTED
66 
67 #ifdef WIN32
68 #undef boolean
69 #include <winsock.h>
70 #include <fcntl.h>
71 #else
72 #include <unistd.h>
73 #include <netdb.h>
74 #include <sys/socket.h>
75 #include <netinet/in.h>
76 #endif
77 
78 #endif
79 
80 #include "string16.h"
81 #include "stdio16.h"
82 #include "url.h"
83 
84 #ifdef HAVE_LIBZ
85 #include "zlib.h"
86 #ifdef macintosh
87 #include <fcntl.h>
88 #include <unix.h>
89 #endif
90 #endif
91 
92 static FILE16 *http_open(const char *url,
93  const char *host, int port, const char *path,
94  const char *type);
95 static FILE16 *file_open(const char *url,
96  const char *host, int port, const char *path,
97  const char *type);
98 
99 static void parse_url(const char *url,
100  char **scheme, char **host, int *port, char **path);
101 
102 /* Mapping of scheme names to opening functions */
103 
104 struct {
105  char *scheme;
106  FILE16 *(*open)(const char *, const char *, int, const char *, const char *);
107 } schemes[] = {
108  {(char *)"http", http_open},
109  {(char *)"file", file_open},
110 };
111 #define NSCHEME (sizeof(schemes) / sizeof(schemes[0]))
112 
113 /* Construct a default base URL, essentially file:`pwd`/ */
114 
115 char *default_base_url(void)
116 {
117  char buf[CWDBS];
118  char *url;
119 
120  if(!GETWD(buf))
121  {
122  WARN(LEFILE, "Warning: can't get current directory for default base url\n");
123  return strdup8("file:/");
124  }
125 
126 
127 #ifdef WIN32
128 
129  /* DOS: translate C:\a\b to file:/C:/a/b/ */
130  /* XXX should we escape anything? */
131  {
132  char *p;
133  for(p=buf; *p; p++)
134  if(*p == '\\')
135  *p = '/';
136  }
137  url = Malloc(6 + strlen(buf) + 2);
138  sprintf(url, "file:/%s/", buf);
139 
140 #else
141 #ifdef mac_filenames
142 
143  /* Mac: translate a:b to file:/a/b/ */
144  /* XXX should escape spaces and slashes, at least */
145  {
146  char *p;
147  for(p=buf; *p; p++)
148  if(*p == ':')
149  *p = '/';
150  /* Mac getcwd (always?) has a trailing separator, which we here bash */
151  if(*--p == '/')
152  *p = 0;
153  }
154  url = Malloc(6 + strlen(buf) + 2);
155  sprintf(url, "file:/%s/", buf);
156 
157 #else
158 
159  /* Unix: translate /a/b to file:/a/b/ */
160 
161  url = Malloc(5 + strlen(buf) + 2);
162  sprintf(url, "file:%s/", buf);
163 
164 #endif
165 #endif
166 
167  return url;
168 }
169 
170 /*
171  * Merge a URL with a base URL if necessary.
172  * The merged URL is returned.
173  * The parts of the URL are returned in scheme, host, port and path
174  * if these are non-null.
175  * Caller should free the results.
176  */
177 
178 char *url_merge(const char *url, const char *base,
179  char **_scheme, char **_host, int *_port, char **_path)
180 {
181  char *merged_scheme, *merged_host, *merged_path, *merged_url;
182  char *scheme=0, *host=0, *path=0;
183  char *base_scheme=0, *base_host=0, *base_path=0;
184  char *default_base=0;
185  int port, base_port, merged_port, i, j;
186  char *p;
187 
188  /* First see if we have an absolute URL */
189 
190  parse_url(url, &scheme, &host, &port, &path);
191  if(scheme && (host || *path == '/'))
192  {
193  merged_scheme = scheme;
194  merged_host = host;
195  merged_port = port;
196  merged_path = path;
197  merged_url = strdup8(url);
198  goto ok;
199  }
200 
201  /* Relative URL, so we need the base URL */
202 
203  if(!base)
204  base = default_base = default_base_url();
205 
206  parse_url(base, &base_scheme, &base_host, &base_port, &base_path);
207  if(base_scheme && (base_host || *base_path == '/'))
208  ;
209  else
210  {
211  LT_ERROR1(LEFILE, "Error: bad base URL <%s>\n", base);
212  goto bad;
213  }
214 
215  /* Determine merged path */
216 
217  if(path[0] == '/')
218  {
219  /* not relative, use as-is */
220  merged_path = path;
221  path = 0;
222  }
223  else
224  {
225  /* relative, append to base path */
226 
227  merged_path = Malloc(strlen(base_path) + strlen(path) + 1);
228  strcpy(merged_path, base_path);
229 
230  /* strip last component of base */
231 
232  for(i=strlen(merged_path)-1; i>=0 && merged_path[i] != '/'; i--)
233  merged_path[i] = '\0';
234 
235  /* append relative path */
236 
237  strcat(merged_path, path);
238 
239  /* Remove . and .. components from path */
240 
241  p = merged_path;
242  for(i=0; p[i]; )
243  {
244  assert(p[i] == '/');
245 
246  /* find next segment */
247 
248  for(j=i+1; p[j] && p[j] != '/'; j++)
249  ;
250 
251  /* Do we have "." ? */
252 
253  if(j - i == 2 && p[i+1] == '.')
254  {
255  strcpy(&p[i+1], p[j] ? &p[j+1] : &p[j]);
256  continue;
257  }
258 
259  /* Do we have "<segment>/.." with <segment> != ".." ? */
260 
261  /* (We know we're not looking at "./" so we don't have to
262  * worry about "./..")
263  */
264 
265  if(p[j] == '/' && p[j+1] == '.' && p[j+2] == '.' &&
266  (p[j+3] == '/' || p[j+3] == '\0') &&
267  (j - i != 3 || p[i+1] != '.' || p[i+2] != '.'))
268  {
269  strcpy(&p[i+1], p[j+3] ? &p[j+4] : &p[j+3]);
270  i = 0; /* start again from beginning */
271  continue;
272  }
273 
274  /* move to next segment */
275 
276  i = j;
277  }
278  }
279 
280  /* Check for deviant relative URLs like file:foo */
281 
282  if(scheme && !host && *path != '/')
283  {
284  if(strcmp(scheme, base_scheme) == 0)
285  {
286  WARN1(LEFILE,
287  "Warning: relative URL <%s> contains scheme, contrary to RFC 1808\n",
288  url);
289  }
290  else
291  {
292  LT_ERROR2(LEFILE,
293  "Error: relative URL <%s> has scheme different from base <%s>\n",
294  url, base);
295  goto bad;
296  }
297  }
298 
299  /* Return the parts and the whole thing */
300 
301  merged_scheme = base_scheme; if(scheme) Free(scheme);
302 
303  if(host)
304  {
305  merged_host = host; Free(base_host);
306  merged_port = port;
307  }
308  else
309  {
310  merged_host = base_host;
311  merged_port = base_port;
312  }
313 
314  Free(path); Free(base_path);
315 
316  merged_url = Malloc(strlen(merged_scheme) + 1 +
317  (merged_host ? 2 + strlen(merged_host) + 10 : 0) +
318  strlen(merged_path) + 1);
319  if(merged_host)
320  {
321  if(merged_port == -1)
322  sprintf(merged_url, "%s://%s%s",
323  merged_scheme, merged_host, merged_path);
324  else
325  sprintf(merged_url, "%s://%s:%d%s",
326  merged_scheme, merged_host, merged_port, merged_path);
327  }
328  else
329  sprintf(merged_url, "%s:%s", merged_scheme, merged_path);
330 
331 ok:
332  Free(default_base);
333  if(_scheme) *_scheme = merged_scheme; else Free(merged_scheme);
334  if(_host) *_host = merged_host; else Free(merged_host);
335  if(_port) *_port = merged_port;
336  if(_path) *_path = merged_path; else Free(merged_path);
337 
338  return merged_url;
339 
340 bad:
341  Free(default_base);
342  Free(scheme);
343  Free(host);
344  Free(path);
345  Free(base_scheme);
346  Free(base_host);
347  Free(base_path);
348 
349  return NULL;
350 }
351 
352 /*
353  * Open a stream to a URL.
354  * url may be a relative URL, in which case it is merged with base,
355  * which is typically the URL of the containing document. If base
356  * is null, file:`pwd`/ is used, which is the right thing to do for
357  * filenames. If base is "", there is no base URL and relative
358  * URLs will fail.
359  * If merged_url is non-null the resulting URL is stored in it.
360  * If type begins "r", the URL is opened for reading, if "w" for
361  * writing. Writing is only supported for file URLs.
362  * If the type begins "rl", the data will be copied to a temporary
363  * file so that seeking is possible (NOT YET IMPLEMENTED).
364  * Returns a FILE16 for success, NULL for failure.
365  */
366 
367 FILE16 *url_open(const char *url, const char *base, const char *type,
368  char **merged_url)
369 {
370  char *scheme, *host, *path, *m_url;
371  int port, i;
372  FILE16 *f;
373 #ifdef HAVE_LIBZ
374  int len, gzipped = 0;
375 #endif
376 
377  /* Determine the merged URL */
378 
379  if(!(m_url = url_merge(url, base, &scheme, &host, &port, &path)))
380  return 0;
381 
382 #ifdef HAVE_LIBZ
383  len = strlen(m_url);
384  if(len > 3 && strcmp8(m_url+len-3, ".gz") == 0)
385  gzipped = 1;
386 #endif
387 
388  /*
389  printf("<%s> <%s> <%d> <%s>\n", scheme, host ? host : "", port, path);
390  printf("%s\n", m_url);
391  */
392 
393  /* Pass to the appropriate opening function */
394 
395  for(i=0; i<NSCHEME; i++)
396  if(strcmp(scheme, schemes[i].scheme) == 0)
397  {
398  f = schemes[i].open(m_url, host, port, path, type);
399 
400  Free(scheme);
401  if(host)
402  Free(host);
403  Free(path);
404 
405  if(!f)
406  return f;
407 
408 #ifdef HAVE_LIBZ
409  if(gzipped)
410  {
411  /* We have a gzip-compressed file which we hand to gzopen
412  * for further processing.
413  */
414  gzFile gfile;
415  FILE *file = GetFILE(f);
416 
417  if(!f)
418  {
419  LT_ERROR1(LEFILE,
420  "Can't attach gzip processor to URL \"%s\"\n",
421  m_url);
422  Free(m_url);
423  return 0;
424  }
425 #ifdef macintosh
426  gfile =gzdopen(dup(fileno(file)), *type == 'r' ? "rb" : "wb");
427 #else
428  gfile = gzdopen(dup(fileno(file)), type);
429 #endif
430  Fclose(f);
431  f = MakeFILE16FromGzip(gfile, type);
432  }
433 #endif
434  if(f && merged_url)
435  *merged_url = m_url;
436  else
437  Free(m_url);
438 
439  return f;
440  }
441 
442  /* Not implemented */
443 
444  LT_ERROR1(LEFILE, "Error: scheme \"%s\" not implemented\n", scheme);
445 
446  Free(scheme);
447  if(host)
448  Free(host);
449  Free(path);
450  Free(m_url);
451 
452  return 0;
453 }
454 
455 /* Open an http URL */
456 
457 static FILE16 *http_open(const char *url,
458  const char *host, int port, const char *path,
459  const char *type)
460 {
461 #ifndef SOCKETS_IMPLEMENTED
462  LT_ERROR(NEUNSUP,
463  "http: URLs are not yet implemented on this platform\n");
464  return 0;
465 #else
466  FILE16 *f16;
467  struct sockaddr_in addr;
468  struct hostent *hostent;
469  int s, server_major, server_minor, status, count, c;
470  char reason[81];
471 #ifndef WIN32
472  FILE *fin,*fout;
473 #else
474  static int inited=0;
475  int i;
476  static char buf[1024];
477  if (!inited)
478  {
479  WORD version = MAKEWORD(1, 1);
480  WSADATA wsaData;
481  int err = WSAStartup(version, &wsaData);
482  if (err)
483  {
484  LT_ERROR(LEFILE, "Error: can't init HTTP interface\n");
485  return 0;
486  }
487  else if(LOBYTE(wsaData.wVersion) != 1 || HIBYTE(wsaData.wVersion) != 1)
488  {
489  LT_ERROR(LEFILE, "Error: wrong version of WINSOCK\n");
490  WSACleanup();
491  return 0;
492  }
493  inited = 1;
494  }
495 #endif
496 
497  if(*type != 'r')
498  {
499  LT_ERROR1(LEFILE, "Error: can't open http URL \"%s\" for writing\n",
500  url);
501  return 0;
502  }
503 
504  if(!host)
505  {
506  LT_ERROR1(LEFILE, "Error: no host part in http URL \"%s\"\n", url);
507  return 0;
508  }
509 
510  /* Create the socket */
511 
512  s = socket(PF_INET, SOCK_STREAM, 0);
513 #ifdef WIN32
514  if (s == INVALID_SOCKET) {
515  LT_ERROR1(LEFILE, "Error: system call socket failed: %d\n",
516  WSAGetLastError());
517  };
518 #else
519  if(s == -1) {
520  LT_ERROR1(LEFILE, "Error: system call socket failed: %s\n",
521  Strerror());
522  return 0;
523  };
524 #endif
525 
526  /* Find the server address */
527 
528  hostent = gethostbyname(host);
529  if(!hostent)
530  {
531  LT_ERROR1(LEFILE,
532  "Error: can't find address for host in http URL \"%s\"\n",
533  url);
534  return 0;
535  }
536 
537  memset(&addr, 0, sizeof(addr));
538  addr.sin_family = AF_INET;
539  /* If we were really enthusiastic, we would try all the host's addresses */
540  memcpy(&addr.sin_addr, hostent->h_addr, hostent->h_length);
541  addr.sin_port = htons((u_short)(port == -1 ? 80 : port));
542 
543  /* Connect */
544 
545  if(connect(s, (struct sockaddr *)&addr, sizeof(addr)) == -1)
546  {
547  LT_ERROR1(LEFILE, "Error: system call connect failed: %s\n",
548  Strerror());
549  return 0;
550  }
551 
552 #ifndef WIN32
553 #ifdef macintosh
554  fin = fdopen(s, "rb");
555  setvbuf(fin, 0, _IONBF, 0);
556  fout = fdopen(dup(s), "wb");
557 #else
558  fin = fdopen(s, "r");
559  setvbuf(fin, 0, _IONBF, 0);
560  fout = fdopen(dup(s), "w");
561 #endif
562 #endif
563 
564  /* Send the request */
565 
566  /*
567  * Apparently on the Macintosh, \n might not be ASCII LF, so we'll
568  * use numerics to be sure.
569  */
570 
571 #ifdef WIN32
572  sprintf(buf, "GET %s HTTP/1.0\012\015Connection: close\012\015\012\015",
573  path);
574  if (send(s,buf,strlen8(buf),0)==SOCKET_ERROR) {
575  LT_ERROR1(LEFILE, "Error: system call socket failed: %d\n",
576  WSAGetLastError());
577  /* XXX close the socket? */
578  return 0;
579  };
580 #else
581  fprintf(fout, "GET %s HTTP/1.0\012\015Connection: close\012\015\012\015",
582  path);
583 
584  /* We used to test for errors after doing fclose, but this seemed
585  to produce spurious errors under Linux (RedHat 4.2), so now we
586  do fflush and test after that. */
587 
588  fflush(fout);
589  if(ferror(fout))
590  {
591  LT_ERROR1(LEWRTF, "Error: write to socket failed: %s\n",Strerror());
592  fclose(fout);
593  fclose(fin);
594  return 0;
595  }
596  fclose(fout);
597 #endif
598 
599  /* Read the status line */
600 #ifdef WIN32
601  for(i=0; i<sizeof(buf)-1; i++)
602  {
603  if(recv(s, &buf[i], 1, 0) != 1)
604  LT_ERROR1(LEFILE,
605  "Error: recv error from server for URL \"%s\"\n",
606  url);
607  if(buf[i] == '\n')
608  break;
609  }
610  count=sscanf(buf, "HTTP/%d.%d %d %80[^\012]",
611  &server_major, &server_minor, &status, reason);
612 #else
613  count=fscanf(fin, "HTTP/%d.%d %d %80[^\012]",
614  &server_major, &server_minor, &status, reason);
615 #endif
616 
617  if(count != 4)
618  {
619  LT_ERROR3(LEFILE,
620  "Error: bad header from server for URL \"%s\"\n%d %s\n",
621  url, count, Strerror());
622 #ifndef WIN32
623  fclose(fin);
624 #endif
625  return 0;
626  }
627 
628  if(status != 200)
629  {
630  /* We should handle 301 (redirection) but we don't */
631  LT_ERROR3(LEFILE, "Error: can't retrieve \"%s\": %d %s\n",
632  url, status, reason);
633 #ifndef WIN32
634  fclose(fin);
635 #endif
636  return 0;
637  }
638 
639  /* Skip other headers */
640 
641  count = 0;
642 #ifdef WIN32
643  while(recv(s, buf, 1, 0) == 1 && (c = buf[0], 1) || (c = EOF, 0))
644 #else
645  while((c = getc(fin)) != EOF)
646 #endif
647  {
648  if(c == '\012')
649  count++;
650  else if(c != '\015')
651  count = 0;
652  if(count == 2)
653  break;
654  }
655 
656  if(c == EOF)
657  {
658  LT_ERROR1(LEFILE, "Error: EOF in headers retrieving \"%s\"\n", url);
659 #ifndef WIN32
660  fclose(fin);
661 #endif
662  return 0;
663  }
664 
665 #ifdef WIN32
666  f16 = MakeFILE16FromWinsock(s, type);
667 #else
668  f16 = MakeFILE16FromFILE(fin, type);
669 #endif
670 
671  SetCloseUnderlying(f16, 1);
672  return f16;
673 #endif /* SOCKETS_IMPLEMENTED */
674 }
675 
676 /* Open a file URL (easy, at least on unix) */
677 
678 static FILE16 *file_open(const char *url,
679  const char *host, int port, const char *path,
680  const char *type)
681 {
682  FILE *f;
683  FILE16 *f16;
684  char *file;
685 
686  if(host && host[0])
687  WARN1(LEFILE, "Warning: ignoring host part in file URL \"%s\"\n", url);
688 
689 #ifdef WIN32
690 
691  /* DOS: translate /C:/a/b.c to C:\a\b.c */
692 
693  if(path[0] == '/' && path[1] && path[2] == ':')
694  path++;
695 
696  file = strdup8(path);
697  {
698  char *p;
699  for(p=file; *p; p++)
700  if(*p == '/')
701  *p = '\\';
702  }
703 
704 #else
705 #ifdef mac_filenames
706 
707  /* Mac: translate /a/b.c to a:b.c */
708 
709  if(*path == '/')
710  path++;
711 
712  file = strdup8(path);
713  {
714  char *p;
715  for(p=file; *p; p++)
716  if(*p == '/')
717  *p = ':';
718  }
719 #else
720 
721  /* Unix: a path is a path is a path! */
722 
723  file = strdup8(path);
724 
725 #endif
726 #endif
727 
728  /* XXX should undo any escapes */
729 
730  f = fopen(file, type);
731  if(!f)
732  {
733  perror(file);
734  Free(file);
735  return 0;
736  }
737 
738  Free(file);
739 
740  f16 = MakeFILE16FromFILE(f, type);
741  SetCloseUnderlying(f16, 1);
742 
743  return f16;
744 }
745 
746 static void parse_url(const char *url,
747  char **scheme, char **host, int *port, char **path)
748 {
749  char *p, *q;
750  int warned = 0;
751 
752  *scheme = *host = *path = 0;
753  *port = -1;
754 
755  /* Does it start with a scheme? */
756 
757  for(p = (char *)url; *p; p++)
758  if(*p == ':' || *p == '/')
759  break;
760 
761  if(p > url && *p == ':')
762  {
763  *scheme = Malloc(p - url + 1);
764  strncpy(*scheme, url, p - url);
765  (*scheme)[p - url] = '\0';
766  url = p+1;
767  }
768 
769  /* Does it have a net_loc? */
770 
771  if(url[0] == '/' && url[1] == '/')
772  {
773  url += 2;
774 
775  for(p = (char *)url; *p; p++)
776  if(*p == '/')
777  break;
778 
779  /* Does it have a port number? */
780 
781  for(q = p-1; q >= url; q--)
782  if(!isdigit((int)*q))
783  break;
784 
785  if(q < p-1 && *q == ':')
786  *port = atoi(q+1);
787  else
788  q = p;
789 
790  *host = Malloc(q - url + 1);
791  strncpy(*host, url, q - url);
792  (*host)[q - url] = '\0';
793  url = p;
794  }
795 
796  /* The rest is the path */
797 
798  if(*url)
799  *path = strdup8(url);
800  else
801  *path = strdup8("/");
802 
803  /* Windoze users have a tendency to use backslashes instead of slashes */
804 
805  for(p=*path; *p; p++)
806  if(*p == '\\')
807  {
808  if(!warned)
809  {
810  WARN1(LEFILE, "Warning: illegal backslashes in URL path \"%s\""
811  "replaced by slashes\n", url);
812  warned = 1;
813  }
814 
815  *p = '/';
816  }
817 }
818