Edinburgh Speech Tools  2.4-release
 All Classes Functions Variables Typedefs Enumerations Enumerator Friends Pages
charset.c
1 /*************************************************************************/
2 /* */
3 /* Copyright (c) 1997-98 Richard Tobin, Language Technology Group, HCRC, */
4 /* University of Edinburgh. */
5 /* */
6 /* THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND, */
7 /* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */
8 /* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
9 /* IN NO EVENT SHALL THE AUTHOR OR THE UNIVERSITY OF EDINBURGH BE LIABLE */
10 /* FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF */
11 /* CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION */
12 /* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
13 /* */
14 /*************************************************************************/
15 #include <stdio.h>
16 #include <stdlib.h>
17 
18 #ifdef FOR_LT
19 
20 #include "lt-memory.h"
21 
22 #define Malloc salloc
23 
24 #else
25 
26 #include "system.h"
27 
28 #endif
29 
30 #include "charset.h"
31 #include "string16.h"
32 
/*
 * Conversion tables for the eight table-driven 8-bit charsets.  Row i of
 * each array corresponds to row i of latin_table (ISO-8859-2 ... ISO-8859-9).
 * All three arrays are filled in by init_charset().
 */
33 int iso_to_unicode[8][256]; /* byte value -> Unicode code point; -1 where latin_table has no mapping */
34 int iso_max_val[8]; /* largest Unicode code point produced by each charset (never below 0x9f) */
35 char8 *unicode_to_iso[8]; /* code point 0..iso_max_val[i] -> byte value; '?' if the charset lacks it */
36 
37 /* This table is used to initialise the above arrays.
 *
 * One row per charset, 96 entries per row: entry k is the Unicode code
 * point for byte value 0xa0+k (init_charset indexes a row with j-0xa0).
 * Byte values below 0xa0 are not listed; init_charset maps them to
 * themselves.  An entry of -1 (written -00001 to keep the columns aligned)
 * marks a byte value that has no mapping.
 *
 * The row labels "latinN" are this file's shorthand for row ISO-8859-N;
 * rows 5-8 hold Cyrillic, Arabic, Greek and Hebrew code points.
 */
38 
39 static int latin_table[8][96] = {
40 
41 /* latin2: ISO-8859-2 */
42 {
43 0x00a0, 0x0104, 0x02d8, 0x0141, 0x00a4, 0x013d, 0x015a, 0x00a7,
44 0x00a8, 0x0160, 0x015e, 0x0164, 0x0179, 0x00ad, 0x017d, 0x017b,
45 0x00b0, 0x0105, 0x02db, 0x0142, 0x00b4, 0x013e, 0x015b, 0x02c7,
46 0x00b8, 0x0161, 0x015f, 0x0165, 0x017a, 0x02dd, 0x017e, 0x017c,
47 0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7,
48 0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e,
49 0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7,
50 0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df,
51 0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7,
52 0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f,
53 0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7,
54 0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9,
55 },
56 
57 /* latin3: ISO-8859-3 (has unmapped byte values, marked -1) */
58 {
59 0x00a0, 0x0126, 0x02d8, 0x00a3, 0x00a4, -00001, 0x0124, 0x00a7,
60 0x00a8, 0x0130, 0x015e, 0x011e, 0x0134, 0x00ad, -00001, 0x017b,
61 0x00b0, 0x0127, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x0125, 0x00b7,
62 0x00b8, 0x0131, 0x015f, 0x011f, 0x0135, 0x00bd, -00001, 0x017c,
63 0x00c0, 0x00c1, 0x00c2, -00001, 0x00c4, 0x010a, 0x0108, 0x00c7,
64 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
65 -00001, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x0120, 0x00d6, 0x00d7,
66 0x011c, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x016c, 0x015c, 0x00df,
67 0x00e0, 0x00e1, 0x00e2, -00001, 0x00e4, 0x010b, 0x0109, 0x00e7,
68 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
69 -00001, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x0121, 0x00f6, 0x00f7,
70 0x011d, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x016d, 0x015d, 0x02d9,
71 },
72 
73 /* latin4: ISO-8859-4 */
74 {
75 0x00a0, 0x0104, 0x0138, 0x0156, 0x00a4, 0x0128, 0x013b, 0x00a7,
76 0x00a8, 0x0160, 0x0112, 0x0122, 0x0166, 0x00ad, 0x017d, 0x00af,
77 0x00b0, 0x0105, 0x02db, 0x0157, 0x00b4, 0x0129, 0x013c, 0x02c7,
78 0x00b8, 0x0161, 0x0113, 0x0123, 0x0167, 0x014a, 0x017e, 0x014b,
79 0x0100, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x012e,
80 0x010c, 0x00c9, 0x0118, 0x00cb, 0x0116, 0x00cd, 0x00ce, 0x012a,
81 0x0110, 0x0145, 0x014c, 0x0136, 0x00d4, 0x00d5, 0x00d6, 0x00d7,
82 0x00d8, 0x0172, 0x00da, 0x00db, 0x00dc, 0x0168, 0x016a, 0x00df,
83 0x0101, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x012f,
84 0x010d, 0x00e9, 0x0119, 0x00eb, 0x0117, 0x00ed, 0x00ee, 0x012b,
85 0x0111, 0x0146, 0x014d, 0x0137, 0x00f4, 0x00f5, 0x00f6, 0x00f7,
86 0x00f8, 0x0173, 0x00fa, 0x00fb, 0x00fc, 0x0169, 0x016b, 0x02d9,
87 },
88 
89 /* latin5: ISO-8859-5 (Cyrillic code points) */
90 {
91 0x00a0, 0x0401, 0x0402, 0x0403, 0x0404, 0x0405, 0x0406, 0x0407,
92 0x0408, 0x0409, 0x040a, 0x040b, 0x040c, 0x00ad, 0x040e, 0x040f,
93 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417,
94 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f,
95 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427,
96 0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f,
97 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437,
98 0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, 0x043f,
99 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447,
100 0x0448, 0x0449, 0x044a, 0x044b, 0x044c, 0x044d, 0x044e, 0x044f,
101 0x2116, 0x0451, 0x0452, 0x0453, 0x0454, 0x0455, 0x0456, 0x0457,
102 0x0458, 0x0459, 0x045a, 0x045b, 0x045c, 0x00a7, 0x045e, 0x045f,
103 },
104 
105 /* latin6: ISO-8859-6 (Arabic code points; many byte values unmapped) */
106 {
107 0x00a0, -00001, -00001, -00001, 0x00a4, -00001, -00001, -00001,
108 -00001, -00001, -00001, -00001, 0x060c, 0x00ad, -00001, -00001,
109 -00001, -00001, -00001, -00001, -00001, -00001, -00001, -00001,
110 -00001, -00001, -00001, 0x061b, -00001, -00001, -00001, 0x061f,
111 -00001, 0x0621, 0x0622, 0x0623, 0x0624, 0x0625, 0x0626, 0x0627,
112 0x0628, 0x0629, 0x062a, 0x062b, 0x062c, 0x062d, 0x062e, 0x062f,
113 0x0630, 0x0631, 0x0632, 0x0633, 0x0634, 0x0635, 0x0636, 0x0637,
114 0x0638, 0x0639, 0x063a, -00001, -00001, -00001, -00001, -00001,
115 0x0640, 0x0641, 0x0642, 0x0643, 0x0644, 0x0645, 0x0646, 0x0647,
116 0x0648, 0x0649, 0x064a, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f,
117 0x0650, 0x0651, 0x0652, -00001, -00001, -00001, -00001, -00001,
118 -00001, -00001, -00001, -00001, -00001, -00001, -00001, -00001,
119 },
120 
121 /* latin7: ISO-8859-7 (Greek code points) */
122 {
123 0x00a0, 0x02bd, 0x02bc, 0x00a3, -00001, -00001, 0x00a6, 0x00a7,
124 0x00a8, 0x00a9, -00001, 0x00ab, 0x00ac, 0x00ad, -00001, 0x2015,
125 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x0384, 0x0385, 0x0386, 0x00b7,
126 0x0388, 0x0389, 0x038a, 0x00bb, 0x038c, 0x00bd, 0x038e, 0x038f,
127 0x0390, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397,
128 0x0398, 0x0399, 0x039a, 0x039b, 0x039c, 0x039d, 0x039e, 0x039f,
129 0x03a0, 0x03a1, -00001, 0x03a3, 0x03a4, 0x03a5, 0x03a6, 0x03a7,
130 0x03a8, 0x03a9, 0x03aa, 0x03ab, 0x03ac, 0x03ad, 0x03ae, 0x03af,
131 0x03b0, 0x03b1, 0x03b2, 0x03b3, 0x03b4, 0x03b5, 0x03b6, 0x03b7,
132 0x03b8, 0x03b9, 0x03ba, 0x03bb, 0x03bc, 0x03bd, 0x03be, 0x03bf,
133 0x03c0, 0x03c1, 0x03c2, 0x03c3, 0x03c4, 0x03c5, 0x03c6, 0x03c7,
134 0x03c8, 0x03c9, 0x03ca, 0x03cb, 0x03cc, 0x03cd, 0x03ce, -00001,
135 },
136 
137 /* latin8: ISO-8859-8 (Hebrew code points) */
138 {
139 0x00a0, -00001, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
140 0x00a8, 0x00a9, 0x00d7, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x203e,
141 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
142 0x00b8, 0x00b9, 0x00f7, 0x00bb, 0x00bc, 0x00bd, 0x00be, -00001,
143 -00001, -00001, -00001, -00001, -00001, -00001, -00001, -00001,
144 -00001, -00001, -00001, -00001, -00001, -00001, -00001, -00001,
145 -00001, -00001, -00001, -00001, -00001, -00001, -00001, -00001,
146 -00001, -00001, -00001, -00001, -00001, -00001, -00001, 0x2017,
147 0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x05d5, 0x05d6, 0x05d7,
148 0x05d8, 0x05d9, 0x05da, 0x05db, 0x05dc, 0x05dd, 0x05de, 0x05df,
149 0x05e0, 0x05e1, 0x05e2, 0x05e3, 0x05e4, 0x05e5, 0x05e6, 0x05e7,
150 0x05e8, 0x05e9, 0x05ea, -00001, -00001, -00001, -00001, -00001,
151 },
152 
153 /* latin9: ISO-8859-9 (differs from Latin-1 only at 0xd0, 0xdd, 0xde, 0xf0, 0xfd, 0xfe) */
154 {
155 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
156 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af,
157 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
158 0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,
159 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,
160 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
161 0x011e, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,
162 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x0130, 0x015e, 0x00df,
163 0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7,
164 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
165 0x011f, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7,
166 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x0131, 0x015f, 0x00ff,
167 }
168 };
169 
/*
 * Name of each encoding without byte-order information.  The array has
 * CE_enum_count slots and FindEncoding() casts a matching index directly
 * to CharacterEncoding, so the entry order must follow the enum
 * declaration (presumably in charset.h -- not visible here).  The two
 * "UTF-16" and two "ISO-10646-UCS-2" entries sit in the positions where
 * CharacterEncodingNameAndByteOrder has the -B and -L variants.
 */
170 const char8 *CharacterEncodingName[CE_enum_count] = {
171  "unknown",
172  "unspecified-ascii-superset",
173 
174  "UTF-8",
175  "ISO-646",
176 
177  "ISO-8859-1",
178  "ISO-8859-2",
179  "ISO-8859-3",
180  "ISO-8859-4",
181  "ISO-8859-5",
182  "ISO-8859-6",
183  "ISO-8859-7",
184  "ISO-8859-8",
185  "ISO-8859-9",
186 
187  "UTF-16",
188  "UTF-16",
189  "ISO-10646-UCS-2",
190  "ISO-10646-UCS-2",
191 };
192 
/*
 * Name of each encoding with an explicit byte-order suffix (-B / -L) on
 * the 16-bit encodings.  Same size as CharacterEncodingName and searched
 * before it by FindEncoding(), which casts the matching index to
 * CharacterEncoding, so the order must follow the enum declaration.
 * NOTE(review): the second entry is spelled with underscores here but with
 * hyphens in CharacterEncodingName; FindEncoding() accepts either spelling
 * because it searches both tables -- confirm whether the difference is
 * intentional before normalising it.
 */
193 const char8 *CharacterEncodingNameAndByteOrder[CE_enum_count] = {
194  "unknown",
195  "unspecified_ascii_superset",
196 
197  "UTF-8",
198  "ISO-646",
199 
200  "ISO-8859-1",
201  "ISO-8859-2",
202  "ISO-8859-3",
203  "ISO-8859-4",
204  "ISO-8859-5",
205  "ISO-8859-6",
206  "ISO-8859-7",
207  "ISO-8859-8",
208  "ISO-8859-9",
209 
210  "UTF-16-B",
211  "UTF-16-L",
212  "ISO-10646-UCS-2-B",
213  "ISO-10646-UCS-2-L",
214 };
215 
/*
 * Additional names accepted by FindEncoding() after both name tables have
 * been searched without a match.  Each entry pairs an alias with the
 * encoding it selects.
 * NOTE(review): "ISO-Latin-5" .. "ISO-Latin-8" are mapped to ISO-8859-5 ..
 * ISO-8859-8 (the Cyrillic, Arabic, Greek and Hebrew rows of latin_table).
 * In common usage Latin-5 names ISO-8859-9 and Latin-6/7/8 name ISO-8859
 * parts that have no table in this file -- confirm the intended mapping
 * before relying on these aliases.
 */
216 struct character_encoding_alias CharacterEncodingAlias[] = {
217  {"ASCII", CE_ISO_646},
218  {"ISO-Latin-1", CE_ISO_8859_1},
219  {"ISO-Latin-2", CE_ISO_8859_2},
220  {"ISO-Latin-3", CE_ISO_8859_3},
221  {"ISO-Latin-4", CE_ISO_8859_4},
222  {"ISO-Latin-5", CE_ISO_8859_5},
223  {"ISO-Latin-6", CE_ISO_8859_6},
224  {"ISO-Latin-7", CE_ISO_8859_7},
225  {"ISO-Latin-8", CE_ISO_8859_8},
226  {"UCS-2", CE_ISO_10646_UCS_2B}, /* a bare "UCS-2" selects the big-endian variant */
227 };
/* Number of entries in CharacterEncodingAlias, computed from the array itself */
228 const int CE_alias_count =
229  sizeof(CharacterEncodingAlias)/sizeof(CharacterEncodingAlias[0]);
230 
/*
 * Encoding of characters as held in memory.  Set by init_charset():
 * CE_unspecified_ascii_superset when CHAR_SIZE is 8, otherwise the UTF-16
 * variant matching the host byte order.
 */
231 CharacterEncoding InternalCharacterEncoding;
232 
233 void init_charset(void)
234 {
235  int i, j;
236  union {char b[2]; short s;} bytes;
237 
238  /* Determine internal encoding */
239 
240  bytes.s = 1;
241 
242 #if CHAR_SIZE == 8
243  InternalCharacterEncoding = CE_unspecified_ascii_superset;
244 #else
245  InternalCharacterEncoding = (bytes.b[0] == 0) ? CE_UTF_16B : CE_UTF_16L;
246 #endif
247 
248  /* Make ISO-Latin-N tables */
249 
250  for(i=0; i<8; i++)
251  {
252  int max = 0x9f;
253 
254  for(j=0; j<0xa0; j++)
255  iso_to_unicode[i][j] = j;
256  for(j=0xa0; j<0x100; j++)
257  {
258  int code = latin_table[i][j-0xa0];
259  iso_to_unicode[i][j] = code;
260  if(code > max) max = code;
261  }
262 
263  iso_max_val[i] = max;
264 
265  if(!(unicode_to_iso[i] = Malloc(max+1)))
266  {
267  fprintf(stderr, "Malloc failed in charset initialisation\n");
268  exit(1);
269  }
270 
271  for(j=0; j<0xa0; j++)
272  unicode_to_iso[i][j] = j;
273  for(j=0xa0; j<=max; j++)
274  unicode_to_iso[i][j] = '?';
275  for(j=0xa0; j<0x100; j++)
276  {
277  int code = latin_table[i][j-0xa0];
278  if(code != -1)
279  unicode_to_iso[i][code] = j;
280  }
281  }
282 }
283 
284 /* Return true if the encoding has 8-bit input units and is the same
285  as ascii for characters <= 127 */
286 
287 int EncodingIsAsciiSuperset(CharacterEncoding enc)
288 {
289  return enc >= CE_unspecified_ascii_superset && enc <= CE_ISO_8859_9;
290 }
291 
292 /*
293  * Return true if enc1 and enc2 have the same size input units, and are
294  * the same for Unicode <= 127.
295  * If so, *enc3 is set to enc2 modified to have the same byte order as enc1.
296  */
297 
298 int EncodingsCompatible(CharacterEncoding enc1, CharacterEncoding enc2,
299  CharacterEncoding *enc3)
300 {
301  if(EncodingIsAsciiSuperset(enc1))
302  {
303  if(EncodingIsAsciiSuperset(enc2))
304  {
305  *enc3 = enc2;
306  return 1;
307  }
308  return 0;
309  }
310 
311  if(enc1 == CE_UTF_16B || enc1 == CE_ISO_10646_UCS_2B)
312  {
313  if(enc2 == CE_UTF_16B || enc2 == CE_UTF_16L)
314  *enc3 = CE_UTF_16B;
315  else if(enc2 == CE_ISO_10646_UCS_2B || enc2 == CE_ISO_10646_UCS_2L)
316  *enc3 = CE_ISO_10646_UCS_2B;
317  else
318  return 0;
319  return 1;
320  }
321 
322  if(enc1 == CE_UTF_16L || enc1 == CE_ISO_10646_UCS_2L)
323  {
324  if(enc2 == CE_UTF_16B || enc2 == CE_UTF_16L)
325  *enc3 = CE_UTF_16L;
326  else if(enc2 == CE_ISO_10646_UCS_2B || enc2 == CE_ISO_10646_UCS_2L)
327  *enc3 = CE_ISO_10646_UCS_2L;
328  else
329  return 0;
330  return 1;
331  }
332 
333  return 0;
334 }
335 
336 CharacterEncoding FindEncoding(char8 *name)
337 {
338  int i;
339 
340  for(i=0; i<CE_enum_count; i++)
341  if(strcasecmp8(name, CharacterEncodingNameAndByteOrder[i]) == 0)
342  return (CharacterEncoding)i;
343 
344  for(i=0; i<CE_enum_count; i++)
345  if(strcasecmp8(name, CharacterEncodingName[i]) == 0)
346  return (CharacterEncoding)i;
347 
348  for(i=0; i<CE_alias_count; i++)
349  if(strcasecmp8(name, CharacterEncodingAlias[i].name) == 0)
350  return CharacterEncodingAlias[i].enc;
351 
352  return CE_unknown;
353 }
354