Mercurial > hg
annotate mcabber/libjabber/xmltok.c @ 1197:6f602d3270a4
Add /pgp [-]force
With this command it becomes possible to enforce PGP encryption without
checking if the remote client has PGP support. It can be used to send
encrypted offline messages too.
author | Mikael Berthe <mikael@lilotux.net> |
---|---|
date | Fri, 27 Apr 2007 00:37:57 +0200 |
parents | 0aa9015f06df |
children |
rev | line source |
---|---|
25 | 1 /* |
2 The contents of this file are subject to the Mozilla Public License | |
3 Version 1.1 (the "License"); you may not use this file except in | |
4 compliance with the License. You may obtain a copy of the License at | |
5 http://www.mozilla.org/MPL/ | |
6 | |
7 Software distributed under the License is distributed on an "AS IS" | |
8 basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the | |
9 License for the specific language governing rights and limitations | |
10 under the License. | |
11 | |
12 The Original Code is expat. | |
13 | |
14 The Initial Developer of the Original Code is James Clark. | |
15 Portions created by James Clark are Copyright (C) 1998, 1999 | |
16 James Clark. All Rights Reserved. | |
17 | |
18 Contributor(s): | |
19 | |
20 Alternatively, the contents of this file may be used under the terms | |
21 of the GNU General Public License (the "GPL"), in which case the | |
22 provisions of the GPL are applicable instead of those above. If you | |
23 wish to allow use of your version of this file only under the terms of | |
24 the GPL and not to allow others to use your version of this file under | |
25 the MPL, indicate your decision by deleting the provisions above and | |
26 replace them with the notice and other provisions required by the | |
27 GPL. If you do not delete the provisions above, a recipient may use | |
28 your version of this file under either the MPL or the GPL. | |
29 */ | |
30 | |
31 #include "xmldef.h" | |
32 #include "xmltok.h" | |
33 #include "nametab.h" | |
34 | |
35 #define VTABLE1 \ | |
36 { PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok) }, \ | |
37 { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \ | |
38 PREFIX(sameName), \ | |
39 PREFIX(nameMatchesAscii), \ | |
40 PREFIX(nameLength), \ | |
41 PREFIX(skipS), \ | |
42 PREFIX(getAtts), \ | |
43 PREFIX(charRefNumber), \ | |
44 PREFIX(predefinedEntityName), \ | |
45 PREFIX(updatePosition), \ | |
46 PREFIX(isPublicId) | |
47 | |
48 #define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16) | |
49 | |
50 #define UCS2_GET_NAMING(pages, hi, lo) \ | |
51 (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1 << ((lo) & 0x1F))) | |
52 | |
53 /* A 2 byte UTF-8 representation splits the characters 11 bits | |
54 between the bottom 5 and 6 bits of the bytes. | |
55 We need 8 bits to index into pages, 3 bits to add to that index and | |
56 5 bits to generate the mask. */ | |
57 #define UTF8_GET_NAMING2(pages, byte) \ | |
58 (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \ | |
59 + ((((byte)[0]) & 3) << 1) \ | |
60 + ((((byte)[1]) >> 5) & 1)] \ | |
61 & (1 << (((byte)[1]) & 0x1F))) | |
62 | |
63 /* A 3 byte UTF-8 representation splits the characters 16 bits | |
64 between the bottom 4, 6 and 6 bits of the bytes. | |
65 We need 8 bits to index into pages, 3 bits to add to that index and | |
66 5 bits to generate the mask. */ | |
67 #define UTF8_GET_NAMING3(pages, byte) \ | |
68 (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \ | |
69 + ((((byte)[1]) >> 2) & 0xF)] \ | |
70 << 3) \ | |
71 + ((((byte)[1]) & 3) << 1) \ | |
72 + ((((byte)[2]) >> 5) & 1)] \ | |
73 & (1 << (((byte)[2]) & 0x1F))) | |
74 | |
75 #define UTF8_GET_NAMING(pages, p, n) \ | |
76 ((n) == 2 \ | |
77 ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \ | |
78 : ((n) == 3 \ | |
79 ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \ | |
80 : 0)) | |
81 | |
82 #define UTF8_INVALID3(p) \ | |
83 ((*p) == 0xED \ | |
84 ? (((p)[1] & 0x20) != 0) \ | |
85 : ((*p) == 0xEF \ | |
86 ? ((p)[1] == 0xBF && ((p)[2] == 0xBF || (p)[2] == 0xBE)) \ | |
87 : 0)) | |
88 | |
89 #define UTF8_INVALID4(p) ((*p) == 0xF4 && ((p)[1] & 0x30) != 0) | |
90 | |
91 static | |
92 int isNever(const ENCODING *enc, const char *p) | |
93 { | |
94 return 0; | |
95 } | |
96 | |
97 static | |
98 int utf8_isName2(const ENCODING *enc, const char *p) | |
99 { | |
100 return UTF8_GET_NAMING2(namePages, (const unsigned char *)p); | |
101 } | |
102 | |
103 static | |
104 int utf8_isName3(const ENCODING *enc, const char *p) | |
105 { | |
106 return UTF8_GET_NAMING3(namePages, (const unsigned char *)p); | |
107 } | |
108 | |
109 #define utf8_isName4 isNever | |
110 | |
111 static | |
112 int utf8_isNmstrt2(const ENCODING *enc, const char *p) | |
113 { | |
114 return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p); | |
115 } | |
116 | |
117 static | |
118 int utf8_isNmstrt3(const ENCODING *enc, const char *p) | |
119 { | |
120 return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p); | |
121 } | |
122 | |
123 #define utf8_isNmstrt4 isNever | |
124 | |
125 #define utf8_isInvalid2 isNever | |
126 | |
127 static | |
128 int utf8_isInvalid3(const ENCODING *enc, const char *p) | |
129 { | |
130 return UTF8_INVALID3((const unsigned char *)p); | |
131 } | |
132 | |
133 static | |
134 int utf8_isInvalid4(const ENCODING *enc, const char *p) | |
135 { | |
136 return UTF8_INVALID4((const unsigned char *)p); | |
137 } | |
138 | |
139 struct normal_encoding { | |
140 ENCODING enc; | |
141 unsigned char type[256]; | |
142 #ifdef XML_MIN_SIZE | |
143 int (*byteType)(const ENCODING *, const char *); | |
144 int (*isNameMin)(const ENCODING *, const char *); | |
145 int (*isNmstrtMin)(const ENCODING *, const char *); | |
146 int (*byteToAscii)(const ENCODING *, const char *); | |
147 int (*charMatches)(const ENCODING *, const char *, int); | |
148 #endif /* XML_MIN_SIZE */ | |
149 int (*isName2)(const ENCODING *, const char *); | |
150 int (*isName3)(const ENCODING *, const char *); | |
151 int (*isName4)(const ENCODING *, const char *); | |
152 int (*isNmstrt2)(const ENCODING *, const char *); | |
153 int (*isNmstrt3)(const ENCODING *, const char *); | |
154 int (*isNmstrt4)(const ENCODING *, const char *); | |
155 int (*isInvalid2)(const ENCODING *, const char *); | |
156 int (*isInvalid3)(const ENCODING *, const char *); | |
157 int (*isInvalid4)(const ENCODING *, const char *); | |
158 }; | |
159 | |
160 #ifdef XML_MIN_SIZE | |
161 | |
162 #define STANDARD_VTABLE(E) \ | |
163 E ## byteType, \ | |
164 E ## isNameMin, \ | |
165 E ## isNmstrtMin, \ | |
166 E ## byteToAscii, \ | |
167 E ## charMatches, | |
168 | |
169 #else | |
170 | |
171 #define STANDARD_VTABLE(E) /* as nothing */ | |
172 | |
173 #endif | |
174 | |
175 #define NORMAL_VTABLE(E) \ | |
176 E ## isName2, \ | |
177 E ## isName3, \ | |
178 E ## isName4, \ | |
179 E ## isNmstrt2, \ | |
180 E ## isNmstrt3, \ | |
181 E ## isNmstrt4, \ | |
182 E ## isInvalid2, \ | |
183 E ## isInvalid3, \ | |
184 E ## isInvalid4 | |
185 | |
186 static int checkCharRefNumber(int); | |
187 | |
188 #include "xmltok_impl.h" | |
189 | |
190 #ifdef XML_MIN_SIZE | |
191 #define sb_isNameMin isNever | |
192 #define sb_isNmstrtMin isNever | |
193 #endif | |
194 | |
195 #ifdef XML_MIN_SIZE | |
196 #define MINBPC(enc) ((enc)->minBytesPerChar) | |
197 #else | |
198 /* minimum bytes per character */ | |
199 #define MINBPC(enc) 1 | |
200 #endif | |
201 | |
202 #define SB_BYTE_TYPE(enc, p) \ | |
203 (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)]) | |
204 | |
205 #ifdef XML_MIN_SIZE | |
206 static | |
207 int sb_byteType(const ENCODING *enc, const char *p) | |
208 { | |
209 return SB_BYTE_TYPE(enc, p); | |
210 } | |
211 #define BYTE_TYPE(enc, p) \ | |
212 (((const struct normal_encoding *)(enc))->byteType(enc, p)) | |
213 #else | |
214 #define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p) | |
215 #endif | |
216 | |
217 #ifdef XML_MIN_SIZE | |
218 #define BYTE_TO_ASCII(enc, p) \ | |
219 (((const struct normal_encoding *)(enc))->byteToAscii(enc, p)) | |
220 static | |
221 int sb_byteToAscii(const ENCODING *enc, const char *p) | |
222 { | |
223 return *p; | |
224 } | |
225 #else | |
226 #define BYTE_TO_ASCII(enc, p) (*p) | |
227 #endif | |
228 | |
229 #define IS_NAME_CHAR(enc, p, n) \ | |
230 (((const struct normal_encoding *)(enc))->isName ## n(enc, p)) | |
231 #define IS_NMSTRT_CHAR(enc, p, n) \ | |
232 (((const struct normal_encoding *)(enc))->isNmstrt ## n(enc, p)) | |
233 #define IS_INVALID_CHAR(enc, p, n) \ | |
234 (((const struct normal_encoding *)(enc))->isInvalid ## n(enc, p)) | |
235 | |
236 #ifdef XML_MIN_SIZE | |
237 #define IS_NAME_CHAR_MINBPC(enc, p) \ | |
238 (((const struct normal_encoding *)(enc))->isNameMin(enc, p)) | |
239 #define IS_NMSTRT_CHAR_MINBPC(enc, p) \ | |
240 (((const struct normal_encoding *)(enc))->isNmstrtMin(enc, p)) | |
241 #else | |
242 #define IS_NAME_CHAR_MINBPC(enc, p) (0) | |
243 #define IS_NMSTRT_CHAR_MINBPC(enc, p) (0) | |
244 #endif | |
245 | |
246 #ifdef XML_MIN_SIZE | |
247 #define CHAR_MATCHES(enc, p, c) \ | |
248 (((const struct normal_encoding *)(enc))->charMatches(enc, p, c)) | |
249 static | |
250 int sb_charMatches(const ENCODING *enc, const char *p, int c) | |
251 { | |
252 return *p == c; | |
253 } | |
254 #else | |
255 /* c is an ASCII character */ | |
256 #define CHAR_MATCHES(enc, p, c) (*(p) == c) | |
257 #endif | |
258 | |
259 #define PREFIX(ident) normal_ ## ident | |
260 #include "xmltok_impl_c.h" | |
261 | |
262 #undef MINBPC | |
263 #undef BYTE_TYPE | |
264 #undef BYTE_TO_ASCII | |
265 #undef CHAR_MATCHES | |
266 #undef IS_NAME_CHAR | |
267 #undef IS_NAME_CHAR_MINBPC | |
268 #undef IS_NMSTRT_CHAR | |
269 #undef IS_NMSTRT_CHAR_MINBPC | |
270 #undef IS_INVALID_CHAR | |
271 | |
272 enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */ | |
273 UTF8_cval1 = 0x00, | |
274 UTF8_cval2 = 0xc0, | |
275 UTF8_cval3 = 0xe0, | |
276 UTF8_cval4 = 0xf0 | |
277 }; | |
278 | |
279 static | |
280 void utf8_toUtf8(const ENCODING *enc, | |
281 const char **fromP, const char *fromLim, | |
282 char **toP, const char *toLim) | |
283 { | |
284 char *to; | |
285 const char *from; | |
286 if (fromLim - *fromP > toLim - *toP) { | |
287 /* Avoid copying partial characters. */ | |
288 for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--) | |
289 if (((unsigned char)fromLim[-1] & 0xc0) != 0x80) | |
290 break; | |
291 } | |
292 for (to = *toP, from = *fromP; from != fromLim; from++, to++) | |
293 *to = *from; | |
294 *fromP = from; | |
295 *toP = to; | |
296 } | |
297 | |
298 static | |
299 void utf8_toUtf16(const ENCODING *enc, | |
300 const char **fromP, const char *fromLim, | |
301 unsigned short **toP, const unsigned short *toLim) | |
302 { | |
303 unsigned short *to = *toP; | |
304 const char *from = *fromP; | |
305 while (from != fromLim && to != toLim) { | |
306 switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) { | |
307 case BT_LEAD2: | |
308 *to++ = ((from[0] & 0x1f) << 6) | (from[1] & 0x3f); | |
309 from += 2; | |
310 break; | |
311 case BT_LEAD3: | |
312 *to++ = ((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f); | |
313 from += 3; | |
314 break; | |
315 case BT_LEAD4: | |
316 { | |
317 unsigned long n; | |
318 if (to + 1 == toLim) | |
319 break; | |
320 n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12) | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f); | |
321 n -= 0x10000; | |
322 to[0] = (unsigned short)((n >> 10) | 0xD800); | |
323 to[1] = (unsigned short)((n & 0x3FF) | 0xDC00); | |
324 to += 2; | |
325 from += 4; | |
326 } | |
327 break; | |
328 default: | |
329 *to++ = *from++; | |
330 break; | |
331 } | |
332 } | |
333 *fromP = from; | |
334 *toP = to; | |
335 } | |
336 | |
337 #ifdef XML_NS | |
338 static const struct normal_encoding utf8_encoding_ns = { | |
339 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, | |
340 { | |
341 #include "asciitab.h" | |
342 #include "utf8tab.h" | |
343 }, | |
344 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_) | |
345 }; | |
346 #endif | |
347 | |
348 static const struct normal_encoding utf8_encoding = { | |
349 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, | |
350 { | |
351 #define BT_COLON BT_NMSTRT | |
352 #include "asciitab.h" | |
353 #undef BT_COLON | |
354 #include "utf8tab.h" | |
355 }, | |
356 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_) | |
357 }; | |
358 | |
359 #ifdef XML_NS | |
360 | |
361 static const struct normal_encoding internal_utf8_encoding_ns = { | |
362 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, | |
363 { | |
364 #include "iasciitab.h" | |
365 #include "utf8tab.h" | |
366 }, | |
367 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_) | |
368 }; | |
369 | |
370 #endif | |
371 | |
372 static const struct normal_encoding internal_utf8_encoding = { | |
373 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, | |
374 { | |
375 #define BT_COLON BT_NMSTRT | |
376 #include "iasciitab.h" | |
377 #undef BT_COLON | |
378 #include "utf8tab.h" | |
379 }, | |
380 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_) | |
381 }; | |
382 | |
383 static | |
384 void latin1_toUtf8(const ENCODING *enc, | |
385 const char **fromP, const char *fromLim, | |
386 char **toP, const char *toLim) | |
387 { | |
388 for (;;) { | |
389 unsigned char c; | |
390 if (*fromP == fromLim) | |
391 break; | |
392 c = (unsigned char)**fromP; | |
393 if (c & 0x80) { | |
394 if (toLim - *toP < 2) | |
395 break; | |
396 *(*toP)++ = ((c >> 6) | UTF8_cval2); | |
397 *(*toP)++ = ((c & 0x3f) | 0x80); | |
398 (*fromP)++; | |
399 } | |
400 else { | |
401 if (*toP == toLim) | |
402 break; | |
403 *(*toP)++ = *(*fromP)++; | |
404 } | |
405 } | |
406 } | |
407 | |
408 static | |
409 void latin1_toUtf16(const ENCODING *enc, | |
410 const char **fromP, const char *fromLim, | |
411 unsigned short **toP, const unsigned short *toLim) | |
412 { | |
413 while (*fromP != fromLim && *toP != toLim) | |
414 *(*toP)++ = (unsigned char)*(*fromP)++; | |
415 } | |
416 | |
417 #ifdef XML_NS | |
418 | |
419 static const struct normal_encoding latin1_encoding_ns = { | |
420 { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 }, | |
421 { | |
422 #include "asciitab.h" | |
423 #include "latin1tab.h" | |
424 }, | |
425 STANDARD_VTABLE(sb_) | |
426 }; | |
427 | |
428 #endif | |
429 | |
430 static const struct normal_encoding latin1_encoding = { | |
431 { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 }, | |
432 { | |
433 #define BT_COLON BT_NMSTRT | |
434 #include "asciitab.h" | |
435 #undef BT_COLON | |
436 #include "latin1tab.h" | |
437 }, | |
438 STANDARD_VTABLE(sb_) | |
439 }; | |
440 | |
441 static | |
442 void ascii_toUtf8(const ENCODING *enc, | |
443 const char **fromP, const char *fromLim, | |
444 char **toP, const char *toLim) | |
445 { | |
446 while (*fromP != fromLim && *toP != toLim) | |
447 *(*toP)++ = *(*fromP)++; | |
448 } | |
449 | |
450 #ifdef XML_NS | |
451 | |
452 static const struct normal_encoding ascii_encoding_ns = { | |
453 { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 }, | |
454 { | |
455 #include "asciitab.h" | |
456 /* BT_NONXML == 0 */ | |
457 }, | |
458 STANDARD_VTABLE(sb_) | |
459 }; | |
460 | |
461 #endif | |
462 | |
463 static const struct normal_encoding ascii_encoding = { | |
464 { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 }, | |
465 { | |
466 #define BT_COLON BT_NMSTRT | |
467 #include "asciitab.h" | |
468 #undef BT_COLON | |
469 /* BT_NONXML == 0 */ | |
470 }, | |
471 STANDARD_VTABLE(sb_) | |
472 }; | |
473 | |
474 static int unicode_byte_type(char hi, char lo) | |
475 { | |
476 switch ((unsigned char)hi) { | |
477 case 0xD8: case 0xD9: case 0xDA: case 0xDB: | |
478 return BT_LEAD4; | |
479 case 0xDC: case 0xDD: case 0xDE: case 0xDF: | |
480 return BT_TRAIL; | |
481 case 0xFF: | |
482 switch ((unsigned char)lo) { | |
483 case 0xFF: | |
484 case 0xFE: | |
485 return BT_NONXML; | |
486 } | |
487 break; | |
488 } | |
489 return BT_NONASCII; | |
490 } | |
491 | |
492 #define DEFINE_UTF16_TO_UTF8(E) \ | |
493 static \ | |
494 void E ## toUtf8(const ENCODING *enc, \ | |
495 const char **fromP, const char *fromLim, \ | |
496 char **toP, const char *toLim) \ | |
497 { \ | |
498 const char *from; \ | |
499 for (from = *fromP; from != fromLim; from += 2) { \ | |
500 int plane; \ | |
501 unsigned char lo2; \ | |
502 unsigned char lo = GET_LO(from); \ | |
503 unsigned char hi = GET_HI(from); \ | |
504 switch (hi) { \ | |
505 case 0: \ | |
506 if (lo < 0x80) { \ | |
507 if (*toP == toLim) { \ | |
508 *fromP = from; \ | |
509 return; \ | |
510 } \ | |
511 *(*toP)++ = lo; \ | |
512 break; \ | |
513 } \ | |
514 /* fall through */ \ | |
515 case 0x1: case 0x2: case 0x3: \ | |
516 case 0x4: case 0x5: case 0x6: case 0x7: \ | |
517 if (toLim - *toP < 2) { \ | |
518 *fromP = from; \ | |
519 return; \ | |
520 } \ | |
521 *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \ | |
522 *(*toP)++ = ((lo & 0x3f) | 0x80); \ | |
523 break; \ | |
524 default: \ | |
525 if (toLim - *toP < 3) { \ | |
526 *fromP = from; \ | |
527 return; \ | |
528 } \ | |
529 /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \ | |
530 *(*toP)++ = ((hi >> 4) | UTF8_cval3); \ | |
531 *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \ | |
532 *(*toP)++ = ((lo & 0x3f) | 0x80); \ | |
533 break; \ | |
534 case 0xD8: case 0xD9: case 0xDA: case 0xDB: \ | |
535 if (toLim - *toP < 4) { \ | |
536 *fromP = from; \ | |
537 return; \ | |
538 } \ | |
539 plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \ | |
540 *(*toP)++ = ((plane >> 2) | UTF8_cval4); \ | |
541 *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \ | |
542 from += 2; \ | |
543 lo2 = GET_LO(from); \ | |
544 *(*toP)++ = (((lo & 0x3) << 4) \ | |
545 | ((GET_HI(from) & 0x3) << 2) \ | |
546 | (lo2 >> 6) \ | |
547 | 0x80); \ | |
548 *(*toP)++ = ((lo2 & 0x3f) | 0x80); \ | |
549 break; \ | |
550 } \ | |
551 } \ | |
552 *fromP = from; \ | |
553 } | |
554 | |
555 #define DEFINE_UTF16_TO_UTF16(E) \ | |
556 static \ | |
557 void E ## toUtf16(const ENCODING *enc, \ | |
558 const char **fromP, const char *fromLim, \ | |
559 unsigned short **toP, const unsigned short *toLim) \ | |
560 { \ | |
561 /* Avoid copying first half only of surrogate */ \ | |
562 if (fromLim - *fromP > ((toLim - *toP) << 1) \ | |
563 && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \ | |
564 fromLim -= 2; \ | |
565 for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \ | |
566 *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \ | |
567 } | |
568 | |
569 #define SET2(ptr, ch) \ | |
570 (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8))) | |
571 #define GET_LO(ptr) ((unsigned char)(ptr)[0]) | |
572 #define GET_HI(ptr) ((unsigned char)(ptr)[1]) | |
573 | |
574 DEFINE_UTF16_TO_UTF8(little2_) | |
575 DEFINE_UTF16_TO_UTF16(little2_) | |
576 | |
577 #undef SET2 | |
578 #undef GET_LO | |
579 #undef GET_HI | |
580 | |
581 #define SET2(ptr, ch) \ | |
582 (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF))) | |
583 #define GET_LO(ptr) ((unsigned char)(ptr)[1]) | |
584 #define GET_HI(ptr) ((unsigned char)(ptr)[0]) | |
585 | |
586 DEFINE_UTF16_TO_UTF8(big2_) | |
587 DEFINE_UTF16_TO_UTF16(big2_) | |
588 | |
589 #undef SET2 | |
590 #undef GET_LO | |
591 #undef GET_HI | |
592 | |
593 #define LITTLE2_BYTE_TYPE(enc, p) \ | |
594 ((p)[1] == 0 \ | |
595 ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \ | |
596 : unicode_byte_type((p)[1], (p)[0])) | |
597 #define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1) | |
598 #define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c) | |
599 #define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \ | |
600 UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0]) | |
601 #define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \ | |
602 UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0]) | |
603 | |
604 #ifdef XML_MIN_SIZE | |
605 | |
606 static | |
607 int little2_byteType(const ENCODING *enc, const char *p) | |
608 { | |
609 return LITTLE2_BYTE_TYPE(enc, p); | |
610 } | |
611 | |
612 static | |
613 int little2_byteToAscii(const ENCODING *enc, const char *p) | |
614 { | |
615 return LITTLE2_BYTE_TO_ASCII(enc, p); | |
616 } | |
617 | |
618 static | |
619 int little2_charMatches(const ENCODING *enc, const char *p, int c) | |
620 { | |
621 return LITTLE2_CHAR_MATCHES(enc, p, c); | |
622 } | |
623 | |
624 static | |
625 int little2_isNameMin(const ENCODING *enc, const char *p) | |
626 { | |
627 return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p); | |
628 } | |
629 | |
630 static | |
631 int little2_isNmstrtMin(const ENCODING *enc, const char *p) | |
632 { | |
633 return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p); | |
634 } | |
635 | |
636 #undef VTABLE | |
637 #define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16 | |
638 | |
639 #else /* not XML_MIN_SIZE */ | |
640 | |
641 #undef PREFIX | |
642 #define PREFIX(ident) little2_ ## ident | |
643 #define MINBPC(enc) 2 | |
644 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */ | |
645 #define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p) | |
414
ec86d759ed54
Trailing whitespace cleanup
Mikael Berthe <mikael@lilotux.net>
parents:
237
diff
changeset
|
646 #define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p) |
25 | 647 #define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c) |
648 #define IS_NAME_CHAR(enc, p, n) 0 | |
649 #define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) | |
650 #define IS_NMSTRT_CHAR(enc, p, n) (0) | |
651 #define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) | |
652 | |
653 #include "xmltok_impl_c.h" | |
654 | |
655 #undef MINBPC | |
656 #undef BYTE_TYPE | |
657 #undef BYTE_TO_ASCII | |
658 #undef CHAR_MATCHES | |
659 #undef IS_NAME_CHAR | |
660 #undef IS_NAME_CHAR_MINBPC | |
661 #undef IS_NMSTRT_CHAR | |
662 #undef IS_NMSTRT_CHAR_MINBPC | |
663 #undef IS_INVALID_CHAR | |
664 | |
665 #endif /* not XML_MIN_SIZE */ | |
666 | |
667 #ifdef XML_NS | |
668 | |
669 static const struct normal_encoding little2_encoding_ns = { | |
670 { VTABLE, 2, 0, | |
671 #if XML_BYTE_ORDER == 12 | |
672 1 | |
673 #else | |
674 0 | |
675 #endif | |
676 }, | |
677 { | |
678 #include "asciitab.h" | |
679 #include "latin1tab.h" | |
680 }, | |
681 STANDARD_VTABLE(little2_) | |
682 }; | |
683 | |
684 #endif | |
685 | |
686 static const struct normal_encoding little2_encoding = { | |
687 { VTABLE, 2, 0, | |
688 #if XML_BYTE_ORDER == 12 | |
689 1 | |
690 #else | |
691 0 | |
692 #endif | |
693 }, | |
694 { | |
695 #define BT_COLON BT_NMSTRT | |
696 #include "asciitab.h" | |
697 #undef BT_COLON | |
698 #include "latin1tab.h" | |
699 }, | |
700 STANDARD_VTABLE(little2_) | |
701 }; | |
702 | |
703 #if XML_BYTE_ORDER != 21 | |
704 | |
705 #ifdef XML_NS | |
706 | |
707 static const struct normal_encoding internal_little2_encoding_ns = { | |
708 { VTABLE, 2, 0, 1 }, | |
709 { | |
710 #include "iasciitab.h" | |
711 #include "latin1tab.h" | |
712 }, | |
713 STANDARD_VTABLE(little2_) | |
714 }; | |
715 | |
716 #endif | |
717 | |
718 static const struct normal_encoding internal_little2_encoding = { | |
719 { VTABLE, 2, 0, 1 }, | |
720 { | |
721 #define BT_COLON BT_NMSTRT | |
722 #include "iasciitab.h" | |
723 #undef BT_COLON | |
724 #include "latin1tab.h" | |
725 }, | |
726 STANDARD_VTABLE(little2_) | |
727 }; | |
728 | |
729 #endif | |
730 | |
731 | |
732 #define BIG2_BYTE_TYPE(enc, p) \ | |
733 ((p)[0] == 0 \ | |
734 ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \ | |
735 : unicode_byte_type((p)[0], (p)[1])) | |
736 #define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1) | |
737 #define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c) | |
738 #define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \ | |
739 UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1]) | |
740 #define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \ | |
741 UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1]) | |
742 | |
743 #ifdef XML_MIN_SIZE | |
744 | |
745 static | |
746 int big2_byteType(const ENCODING *enc, const char *p) | |
747 { | |
748 return BIG2_BYTE_TYPE(enc, p); | |
749 } | |
750 | |
751 static | |
752 int big2_byteToAscii(const ENCODING *enc, const char *p) | |
753 { | |
754 return BIG2_BYTE_TO_ASCII(enc, p); | |
755 } | |
756 | |
757 static | |
758 int big2_charMatches(const ENCODING *enc, const char *p, int c) | |
759 { | |
760 return BIG2_CHAR_MATCHES(enc, p, c); | |
761 } | |
762 | |
763 static | |
764 int big2_isNameMin(const ENCODING *enc, const char *p) | |
765 { | |
766 return BIG2_IS_NAME_CHAR_MINBPC(enc, p); | |
767 } | |
768 | |
769 static | |
770 int big2_isNmstrtMin(const ENCODING *enc, const char *p) | |
771 { | |
772 return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p); | |
773 } | |
774 | |
775 #undef VTABLE | |
776 #define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16 | |
777 | |
778 #else /* not XML_MIN_SIZE */ | |
779 | |
780 #undef PREFIX | |
781 #define PREFIX(ident) big2_ ## ident | |
782 #define MINBPC(enc) 2 | |
783 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */ | |
784 #define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p) | |
414
ec86d759ed54
Trailing whitespace cleanup
Mikael Berthe <mikael@lilotux.net>
parents:
237
diff
changeset
|
785 #define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p) |
25 | 786 #define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c) |
787 #define IS_NAME_CHAR(enc, p, n) 0 | |
788 #define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p) | |
789 #define IS_NMSTRT_CHAR(enc, p, n) (0) | |
790 #define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) | |
791 | |
792 #include "xmltok_impl_c.h" | |
793 | |
794 #undef MINBPC | |
795 #undef BYTE_TYPE | |
796 #undef BYTE_TO_ASCII | |
797 #undef CHAR_MATCHES | |
798 #undef IS_NAME_CHAR | |
799 #undef IS_NAME_CHAR_MINBPC | |
800 #undef IS_NMSTRT_CHAR | |
801 #undef IS_NMSTRT_CHAR_MINBPC | |
802 #undef IS_INVALID_CHAR | |
803 | |
804 #endif /* not XML_MIN_SIZE */ | |
805 | |
806 #ifdef XML_NS | |
807 | |
808 static const struct normal_encoding big2_encoding_ns = { | |
809 { VTABLE, 2, 0, | |
810 #if XML_BYTE_ORDER == 21 | |
811 1 | |
812 #else | |
813 0 | |
814 #endif | |
815 }, | |
816 { | |
817 #include "asciitab.h" | |
818 #include "latin1tab.h" | |
819 }, | |
820 STANDARD_VTABLE(big2_) | |
821 }; | |
822 | |
823 #endif | |
824 | |
825 static const struct normal_encoding big2_encoding = { | |
826 { VTABLE, 2, 0, | |
827 #if XML_BYTE_ORDER == 21 | |
828 1 | |
829 #else | |
830 0 | |
831 #endif | |
832 }, | |
833 { | |
834 #define BT_COLON BT_NMSTRT | |
835 #include "asciitab.h" | |
836 #undef BT_COLON | |
837 #include "latin1tab.h" | |
838 }, | |
839 STANDARD_VTABLE(big2_) | |
840 }; | |
841 | |
842 #if XML_BYTE_ORDER != 12 | |
843 | |
844 #ifdef XML_NS | |
845 | |
846 static const struct normal_encoding internal_big2_encoding_ns = { | |
847 { VTABLE, 2, 0, 1 }, | |
848 { | |
849 #include "iasciitab.h" | |
850 #include "latin1tab.h" | |
851 }, | |
852 STANDARD_VTABLE(big2_) | |
853 }; | |
854 | |
855 #endif | |
856 | |
857 static const struct normal_encoding internal_big2_encoding = { | |
858 { VTABLE, 2, 0, 1 }, | |
859 { | |
860 #define BT_COLON BT_NMSTRT | |
861 #include "iasciitab.h" | |
862 #undef BT_COLON | |
863 #include "latin1tab.h" | |
864 }, | |
865 STANDARD_VTABLE(big2_) | |
866 }; | |
867 | |
868 #endif | |
869 | |
870 #undef PREFIX | |
871 | |
/* Compares two NUL-terminated strings, treating the ASCII letters
   'a'..'z' as equal to 'A'..'Z'.  Returns 1 when the strings are equal
   under that folding, 0 otherwise. */
static int streqci(const char *s1, const char *s2)
{
  for (;; s1++, s2++) {
    char a = *s1;
    char b = *s2;

    /* Fold lower-case ASCII to upper case before comparing. */
    if (a >= 'a' && a <= 'z')
      a += 'A' - 'a';
    if (b >= 'a' && b <= 'z')
      b += 'A' - 'a';

    if (a != b)
      return 0;
    if (a == '\0')
      return 1;   /* both strings ended together */
  }
}
889 | |
890 static | |
891 void initUpdatePosition(const ENCODING *enc, const char *ptr, | |
892 const char *end, POSITION *pos) | |
893 { | |
894 normal_updatePosition(&utf8_encoding.enc, ptr, end, pos); | |
895 } | |
896 | |
897 static | |
898 int toAscii(const ENCODING *enc, const char *ptr, const char *end) | |
899 { | |
900 char buf[1]; | |
901 char *p = buf; | |
902 XmlUtf8Convert(enc, &ptr, end, &p, p + 1); | |
903 if (p == buf) | |
904 return -1; | |
905 else | |
906 return buf[0]; | |
907 } | |
908 | |
/* Returns 1 if c is space (0x20), CR (0xD), LF (0xA) or TAB (0x9),
   and 0 for any other value. */
static int isSpace(int c)
{
  return c == 0x20 || c == 0xD || c == 0xA || c == 0x9;
}
921 | |
922 /* Return 1 if there's just optional white space | |
923 or there's an S followed by name=val. */ | |
924 static | |
925 int parsePseudoAttribute(const ENCODING *enc, | |
926 const char *ptr, | |
927 const char *end, | |
928 const char **namePtr, | |
929 const char **valPtr, | |
930 const char **nextTokPtr) | |
931 { | |
932 int c; | |
933 char open; | |
934 if (ptr == end) { | |
935 *namePtr = 0; | |
936 return 1; | |
937 } | |
938 if (!isSpace(toAscii(enc, ptr, end))) { | |
939 *nextTokPtr = ptr; | |
940 return 0; | |
941 } | |
942 do { | |
943 ptr += enc->minBytesPerChar; | |
944 } while (isSpace(toAscii(enc, ptr, end))); | |
945 if (ptr == end) { | |
946 *namePtr = 0; | |
947 return 1; | |
948 } | |
949 *namePtr = ptr; | |
950 for (;;) { | |
951 c = toAscii(enc, ptr, end); | |
952 if (c == -1) { | |
953 *nextTokPtr = ptr; | |
954 return 0; | |
955 } | |
956 if (c == '=') | |
957 break; | |
958 if (isSpace(c)) { | |
959 do { | |
960 ptr += enc->minBytesPerChar; | |
961 } while (isSpace(c = toAscii(enc, ptr, end))); | |
962 if (c != '=') { | |
963 *nextTokPtr = ptr; | |
964 return 0; | |
965 } | |
966 break; | |
967 } | |
968 ptr += enc->minBytesPerChar; | |
969 } | |
970 if (ptr == *namePtr) { | |
971 *nextTokPtr = ptr; | |
972 return 0; | |
973 } | |
974 ptr += enc->minBytesPerChar; | |
975 c = toAscii(enc, ptr, end); | |
976 while (isSpace(c)) { | |
977 ptr += enc->minBytesPerChar; | |
978 c = toAscii(enc, ptr, end); | |
979 } | |
980 if (c != '"' && c != '\'') { | |
981 *nextTokPtr = ptr; | |
982 return 0; | |
983 } | |
984 open = c; | |
985 ptr += enc->minBytesPerChar; | |
986 *valPtr = ptr; | |
987 for (;; ptr += enc->minBytesPerChar) { | |
988 c = toAscii(enc, ptr, end); | |
989 if (c == open) | |
990 break; | |
991 if (!('a' <= c && c <= 'z') | |
992 && !('A' <= c && c <= 'Z') | |
993 && !('0' <= c && c <= '9') | |
994 && c != '.' | |
995 && c != '-' | |
996 && c != '_') { | |
997 *nextTokPtr = ptr; | |
998 return 0; | |
999 } | |
1000 } | |
1001 *nextTokPtr = ptr + enc->minBytesPerChar; | |
1002 return 1; | |
1003 } | |
1004 | |
1005 static | |
1006 int doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *, | |
1007 const char *, | |
1008 const char *), | |
1009 int isGeneralTextEntity, | |
1010 const ENCODING *enc, | |
1011 const char *ptr, | |
1012 const char *end, | |
1013 const char **badPtr, | |
1014 const char **versionPtr, | |
1015 const char **encodingName, | |
1016 const ENCODING **encoding, | |
1017 int *standalone) | |
1018 { | |
1019 const char *val = 0; | |
1020 const char *name = 0; | |
1021 ptr += 5 * enc->minBytesPerChar; | |
1022 end -= 2 * enc->minBytesPerChar; | |
1023 if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr) || !name) { | |
1024 *badPtr = ptr; | |
1025 return 0; | |
1026 } | |
1027 if (!XmlNameMatchesAscii(enc, name, "version")) { | |
1028 if (!isGeneralTextEntity) { | |
1029 *badPtr = name; | |
1030 return 0; | |
1031 } | |
1032 } | |
1033 else { | |
1034 if (versionPtr) | |
1035 *versionPtr = val; | |
1036 if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr)) { | |
1037 *badPtr = ptr; | |
1038 return 0; | |
1039 } | |
1040 if (!name) { | |
1041 if (isGeneralTextEntity) { | |
1042 /* a TextDecl must have an EncodingDecl */ | |
1043 *badPtr = ptr; | |
1044 return 0; | |
1045 } | |
1046 return 1; | |
1047 } | |
1048 } | |
1049 if (XmlNameMatchesAscii(enc, name, "encoding")) { | |
1050 int c = toAscii(enc, val, end); | |
1051 if (!('a' <= c && c <= 'z') && !('A' <= c && c <= 'Z')) { | |
1052 *badPtr = val; | |
1053 return 0; | |
1054 } | |
1055 if (encodingName) | |
1056 *encodingName = val; | |
1057 if (encoding) | |
1058 *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar); | |
1059 if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr)) { | |
1060 *badPtr = ptr; | |
1061 return 0; | |
1062 } | |
1063 if (!name) | |
1064 return 1; | |
1065 } | |
1066 if (!XmlNameMatchesAscii(enc, name, "standalone") || isGeneralTextEntity) { | |
1067 *badPtr = name; | |
1068 return 0; | |
1069 } | |
1070 if (XmlNameMatchesAscii(enc, val, "yes")) { | |
1071 if (standalone) | |
1072 *standalone = 1; | |
1073 } | |
1074 else if (XmlNameMatchesAscii(enc, val, "no")) { | |
1075 if (standalone) | |
1076 *standalone = 0; | |
1077 } | |
1078 else { | |
1079 *badPtr = val; | |
1080 return 0; | |
1081 } | |
1082 while (isSpace(toAscii(enc, ptr, end))) | |
1083 ptr += enc->minBytesPerChar; | |
1084 if (ptr != end) { | |
1085 *badPtr = ptr; | |
1086 return 0; | |
1087 } | |
1088 return 1; | |
1089 } | |
1090 | |
1091 static | |
1092 int checkCharRefNumber(int result) | |
1093 { | |
1094 switch (result >> 8) { | |
1095 case 0xD8: case 0xD9: case 0xDA: case 0xDB: | |
1096 case 0xDC: case 0xDD: case 0xDE: case 0xDF: | |
1097 return -1; | |
1098 case 0: | |
1099 if (latin1_encoding.type[result] == BT_NONXML) | |
1100 return -1; | |
1101 break; | |
1102 case 0xFF: | |
1103 if (result == 0xFFFE || result == 0xFFFF) | |
1104 return -1; | |
1105 break; | |
1106 } | |
1107 return result; | |
1108 } | |
1109 | |
1110 int XmlUtf8Encode(int c, char *buf) | |
1111 { | |
1112 enum { | |
1113 /* minN is minimum legal resulting value for N byte sequence */ | |
1114 min2 = 0x80, | |
1115 min3 = 0x800, | |
1116 min4 = 0x10000 | |
1117 }; | |
1118 | |
1119 if (c < 0) | |
1120 return 0; | |
1121 if (c < min2) { | |
1122 buf[0] = (c | UTF8_cval1); | |
1123 return 1; | |
1124 } | |
1125 if (c < min3) { | |
1126 buf[0] = ((c >> 6) | UTF8_cval2); | |
1127 buf[1] = ((c & 0x3f) | 0x80); | |
1128 return 2; | |
1129 } | |
1130 if (c < min4) { | |
1131 buf[0] = ((c >> 12) | UTF8_cval3); | |
1132 buf[1] = (((c >> 6) & 0x3f) | 0x80); | |
1133 buf[2] = ((c & 0x3f) | 0x80); | |
1134 return 3; | |
1135 } | |
1136 if (c < 0x110000) { | |
1137 buf[0] = ((c >> 18) | UTF8_cval4); | |
1138 buf[1] = (((c >> 12) & 0x3f) | 0x80); | |
1139 buf[2] = (((c >> 6) & 0x3f) | 0x80); | |
1140 buf[3] = ((c & 0x3f) | 0x80); | |
1141 return 4; | |
1142 } | |
1143 return 0; | |
1144 } | |
1145 | |
/* Encode the character number charNum as UTF-16 code units into buf.

   Returns 1 for values below 0x10000 (stored as-is), 2 for values below
   0x110000 (stored as a high/low surrogate pair), and 0 -- storing
   nothing -- for negative or larger values. */
int XmlUtf16Encode(int charNum, unsigned short *buf)
{
  if (charNum < 0 || charNum >= 0x110000)
    return 0;

  if (charNum >= 0x10000) {
    /* Split the 20-bit offset into two 10-bit halves. */
    const int offset = charNum - 0x10000;
    buf[0] = (unsigned short)((offset >> 10) + 0xD800);
    buf[1] = (unsigned short)((offset & 0x3FF) + 0xDC00);
    return 2;
  }

  buf[0] = (unsigned short)charNum;
  return 1;
}
1162 | |
/* State for an encoding described by a caller-supplied byte table.
   `normal` comes first: the functions in this file cast the ENCODING
   pointer they receive to both struct normal_encoding and
   struct unknown_encoding. */
struct unknown_encoding {
  struct normal_encoding normal;
  /* Called for bytes whose table entry is 0 (lead bytes of multi-byte
     sequences); returns the character number for the sequence at p. */
  int (*convert)(void *userData, const char *p);
  /* Passed back unchanged as the first argument of convert. */
  void *userData;
  /* Per first byte: the UTF-16 code unit, or 0 meaning "call convert". */
  unsigned short utf16[256];
  /* Per first byte: [0] is the UTF-8 length (0 meaning "call convert"),
     followed by that many UTF-8 bytes. */
  char utf8[256][4];
};
1170 | |
1171 int XmlSizeOfUnknownEncoding() | |
1172 { | |
1173 return sizeof(struct unknown_encoding); | |
1174 } | |
1175 | |
1176 static | |
1177 int unknown_isName(const ENCODING *enc, const char *p) | |
1178 { | |
1179 int c = ((const struct unknown_encoding *)enc) | |
1180 ->convert(((const struct unknown_encoding *)enc)->userData, p); | |
1181 if (c & ~0xFFFF) | |
1182 return 0; | |
1183 return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF); | |
1184 } | |
1185 | |
1186 static | |
1187 int unknown_isNmstrt(const ENCODING *enc, const char *p) | |
1188 { | |
1189 int c = ((const struct unknown_encoding *)enc) | |
1190 ->convert(((const struct unknown_encoding *)enc)->userData, p); | |
1191 if (c & ~0xFFFF) | |
1192 return 0; | |
1193 return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF); | |
1194 } | |
1195 | |
1196 static | |
1197 int unknown_isInvalid(const ENCODING *enc, const char *p) | |
1198 { | |
1199 int c = ((const struct unknown_encoding *)enc) | |
1200 ->convert(((const struct unknown_encoding *)enc)->userData, p); | |
1201 return (c & ~0xFFFF) || checkCharRefNumber(c) < 0; | |
1202 } | |
1203 | |
1204 static | |
1205 void unknown_toUtf8(const ENCODING *enc, | |
1206 const char **fromP, const char *fromLim, | |
1207 char **toP, const char *toLim) | |
1208 { | |
1209 char buf[XML_UTF8_ENCODE_MAX]; | |
1210 for (;;) { | |
1211 const char *utf8; | |
1212 int n; | |
1213 if (*fromP == fromLim) | |
1214 break; | |
1215 utf8 = ((const struct unknown_encoding *)enc)->utf8[(unsigned char)**fromP]; | |
1216 n = *utf8++; | |
1217 if (n == 0) { | |
1218 int c = ((const struct unknown_encoding *)enc) | |
1219 ->convert(((const struct unknown_encoding *)enc)->userData, *fromP); | |
1220 n = XmlUtf8Encode(c, buf); | |
1221 if (n > toLim - *toP) | |
1222 break; | |
1223 utf8 = buf; | |
1224 *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP] | |
1225 - (BT_LEAD2 - 2); | |
1226 } | |
1227 else { | |
1228 if (n > toLim - *toP) | |
1229 break; | |
1230 (*fromP)++; | |
1231 } | |
1232 do { | |
1233 *(*toP)++ = *utf8++; | |
1234 } while (--n != 0); | |
1235 } | |
1236 } | |
1237 | |
1238 static | |
1239 void unknown_toUtf16(const ENCODING *enc, | |
1240 const char **fromP, const char *fromLim, | |
1241 unsigned short **toP, const unsigned short *toLim) | |
1242 { | |
1243 while (*fromP != fromLim && *toP != toLim) { | |
1244 unsigned short c | |
1245 = ((const struct unknown_encoding *)enc)->utf16[(unsigned char)**fromP]; | |
1246 if (c == 0) { | |
1247 c = (unsigned short)((const struct unknown_encoding *)enc) | |
1248 ->convert(((const struct unknown_encoding *)enc)->userData, *fromP); | |
1249 *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP] | |
1250 - (BT_LEAD2 - 2); | |
1251 } | |
1252 else | |
1253 (*fromP)++; | |
1254 *(*toP)++ = c; | |
1255 } | |
1256 } | |
1257 | |
1258 ENCODING * | |
1259 XmlInitUnknownEncoding(void *mem, | |
1260 int *table, | |
1261 int (*convert)(void *userData, const char *p), | |
1262 void *userData) | |
1263 { | |
1264 int i; | |
1265 struct unknown_encoding *e = mem; | |
883
0aa9015f06df
Remove some more libjabber warnings
Mikael Berthe <mikael@lilotux.net>
parents:
414
diff
changeset
|
1266 for (i = 0; i < (int)sizeof(struct normal_encoding); i++) |
25 | 1267 ((char *)mem)[i] = ((char *)&latin1_encoding)[i]; |
1268 for (i = 0; i < 128; i++) | |
1269 if (latin1_encoding.type[i] != BT_OTHER | |
1270 && latin1_encoding.type[i] != BT_NONXML | |
1271 && table[i] != i) | |
1272 return 0; | |
1273 for (i = 0; i < 256; i++) { | |
1274 int c = table[i]; | |
1275 if (c == -1) { | |
1276 e->normal.type[i] = BT_MALFORM; | |
1277 /* This shouldn't really get used. */ | |
1278 e->utf16[i] = 0xFFFF; | |
1279 e->utf8[i][0] = 1; | |
1280 e->utf8[i][1] = 0; | |
1281 } | |
1282 else if (c < 0) { | |
1283 if (c < -4) | |
1284 return 0; | |
1285 e->normal.type[i] = BT_LEAD2 - (c + 2); | |
1286 e->utf8[i][0] = 0; | |
1287 e->utf16[i] = 0; | |
1288 } | |
1289 else if (c < 0x80) { | |
1290 if (latin1_encoding.type[c] != BT_OTHER | |
1291 && latin1_encoding.type[c] != BT_NONXML | |
1292 && c != i) | |
1293 return 0; | |
1294 e->normal.type[i] = latin1_encoding.type[c]; | |
1295 e->utf8[i][0] = 1; | |
1296 e->utf8[i][1] = (char)c; | |
1297 e->utf16[i] = c == 0 ? 0xFFFF : c; | |
1298 } | |
1299 else if (checkCharRefNumber(c) < 0) { | |
1300 e->normal.type[i] = BT_NONXML; | |
1301 /* This shouldn't really get used. */ | |
1302 e->utf16[i] = 0xFFFF; | |
1303 e->utf8[i][0] = 1; | |
1304 e->utf8[i][1] = 0; | |
1305 } | |
1306 else { | |
1307 if (c > 0xFFFF) | |
1308 return 0; | |
1309 if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff)) | |
1310 e->normal.type[i] = BT_NMSTRT; | |
1311 else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff)) | |
1312 e->normal.type[i] = BT_NAME; | |
1313 else | |
1314 e->normal.type[i] = BT_OTHER; | |
1315 e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1); | |
1316 e->utf16[i] = c; | |
1317 } | |
1318 } | |
1319 e->userData = userData; | |
1320 e->convert = convert; | |
1321 if (convert) { | |
1322 e->normal.isName2 = unknown_isName; | |
1323 e->normal.isName3 = unknown_isName; | |
1324 e->normal.isName4 = unknown_isName; | |
1325 e->normal.isNmstrt2 = unknown_isNmstrt; | |
1326 e->normal.isNmstrt3 = unknown_isNmstrt; | |
1327 e->normal.isNmstrt4 = unknown_isNmstrt; | |
1328 e->normal.isInvalid2 = unknown_isInvalid; | |
1329 e->normal.isInvalid3 = unknown_isInvalid; | |
1330 e->normal.isInvalid4 = unknown_isInvalid; | |
1331 } | |
1332 e->normal.enc.utf8Convert = unknown_toUtf8; | |
1333 e->normal.enc.utf16Convert = unknown_toUtf16; | |
1334 return &(e->normal.enc); | |
1335 } | |
1336 | |
/* Indices of the built-in encodings.  If this enumeration is changed,
   getEncodingIndex and encodings must also be changed. */
enum {
  UNKNOWN_ENC    = -1,
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC   = 1,
  UTF_8_ENC      = 2,
  UTF_16_ENC     = 3,
  UTF_16BE_ENC   = 4,
  UTF_16LE_ENC   = 5,
  /* must match encodingNames up to here */
  NO_ENC         = 6
};
1350 | |
1351 static | |
1352 int getEncodingIndex(const char *name) | |
1353 { | |
1354 static const char *encodingNames[] = { | |
1355 "ISO-8859-1", | |
1356 "US-ASCII", | |
1357 "UTF-8", | |
1358 "UTF-16", | |
1359 "UTF-16BE" | |
1360 "UTF-16LE", | |
1361 }; | |
1362 int i; | |
1363 if (name == 0) | |
1364 return NO_ENC; | |
883
0aa9015f06df
Remove some more libjabber warnings
Mikael Berthe <mikael@lilotux.net>
parents:
414
diff
changeset
|
1365 for (i = 0; i < (int)(sizeof(encodingNames)/sizeof(encodingNames[0])); i++) |
25 | 1366 if (streqci(name, encodingNames[i])) |
1367 return i; | |
1368 return UNKNOWN_ENC; | |
1369 } | |
1370 | |
/* For binary compatibility, we store the index of the encoding specified
   at initialization in the isUtf16 member.  INIT_ENC_INDEX(enc) names that
   member of the INIT_ENCODING that enc points to; the value is one of the
   encoding-index enumerators (compared against them in initScan). */

#define INIT_ENC_INDEX(enc) ((enc)->initEnc.isUtf16)
1375 | |
1376 /* This is what detects the encoding. | |
1377 encodingTable maps from encoding indices to encodings; | |
1378 INIT_ENC_INDEX(enc) is the index of the external (protocol) specified encoding; | |
1379 state is XML_CONTENT_STATE if we're parsing an external text entity, | |
1380 and XML_PROLOG_STATE otherwise. | |
1381 */ | |
1382 | |
1383 | |
1384 static | |
1385 int initScan(const ENCODING **encodingTable, | |
1386 const INIT_ENCODING *enc, | |
1387 int state, | |
1388 const char *ptr, | |
1389 const char *end, | |
1390 const char **nextTokPtr) | |
1391 { | |
1392 const ENCODING **encPtr; | |
1393 | |
1394 if (ptr == end) | |
1395 return XML_TOK_NONE; | |
1396 encPtr = enc->encPtr; | |
1397 if (ptr + 1 == end) { | |
1398 /* only a single byte available for auto-detection */ | |
1399 /* a well-formed document entity must have more than one byte */ | |
1400 if (state != XML_CONTENT_STATE) | |
1401 return XML_TOK_PARTIAL; | |
1402 /* so we're parsing an external text entity... */ | |
1403 /* if UTF-16 was externally specified, then we need at least 2 bytes */ | |
1404 switch (INIT_ENC_INDEX(enc)) { | |
1405 case UTF_16_ENC: | |
1406 case UTF_16LE_ENC: | |
1407 case UTF_16BE_ENC: | |
1408 return XML_TOK_PARTIAL; | |
1409 } | |
1410 switch ((unsigned char)*ptr) { | |
1411 case 0xFE: | |
1412 case 0xFF: | |
1413 case 0xEF: /* possibly first byte of UTF-8 BOM */ | |
1414 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC | |
1415 && state == XML_CONTENT_STATE) | |
1416 break; | |
1417 /* fall through */ | |
1418 case 0x00: | |
1419 case 0x3C: | |
1420 return XML_TOK_PARTIAL; | |
1421 } | |
1422 } | |
1423 else { | |
1424 switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) { | |
1425 case 0xFEFF: | |
1426 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC | |
1427 && state == XML_CONTENT_STATE) | |
1428 break; | |
1429 *nextTokPtr = ptr + 2; | |
1430 *encPtr = encodingTable[UTF_16BE_ENC]; | |
1431 return XML_TOK_BOM; | |
1432 /* 00 3C is handled in the default case */ | |
1433 case 0x3C00: | |
1434 if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC | |
1435 || INIT_ENC_INDEX(enc) == UTF_16_ENC) | |
1436 && state == XML_CONTENT_STATE) | |
1437 break; | |
1438 *encPtr = encodingTable[UTF_16LE_ENC]; | |
1439 return XmlTok(*encPtr, state, ptr, end, nextTokPtr); | |
1440 case 0xFFFE: | |
1441 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC | |
1442 && state == XML_CONTENT_STATE) | |
1443 break; | |
1444 *nextTokPtr = ptr + 2; | |
1445 *encPtr = encodingTable[UTF_16LE_ENC]; | |
1446 return XML_TOK_BOM; | |
1447 case 0xEFBB: | |
1448 /* Maybe a UTF-8 BOM (EF BB BF) */ | |
1449 /* If there's an explicitly specified (external) encoding | |
1450 of ISO-8859-1 or some flavour of UTF-16 | |
1451 and this is an external text entity, | |
1452 don't look for the BOM, | |
1453 because it might be a legal data. */ | |
1454 if (state == XML_CONTENT_STATE) { | |
1455 int e = INIT_ENC_INDEX(enc); | |
1456 if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC || e == UTF_16_ENC) | |
1457 break; | |
1458 } | |
1459 if (ptr + 2 == end) | |
1460 return XML_TOK_PARTIAL; | |
1461 if ((unsigned char)ptr[2] == 0xBF) { | |
1462 *encPtr = encodingTable[UTF_8_ENC]; | |
1463 return XML_TOK_BOM; | |
1464 } | |
1465 break; | |
1466 default: | |
1467 if (ptr[0] == '\0') { | |
1468 /* 0 isn't a legal data character. Furthermore a document entity can only | |
1469 start with ASCII characters. So the only way this can fail to be big-endian | |
1470 UTF-16 if it it's an external parsed general entity that's labelled as | |
1471 UTF-16LE. */ | |
1472 if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC) | |
1473 break; | |
1474 *encPtr = encodingTable[UTF_16BE_ENC]; | |
1475 return XmlTok(*encPtr, state, ptr, end, nextTokPtr); | |
1476 } | |
1477 else if (ptr[1] == '\0') { | |
1478 /* We could recover here in the case: | |
1479 - parsing an external entity | |
1480 - second byte is 0 | |
1481 - no externally specified encoding | |
1482 - no encoding declaration | |
1483 by assuming UTF-16LE. But we don't, because this would mean when | |
1484 presented just with a single byte, we couldn't reliably determine | |
1485 whether we needed further bytes. */ | |
1486 if (state == XML_CONTENT_STATE) | |
1487 break; | |
1488 *encPtr = encodingTable[UTF_16LE_ENC]; | |
1489 return XmlTok(*encPtr, state, ptr, end, nextTokPtr); | |
1490 } | |
1491 break; | |
1492 } | |
1493 } | |
237 | 1494 *encPtr = encodingTable[(int)INIT_ENC_INDEX(enc)]; |
25 | 1495 return XmlTok(*encPtr, state, ptr, end, nextTokPtr); |
1496 } | |
1497 | |
1498 | |
1499 #define NS(x) x | |
1500 #define ns(x) x | |
1501 #include "xmltok_ns_c.h" | |
1502 #undef NS | |
1503 #undef ns | |
1504 | |
1505 #ifdef XML_NS | |
1506 | |
1507 #define NS(x) x ## NS | |
1508 #define ns(x) x ## _ns | |
1509 | |
1510 #include "xmltok_ns_c.h" | |
1511 | |
1512 #undef NS | |
1513 #undef ns | |
1514 | |
1515 ENCODING * | |
1516 XmlInitUnknownEncodingNS(void *mem, | |
1517 int *table, | |
1518 int (*convert)(void *userData, const char *p), | |
1519 void *userData) | |
1520 { | |
1521 ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData); | |
1522 if (enc) | |
1523 ((struct normal_encoding *)enc)->type[':'] = BT_COLON; | |
1524 return enc; | |
1525 } | |
1526 | |
1527 #endif /* XML_NS */ |