1/*
2                            __  __            _
3                         ___\ \/ /_ __   __ _| |_
4                        / _ \\  /| '_ \ / _` | __|
5                       |  __//  \| |_) | (_| | |_
6                        \___/_/\_\ .__/ \__,_|\__|
7                                 |_| XML parser
8
9   Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10   Copyright (c) 2000-2017 Expat development team
11   Licensed under the MIT license:
12
13   Permission is  hereby granted,  free of charge,  to any  person obtaining
14   a  copy  of  this  software   and  associated  documentation  files  (the
15   "Software"),  to  deal in  the  Software  without restriction,  including
16   without  limitation the  rights  to use,  copy,  modify, merge,  publish,
17   distribute, sublicense, and/or sell copies of the Software, and to permit
18   persons  to whom  the Software  is  furnished to  do so,  subject to  the
19   following conditions:
20
21   The above copyright  notice and this permission notice  shall be included
22   in all copies or substantial portions of the Software.
23
24   THE  SOFTWARE  IS  PROVIDED  "AS  IS",  WITHOUT  WARRANTY  OF  ANY  KIND,
25   EXPRESS  OR IMPLIED,  INCLUDING  BUT  NOT LIMITED  TO  THE WARRANTIES  OF
26   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
27   NO EVENT SHALL THE AUTHORS OR  COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
28   DAMAGES OR  OTHER LIABILITY, WHETHER  IN AN  ACTION OF CONTRACT,  TORT OR
29   OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
30   USE OR OTHER DEALINGS IN THE SOFTWARE.
31*/
32
33#include <stddef.h>
34#include <string.h>  /* memcpy */
35
36#if defined(_MSC_VER) && (_MSC_VER <= 1700)
37  /* for vs2012/11.0/1700 and earlier Visual Studio compilers */
38# define bool   int
39# define false  0
40# define true   1
41#else
42# include <stdbool.h>
43#endif
44
45
46#ifdef _WIN32
47#include "winconfig.h"
48#else
49#ifdef HAVE_EXPAT_CONFIG_H
50#include <expat_config.h>
51#endif
52#endif /* ndef _WIN32 */
53
54#include "expat_external.h"
55#include "internal.h"
56#include "xmltok.h"
57#include "nametab.h"
58
/* When XML_DTD is defined, the first brace group of VTABLE1 gets one more
   entry, PREFIX(ignoreSectionTok); otherwise nothing is appended. */
#ifdef XML_DTD
#define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
#else
#define IGNORE_SECTION_TOK_VTABLE /* as nothing */
#endif

/* Leading portion of an ENCODING initializer (it is used that way by the
   encoding tables later in this file): two brace groups of scanner
   functions followed by eight helper functions.  Every name goes through
   PREFIX(), so the expansion depends on the PREFIX definition in force
   at the point of use. */
#define VTABLE1 \
  { PREFIX(prologTok), PREFIX(contentTok), \
    PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \
  { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
  PREFIX(nameMatchesAscii), \
  PREFIX(nameLength), \
  PREFIX(skipS), \
  PREFIX(getAtts), \
  PREFIX(charRefNumber), \
  PREFIX(predefinedEntityName), \
  PREFIX(updatePosition), \
  PREFIX(isPublicId)

/* VTABLE1 followed by the PREFIX()-named UTF-8 and UTF-16 converters. */
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
79
/* Look up one bit of namingBitmap for the character whose high byte is
   hi and low byte is lo: pages[hi] (times 8) plus the top three bits of
   lo select the bitmap word, the low five bits of lo select the bit.
   Yields nonzero when the bit is set.  lo is evaluated twice. */
#define UCS2_GET_NAMING(pages, hi, lo) \
   (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))

/* A 2 byte UTF-8 representation splits the character's 11 bits between
   the bottom 5 and 6 bits of the bytes.  We need 8 bits to index into
   pages, 3 bits to add to that index and 5 bits to generate the mask.
*/
#define UTF8_GET_NAMING2(pages, byte) \
    (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
                      + ((((byte)[0]) & 3) << 1) \
                      + ((((byte)[1]) >> 5) & 1)] \
         & (1u << (((byte)[1]) & 0x1F)))

/* A 3 byte UTF-8 representation splits the character's 16 bits between
   the bottom 4, 6 and 6 bits of the bytes.  We need 8 bits to index
   into pages, 3 bits to add to that index and 5 bits to generate the
   mask.
*/
#define UTF8_GET_NAMING3(pages, byte) \
  (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
                             + ((((byte)[1]) >> 2) & 0xF)] \
                       << 3) \
                      + ((((byte)[1]) & 3) << 1) \
                      + ((((byte)[2]) >> 5) & 1)] \
         & (1u << (((byte)[2]) & 0x1F)))

/* Dispatch on the sequence length n: 2 and 3 use the macros above (with p
   viewed as unsigned bytes); any other length yields 0. */
#define UTF8_GET_NAMING(pages, p, n) \
  ((n) == 2 \
  ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
  : ((n) == 3 \
     ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
     : 0))
112
/* Detection of invalid UTF-8 sequences is based on Table 3.1B
   of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
   with the additional restriction of not allowing the Unicode
   code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
   Implementation details:
     (A & 0x80) == 0     means A < 0x80
   and
     (A & 0xC0) == 0xC0  means A > 0xBF

   Each macro takes a pointer to the lead byte of a sequence of the
   given length and evaluates to nonzero when the sequence is invalid.
   The bytes must be readable as unsigned values (callers in this file
   pass a const unsigned char pointer).  The argument is evaluated
   several times, so it must not have side effects; it is fully
   parenthesized, so any pointer expression (e.g. buf + 1) may be passed.
*/

/* 2-byte sequence: lead byte must be at least 0xC2 (no overlong forms)
   and the second byte must lie in 0x80..0xBF. */
#define UTF8_INVALID2(p) \
  ((p)[0] < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)

/* 3-byte sequence: rejects EF,BF,BE and EF,BF,BF, overlong forms after a
   0xE0 lead (second byte below 0xA0) and second bytes above 0x9F after
   a 0xED lead; otherwise both trailing bytes must lie in 0x80..0xBF. */
#define UTF8_INVALID3(p) \
  (((p)[2] & 0x80) == 0 \
  || \
  ((p)[0] == 0xEF && (p)[1] == 0xBF \
    ? \
    (p)[2] > 0xBD \
    : \
    ((p)[2] & 0xC0) == 0xC0) \
  || \
  ((p)[0] == 0xE0 \
    ? \
    (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \
    : \
    ((p)[1] & 0x80) == 0 \
    || \
    ((p)[0] == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0)))

/* 4-byte sequence: rejects overlong forms after a 0xF0 lead (second byte
   below 0x90) and second bytes above 0x8F after a 0xF4 lead; otherwise
   all three trailing bytes must lie in 0x80..0xBF. */
#define UTF8_INVALID4(p) \
  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 \
  || \
  ((p)[2] & 0x80) == 0 || ((p)[2] & 0xC0) == 0xC0 \
  || \
  ((p)[0] == 0xF0 \
    ? \
    (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \
    : \
    ((p)[1] & 0x80) == 0 \
    || \
    ((p)[0] == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0)))
155
/* Predicate that rejects everything: ignores both arguments and always
   returns 0.  This file aliases it as utf8_isName4 / utf8_isNmstrt4 and,
   under XML_MIN_SIZE, as sb_isNameMin / sb_isNmstrtMin. */
static int PTRFASTCALL
isNever(const ENCODING *UNUSED_P(enc), const char *UNUSED_P(p))
{
  return 0;
}
161
162static int PTRFASTCALL
163utf8_isName2(const ENCODING *UNUSED_P(enc), const char *p)
164{
165  return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
166}
167
168static int PTRFASTCALL
169utf8_isName3(const ENCODING *UNUSED_P(enc), const char *p)
170{
171  return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
172}
173
174#define utf8_isName4 isNever
175
176static int PTRFASTCALL
177utf8_isNmstrt2(const ENCODING *UNUSED_P(enc), const char *p)
178{
179  return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
180}
181
182static int PTRFASTCALL
183utf8_isNmstrt3(const ENCODING *UNUSED_P(enc), const char *p)
184{
185  return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
186}
187
188#define utf8_isNmstrt4 isNever
189
190static int PTRFASTCALL
191utf8_isInvalid2(const ENCODING *UNUSED_P(enc), const char *p)
192{
193  return UTF8_INVALID2((const unsigned char *)p);
194}
195
196static int PTRFASTCALL
197utf8_isInvalid3(const ENCODING *UNUSED_P(enc), const char *p)
198{
199  return UTF8_INVALID3((const unsigned char *)p);
200}
201
202static int PTRFASTCALL
203utf8_isInvalid4(const ENCODING *UNUSED_P(enc), const char *p)
204{
205  return UTF8_INVALID4((const unsigned char *)p);
206}
207
/* An ENCODING extended with a per-byte classification table and a set of
   predicate callbacks.  Code in this file obtains it by casting an
   ENCODING pointer (see AS_NORMAL_ENCODING), so enc must stay the first
   member. */
struct normal_encoding {
  ENCODING enc;
  /* Byte type (a BT_* value) for each possible byte value. */
  unsigned char type[256];
#ifdef XML_MIN_SIZE
  /* Under XML_MIN_SIZE the per-character operations are reached through
     these pointers (see BYTE_TYPE, IS_NAME_CHAR_MINBPC, BYTE_TO_ASCII and
     CHAR_MATCHES). */
  int (PTRFASTCALL *byteType)(const ENCODING *, const char *);
  int (PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
  int (PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
  int (PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
  int (PTRCALL *charMatches)(const ENCODING *, const char *, int);
#endif /* XML_MIN_SIZE */
  /* Name / name-start / invalid-sequence predicates for 2-, 3- and 4-byte
     sequences, called through IS_NAME_CHAR, IS_NMSTRT_CHAR and
     IS_INVALID_CHAR.  Tables built with NULL_VTABLE leave them NULL. */
  int (PTRFASTCALL *isName2)(const ENCODING *, const char *);
  int (PTRFASTCALL *isName3)(const ENCODING *, const char *);
  int (PTRFASTCALL *isName4)(const ENCODING *, const char *);
  int (PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
  int (PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
  int (PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
  int (PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
  int (PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
  int (PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
};
228
/* View an ENCODING pointer as the enclosing (const) struct normal_encoding. */
#define AS_NORMAL_ENCODING(enc)   ((const struct normal_encoding *) (enc))

#ifdef XML_MIN_SIZE

/* Initializers for the five XML_MIN_SIZE-only members of
   struct normal_encoding, named by pasting the prefix E.  Note the
   trailing comma: the expansion is meant to be followed directly by
   NORMAL_VTABLE or NULL_VTABLE. */
#define STANDARD_VTABLE(E) \
 E ## byteType, \
 E ## isNameMin, \
 E ## isNmstrtMin, \
 E ## byteToAscii, \
 E ## charMatches,

#else

#define STANDARD_VTABLE(E) /* as nothing */

#endif

/* Initializers for the nine multi-byte predicate members, named by
   pasting the prefix E (e.g. utf8_). */
#define NORMAL_VTABLE(E) \
 E ## isName2, \
 E ## isName3, \
 E ## isName4, \
 E ## isNmstrt2, \
 E ## isNmstrt3, \
 E ## isNmstrt4, \
 E ## isInvalid2, \
 E ## isInvalid3, \
 E ## isInvalid4

/* Same nine members, all set to NULL; used by the encoding tables in this
   file that supply no multi-byte predicates. */
#define NULL_VTABLE \
 /* isName2 */ NULL, \
 /* isName3 */ NULL, \
 /* isName4 */ NULL, \
 /* isNmstrt2 */ NULL, \
 /* isNmstrt3 */ NULL, \
 /* isNmstrt4 */ NULL, \
 /* isInvalid2 */ NULL, \
 /* isInvalid3 */ NULL, \
 /* isInvalid4 */ NULL
267
268static int FASTCALL checkCharRefNumber(int);
269
270#include "xmltok_impl.h"
271#include "ascii.h"
272
/* Single-byte encodings never need the "minimum bytes per char" name
   predicates, so under XML_MIN_SIZE those slots use isNever. */
#ifdef XML_MIN_SIZE
#define sb_isNameMin isNever
#define sb_isNmstrtMin isNever
#endif

#ifdef XML_MIN_SIZE
#define MINBPC(enc) ((enc)->minBytesPerChar)
#else
/* minimum bytes per character */
#define MINBPC(enc) 1
#endif

/* Byte type of the byte at p, read from the type table of the
   struct normal_encoding that enc points to.  The cast keeps the const
   qualifier of the encoding pointer (the table is only read). */
#define SB_BYTE_TYPE(enc, p) \
  (((const struct normal_encoding *)(enc))->type[(unsigned char)*(p)])

#ifdef XML_MIN_SIZE
/* Function form of SB_BYTE_TYPE, used to fill the byteType slot. */
static int PTRFASTCALL
sb_byteType(const ENCODING *enc, const char *p)
{
  return SB_BYTE_TYPE(enc, p);
}
#define BYTE_TYPE(enc, p) \
 (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
#else
#define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
#endif
299
/* BYTE_TO_ASCII yields the byte at p as an ASCII code.  Under
   XML_MIN_SIZE it goes through the encoding's byteToAscii slot;
   otherwise it reads the byte directly. */
#ifdef XML_MIN_SIZE
#define BYTE_TO_ASCII(enc, p) \
 (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
/* byteToAscii implementation for single-byte encodings: the byte itself.
   The encoding is not consulted, so the parameter is marked unused like
   the other callbacks in this file. */
static int PTRFASTCALL
sb_byteToAscii(const ENCODING *UNUSED_P(enc), const char *p)
{
  return *p;
}
#else
#define BYTE_TO_ASCII(enc, p) (*(p))
#endif
311
/* Multi-byte predicates: n (2, 3 or 4) is pasted onto the member name, so
   IS_NAME_CHAR(enc, p, 2) calls the encoding's isName2 slot, and so on. */
#define IS_NAME_CHAR(enc, p, n) \
 (AS_NORMAL_ENCODING(enc)->isName ## n(enc, p))
#define IS_NMSTRT_CHAR(enc, p, n) \
 (AS_NORMAL_ENCODING(enc)->isNmstrt ## n(enc, p))
#define IS_INVALID_CHAR(enc, p, n) \
 (AS_NORMAL_ENCODING(enc)->isInvalid ## n(enc, p))

/* Predicates for a character occupying exactly MINBPC bytes: dispatched
   through the encoding under XML_MIN_SIZE, constant 0 otherwise. */
#ifdef XML_MIN_SIZE
#define IS_NAME_CHAR_MINBPC(enc, p) \
 (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
#define IS_NMSTRT_CHAR_MINBPC(enc, p) \
 (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
#else
#define IS_NAME_CHAR_MINBPC(enc, p) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
#endif
328
/* CHAR_MATCHES tests whether the character at p equals the ASCII
   character c.  Under XML_MIN_SIZE it goes through the encoding's
   charMatches slot; otherwise it compares the byte directly. */
#ifdef XML_MIN_SIZE
#define CHAR_MATCHES(enc, p, c) \
 (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
/* charMatches implementation for single-byte encodings.  The encoding is
   not consulted, so the parameter is marked unused like the other
   callbacks in this file. */
static int PTRCALL
sb_charMatches(const ENCODING *UNUSED_P(enc), const char *p, int c)
{
  return *p == c;
}
#else
/* c is an ASCII character; it is parenthesized so that any expression
   can be passed without changing the meaning of the comparison. */
#define CHAR_MATCHES(enc, p, c) (*(p) == (c))
#endif
341
/* Instantiate the tokenizer implementation with the single-byte-unit
   macros defined above; PREFIX() makes every generated function name
   start with normal_ (e.g. normal_updatePosition, used later in this
   file). */
#define PREFIX(ident) normal_ ## ident
#define XML_TOK_IMPL_C
#include "xmltok_impl.c"
#undef XML_TOK_IMPL_C

/* Remove the per-instantiation macros; the 2-byte instantiations further
   down define their own versions. */
#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR
356
/* Marker bits of a UTF-8 lead byte.  The converters in this file OR the
   top payload bits of a character into one of these to form the first
   byte of an N-byte sequence. */
enum {  /* UTF8_cvalN is value of masked first byte of N byte sequence */
  UTF8_cval1 = 0x00,
  UTF8_cval2 = 0xc0,
  UTF8_cval3 = 0xe0,
  UTF8_cval4 = 0xf0
};
363
/* Move *fromLimRef backwards, if necessary, so that the range
   [from, *fromLimRef) does not end in the middle of a multi-byte UTF-8
   character.

   The range is scanned backwards from its end.  An ASCII byte stops the
   scan at once.  A lead byte stops it when enough bytes were seen after
   it to complete its sequence; the limit is then placed just past that
   sequence.  A lead byte whose sequence is cut short is dropped and the
   scan continues in front of it.  Any other byte (continuation bytes,
   and 0xF8..0xFF) is simply stepped over.  If the start of the range is
   reached, the limit ends up equal to from.
*/
void
_INTERNAL_trim_to_complete_utf8_characters(const char * from, const char ** fromLimRef)
{
  const char *end = *fromLimRef;
  size_t trailing = 0;  /* bytes stepped over since the last reset */

  while (end > from) {
    const unsigned char last = (unsigned char)end[-1];
    size_t seqLen;

    if ((last & 0x80u) == 0x00u)  /* 0b0xxxxxxx: complete 1-byte character */
      break;

    if ((last & 0xf8u) == 0xf0u)       /* 0b11110xxx leads 4 bytes */
      seqLen = 4;
    else if ((last & 0xf0u) == 0xe0u)  /* 0b1110xxxx leads 3 bytes */
      seqLen = 3;
    else if ((last & 0xe0u) == 0xc0u)  /* 0b110xxxxx leads 2 bytes */
      seqLen = 2;
    else
      seqLen = 0;                      /* not a lead byte */

    if (seqLen != 0) {
      if (trailing + 1 >= seqLen) {
        /* The whole character is present: keep it. */
        end += seqLen - 1;
        break;
      }
      /* Truncated character: drop it and start counting afresh. */
      trailing = 0;
    }

    end--;
    trailing++;
  }

  *fromLimRef = end;
}
398
399static enum XML_Convert_Result PTRCALL
400utf8_toUtf8(const ENCODING *UNUSED_P(enc),
401            const char **fromP, const char *fromLim,
402            char **toP, const char *toLim)
403{
404  bool input_incomplete = false;
405  bool output_exhausted = false;
406
407  /* Avoid copying partial characters (due to limited space). */
408  const ptrdiff_t bytesAvailable = fromLim - *fromP;
409  const ptrdiff_t bytesStorable = toLim - *toP;
410  if (bytesAvailable > bytesStorable) {
411    fromLim = *fromP + bytesStorable;
412    output_exhausted = true;
413  }
414
415  /* Avoid copying partial characters (from incomplete input). */
416  {
417    const char * const fromLimBefore = fromLim;
418    _INTERNAL_trim_to_complete_utf8_characters(*fromP, &fromLim);
419    if (fromLim < fromLimBefore) {
420      input_incomplete = true;
421    }
422  }
423
424  {
425    const ptrdiff_t bytesToCopy = fromLim - *fromP;
426    memcpy(*toP, *fromP, bytesToCopy);
427    *fromP += bytesToCopy;
428    *toP += bytesToCopy;
429  }
430
431  if (output_exhausted)  /* needs to go first */
432    return XML_CONVERT_OUTPUT_EXHAUSTED;
433  else if (input_incomplete)
434    return XML_CONVERT_INPUT_INCOMPLETE;
435  else
436    return XML_CONVERT_COMPLETED;
437}
438
439static enum XML_Convert_Result PTRCALL
440utf8_toUtf16(const ENCODING *enc,
441             const char **fromP, const char *fromLim,
442             unsigned short **toP, const unsigned short *toLim)
443{
444  enum XML_Convert_Result res = XML_CONVERT_COMPLETED;
445  unsigned short *to = *toP;
446  const char *from = *fromP;
447  while (from < fromLim && to < toLim) {
448    switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
449    case BT_LEAD2:
450      if (fromLim - from < 2) {
451        res = XML_CONVERT_INPUT_INCOMPLETE;
452        goto after;
453      }
454      *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
455      from += 2;
456      break;
457    case BT_LEAD3:
458      if (fromLim - from < 3) {
459        res = XML_CONVERT_INPUT_INCOMPLETE;
460        goto after;
461      }
462      *to++ = (unsigned short)(((from[0] & 0xf) << 12)
463                               | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f));
464      from += 3;
465      break;
466    case BT_LEAD4:
467      {
468        unsigned long n;
469        if (toLim - to < 2) {
470          res = XML_CONVERT_OUTPUT_EXHAUSTED;
471          goto after;
472        }
473        if (fromLim - from < 4) {
474          res = XML_CONVERT_INPUT_INCOMPLETE;
475          goto after;
476        }
477        n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
478            | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
479        n -= 0x10000;
480        to[0] = (unsigned short)((n >> 10) | 0xD800);
481        to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
482        to += 2;
483        from += 4;
484      }
485      break;
486    default:
487      *to++ = *from++;
488      break;
489    }
490  }
491  if (from < fromLim)
492    res = XML_CONVERT_OUTPUT_EXHAUSTED;
493after:
494  *fromP = from;
495  *toP = to;
496  return res;
497}
498
#ifdef XML_NS
/* UTF-8 encoding table compiled only when XML_NS is defined.  The byte
   type table is asciitab.h followed by utf8tab.h, with BT_COLON left at
   its own value.  NOTE(review): the three integers after the converters
   are presumably ENCODING's minBytesPerChar / isUtf8 / isUtf16 fields —
   confirm against xmltok.h. */
static const struct normal_encoding utf8_encoding_ns = {
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  {
#include "asciitab.h"
#include "utf8tab.h"
  },
  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};
#endif
509
/* UTF-8 encoding table.  While asciitab.h is included BT_COLON is defined
   as BT_NMSTRT, so entries written as BT_COLON in that header become
   BT_NMSTRT in this table. */
static const struct normal_encoding utf8_encoding = {
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "utf8tab.h"
  },
  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};
520
#ifdef XML_NS

/* Like utf8_encoding_ns, but the first half of the byte type table comes
   from iasciitab.h instead of asciitab.h. */
static const struct normal_encoding internal_utf8_encoding_ns = {
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  {
#include "iasciitab.h"
#include "utf8tab.h"
  },
  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};

#endif
533
/* Like utf8_encoding, but the first half of the byte type table comes
   from iasciitab.h instead of asciitab.h (BT_COLON again mapped to
   BT_NMSTRT while that header is included). */
static const struct normal_encoding internal_utf8_encoding = {
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  {
#define BT_COLON BT_NMSTRT
#include "iasciitab.h"
#undef BT_COLON
#include "utf8tab.h"
  },
  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};
544
545static enum XML_Convert_Result PTRCALL
546latin1_toUtf8(const ENCODING *UNUSED_P(enc),
547              const char **fromP, const char *fromLim,
548              char **toP, const char *toLim)
549{
550  for (;;) {
551    unsigned char c;
552    if (*fromP == fromLim)
553      return XML_CONVERT_COMPLETED;
554    c = (unsigned char)**fromP;
555    if (c & 0x80) {
556      if (toLim - *toP < 2)
557        return XML_CONVERT_OUTPUT_EXHAUSTED;
558      *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
559      *(*toP)++ = (char)((c & 0x3f) | 0x80);
560      (*fromP)++;
561    }
562    else {
563      if (*toP == toLim)
564        return XML_CONVERT_OUTPUT_EXHAUSTED;
565      *(*toP)++ = *(*fromP)++;
566    }
567  }
568}
569
570static enum XML_Convert_Result PTRCALL
571latin1_toUtf16(const ENCODING *UNUSED_P(enc),
572               const char **fromP, const char *fromLim,
573               unsigned short **toP, const unsigned short *toLim)
574{
575  while (*fromP < fromLim && *toP < toLim)
576    *(*toP)++ = (unsigned char)*(*fromP)++;
577
578  if ((*toP == toLim) && (*fromP < fromLim))
579    return XML_CONVERT_OUTPUT_EXHAUSTED;
580  else
581    return XML_CONVERT_COMPLETED;
582}
583
#ifdef XML_NS

/* Latin-1 encoding table compiled only when XML_NS is defined.  The byte
   type table is asciitab.h followed by latin1tab.h, with BT_COLON left at
   its own value; the multi-byte predicate slots are all NULL. */
static const struct normal_encoding latin1_encoding_ns = {
  { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
  {
#include "asciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(sb_) NULL_VTABLE
};

#endif
596
/* Latin-1 encoding table.  BT_COLON is defined as BT_NMSTRT while
   asciitab.h is included; the multi-byte predicate slots are all NULL. */
static const struct normal_encoding latin1_encoding = {
  { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
  {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
  },
  STANDARD_VTABLE(sb_) NULL_VTABLE
};
607
608static enum XML_Convert_Result PTRCALL
609ascii_toUtf8(const ENCODING *UNUSED_P(enc),
610             const char **fromP, const char *fromLim,
611             char **toP, const char *toLim)
612{
613  while (*fromP < fromLim && *toP < toLim)
614    *(*toP)++ = *(*fromP)++;
615
616  if ((*toP == toLim) && (*fromP < fromLim))
617    return XML_CONVERT_OUTPUT_EXHAUSTED;
618  else
619    return XML_CONVERT_COMPLETED;
620}
621
#ifdef XML_NS

/* ASCII encoding table compiled only when XML_NS is defined.  Only the
   entries supplied by asciitab.h are written out; the remaining entries
   of the 256-element type array are zero-initialized, which is
   BT_NONXML.  UTF-16 output reuses latin1_toUtf16. */
static const struct normal_encoding ascii_encoding_ns = {
  { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
  {
#include "asciitab.h"
/* BT_NONXML == 0 */
  },
  STANDARD_VTABLE(sb_) NULL_VTABLE
};

#endif
634
/* ASCII encoding table.  BT_COLON is defined as BT_NMSTRT while
   asciitab.h is included.  Entries not supplied by that header are
   zero-initialized, which is BT_NONXML.  UTF-16 output reuses
   latin1_toUtf16. */
static const struct normal_encoding ascii_encoding = {
  { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
  {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
/* BT_NONXML == 0 */
  },
  STANDARD_VTABLE(sb_) NULL_VTABLE
};
645
646static int PTRFASTCALL
647unicode_byte_type(char hi, char lo)
648{
649  switch ((unsigned char)hi) {
650  case 0xD8: case 0xD9: case 0xDA: case 0xDB:
651    return BT_LEAD4;
652  case 0xDC: case 0xDD: case 0xDE: case 0xDF:
653    return BT_TRAIL;
654  case 0xFF:
655    switch ((unsigned char)lo) {
656    case 0xFF:
657    case 0xFE:
658      return BT_NONXML;
659    }
660    break;
661  }
662  return BT_NONASCII;
663}
664
/* Define the function E##toUtf8, which converts UTF-16 input to UTF-8.
   The byte order is fixed by whatever GET_LO / GET_HI are defined as
   where the macro is expanded.

   The input length is first rounded down to an even number of bytes.
   Each code unit is then encoded according to its high byte:
     - high byte 0 and low byte below 0x80: one output byte;
     - high byte 0..7 otherwise: two output bytes;
     - high byte 0xD8..0xDB: combined with the following code unit into
       four output bytes (both units are consumed);
     - any other high byte: three output bytes.
   Before anything is written for a character the remaining output space
   is checked; if it is too small, *fromP is set to the start of that
   character and XML_CONVERT_OUTPUT_EXHAUSTED is returned.  If the second
   unit of a pair is missing, XML_CONVERT_INPUT_INCOMPLETE is returned
   the same way.  *toP is advanced as bytes are written.
*/
#define DEFINE_UTF16_TO_UTF8(E) \
static enum XML_Convert_Result  PTRCALL \
E ## toUtf8(const ENCODING *UNUSED_P(enc), \
            const char **fromP, const char *fromLim, \
            char **toP, const char *toLim) \
{ \
  const char *from = *fromP; \
  fromLim = from + (((fromLim - from) >> 1) << 1);  /* shrink to even */ \
  for (; from < fromLim; from += 2) { \
    int plane; \
    unsigned char lo2; \
    unsigned char lo = GET_LO(from); \
    unsigned char hi = GET_HI(from); \
    switch (hi) { \
    case 0: \
      if (lo < 0x80) { \
        if (*toP == toLim) { \
          *fromP = from; \
          return XML_CONVERT_OUTPUT_EXHAUSTED; \
        } \
        *(*toP)++ = lo; \
        break; \
      } \
      /* fall through */ \
    case 0x1: case 0x2: case 0x3: \
    case 0x4: case 0x5: case 0x6: case 0x7: \
      if (toLim -  *toP < 2) { \
        *fromP = from; \
        return XML_CONVERT_OUTPUT_EXHAUSTED; \
      } \
      *(*toP)++ = ((lo >> 6) | (hi << 2) |  UTF8_cval2); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    default: \
      if (toLim -  *toP < 3)  { \
        *fromP = from; \
        return XML_CONVERT_OUTPUT_EXHAUSTED; \
      } \
      /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
      *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
      *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
      if (toLim -  *toP < 4) { \
        *fromP = from; \
        return XML_CONVERT_OUTPUT_EXHAUSTED; \
      } \
      if (fromLim - from < 4) { \
        *fromP = from; \
        return XML_CONVERT_INPUT_INCOMPLETE; \
      } \
      plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
      *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
      *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
      from += 2; \
      lo2 = GET_LO(from); \
      *(*toP)++ = (((lo & 0x3) << 4) \
                   | ((GET_HI(from) & 0x3) << 2) \
                   | (lo2 >> 6) \
                   | 0x80); \
      *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
      break; \
    } \
  } \
  *fromP = from; \
  if (from < fromLim) \
    return XML_CONVERT_INPUT_INCOMPLETE; \
  else \
    return XML_CONVERT_COMPLETED; \
}
736
/* Define the function E##toUtf16, which copies UTF-16 input (byte order
   given by GET_LO / GET_HI at the point of expansion) into an array of
   unsigned short code units.

   The input length is first rounded down to an even number of bytes.  If
   there is more input than the output can hold and the last input unit
   has a high byte of 0xD8..0xDF, that unit is left out and the result
   defaults to XML_CONVERT_INPUT_INCOMPLETE, so that only the first half
   of a surrogate pair is never copied on its own.  Returns
   XML_CONVERT_OUTPUT_EXHAUSTED if the output filled up while input
   remained.  *fromP and *toP are advanced as units are copied.
*/
#define DEFINE_UTF16_TO_UTF16(E) \
static enum XML_Convert_Result  PTRCALL \
E ## toUtf16(const ENCODING *UNUSED_P(enc), \
             const char **fromP, const char *fromLim, \
             unsigned short **toP, const unsigned short *toLim) \
{ \
  enum XML_Convert_Result res = XML_CONVERT_COMPLETED; \
  fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1);  /* shrink to even */ \
  /* Avoid copying first half only of surrogate */ \
  if (fromLim - *fromP > ((toLim - *toP) << 1) \
      && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) { \
    fromLim -= 2; \
    res = XML_CONVERT_INPUT_INCOMPLETE; \
  } \
  for (; *fromP < fromLim && *toP < toLim; *fromP += 2) \
    *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
  if ((*toP == toLim) && (*fromP < fromLim)) \
    return XML_CONVERT_OUTPUT_EXHAUSTED; \
  else \
    return res; \
}
758
/* Little-endian byte access: the low byte of a code unit is stored
   first.  With these in effect the two DEFINE_ macros generate
   little2_toUtf8 and little2_toUtf16; the accessors are then removed so
   the big-endian variants can be defined. */
#define SET2(ptr, ch) \
  (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
#define GET_LO(ptr) ((unsigned char)(ptr)[0])
#define GET_HI(ptr) ((unsigned char)(ptr)[1])

DEFINE_UTF16_TO_UTF8(little2_)
DEFINE_UTF16_TO_UTF16(little2_)

#undef SET2
#undef GET_LO
#undef GET_HI
770
/* Big-endian byte access: the high byte of a code unit is stored first.
   With these in effect the two DEFINE_ macros generate big2_toUtf8 and
   big2_toUtf16; the accessors are removed again afterwards. */
#define SET2(ptr, ch) \
  (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
#define GET_LO(ptr) ((unsigned char)(ptr)[1])
#define GET_HI(ptr) ((unsigned char)(ptr)[0])

DEFINE_UTF16_TO_UTF8(big2_)
DEFINE_UTF16_TO_UTF16(big2_)

#undef SET2
#undef GET_LO
#undef GET_HI
782
/* Per-character helpers for little-endian 2-byte code units: (p)[0] is
   the low byte and (p)[1] the high byte.  All arguments are fully
   parenthesized so any expression may be passed; p is evaluated more
   than once and must not have side effects. */

/* Byte type of the code unit at p: when the high byte is 0 it is looked
   up in the encoding's (read-only) type table, otherwise it is derived
   from the two bytes by unicode_byte_type(). */
#define LITTLE2_BYTE_TYPE(enc, p) \
 ((p)[1] == 0 \
  ? ((const struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
  : unicode_byte_type((p)[1], (p)[0]))
/* The low byte when the high byte is 0, otherwise -1. */
#define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
/* Nonzero when the code unit at p equals the ASCII character c. */
#define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == (c))
/* Name / name-start tests through the UCS-2 naming bitmap. */
#define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])
793
794#ifdef XML_MIN_SIZE
795
796static int PTRFASTCALL
797little2_byteType(const ENCODING *enc, const char *p)
798{
799  return LITTLE2_BYTE_TYPE(enc, p);
800}
801
802static int PTRFASTCALL
803little2_byteToAscii(const ENCODING *enc, const char *p)
804{
805  return LITTLE2_BYTE_TO_ASCII(enc, p);
806}
807
808static int PTRCALL
809little2_charMatches(const ENCODING *enc, const char *p, int c)
810{
811  return LITTLE2_CHAR_MATCHES(enc, p, c);
812}
813
814static int PTRFASTCALL
815little2_isNameMin(const ENCODING *enc, const char *p)
816{
817  return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
818}
819
820static int PTRFASTCALL
821little2_isNmstrtMin(const ENCODING *enc, const char *p)
822{
823  return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
824}
825
826#undef VTABLE
827#define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
828
829#else /* not XML_MIN_SIZE */
830
/* Without XML_MIN_SIZE, instantiate a second copy of the tokenizer for
   little-endian 2-byte code units: PREFIX() now yields little2_* names
   and the per-character macros map onto the LITTLE2_* helpers.  The
   multi-byte predicates are constant 0 for this instantiation. */
#undef PREFIX
#define PREFIX(ident) little2_ ## ident
#define MINBPC(enc) 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
#define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
#define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
#define IS_NAME_CHAR(enc, p, n) 0
#define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
#define IS_NMSTRT_CHAR(enc, p, n) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)

#define XML_TOK_IMPL_C
#include "xmltok_impl.c"
#undef XML_TOK_IMPL_C

/* Remove the per-instantiation macros again. */
#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR
856
857#endif /* not XML_MIN_SIZE */
858
#ifdef XML_NS

/* Little-endian 2-byte encoding table compiled only when XML_NS is
   defined.  The last value of the ENCODING initializer is 1 only when
   BYTEORDER is 1234.  The byte type table (asciitab.h + latin1tab.h) is
   the one LITTLE2_BYTE_TYPE consults for code units whose high byte
   is 0. */
static const struct normal_encoding little2_encoding_ns = {
  { VTABLE, 2, 0,
#if BYTEORDER == 1234
    1
#else
    0
#endif
  },
  {
#include "asciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(little2_) NULL_VTABLE
};

#endif
877
/* Little-endian 2-byte encoding table.  The last value of the ENCODING
   initializer is 1 only when BYTEORDER is 1234.  BT_COLON is defined as
   BT_NMSTRT while asciitab.h is included. */
static const struct normal_encoding little2_encoding = {
  { VTABLE, 2, 0,
#if BYTEORDER == 1234
    1
#else
    0
#endif
  },
  {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
  },
  STANDARD_VTABLE(little2_) NULL_VTABLE
};
894
895#if BYTEORDER != 4321
896
#ifdef XML_NS

/* Variant of little2_encoding_ns whose byte type table starts from
   iasciitab.h and whose last ENCODING initializer value is always 1.
   Compiled only when XML_NS is defined and BYTEORDER is not 4321. */
static const struct normal_encoding internal_little2_encoding_ns = {
  { VTABLE, 2, 0, 1 },
  {
#include "iasciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(little2_) NULL_VTABLE
};

#endif
909
/* Variant of little2_encoding whose byte type table starts from
   iasciitab.h (BT_COLON mapped to BT_NMSTRT) and whose last ENCODING
   initializer value is always 1.  Compiled only when BYTEORDER is not
   4321. */
static const struct normal_encoding internal_little2_encoding = {
  { VTABLE, 2, 0, 1 },
  {
#define BT_COLON BT_NMSTRT
#include "iasciitab.h"
#undef BT_COLON
#include "latin1tab.h"
  },
  STANDARD_VTABLE(little2_) NULL_VTABLE
};
920
921#endif
922
923
/* Per-character helpers for big-endian 2-byte code units: (p)[0] is the
   high byte and (p)[1] the low byte.  All arguments are fully
   parenthesized so any expression may be passed; p is evaluated more
   than once and must not have side effects. */

/* Byte type of the code unit at p: when the high byte is 0 it is looked
   up in the encoding's (read-only) type table, otherwise it is derived
   from the two bytes by unicode_byte_type(). */
#define BIG2_BYTE_TYPE(enc, p) \
 ((p)[0] == 0 \
  ? ((const struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
  : unicode_byte_type((p)[0], (p)[1]))
/* The low byte when the high byte is 0, otherwise -1. */
#define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
/* Nonzero when the code unit at p equals the ASCII character c. */
#define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == (c))
/* Name / name-start tests through the UCS-2 naming bitmap. */
#define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])
934
935#ifdef XML_MIN_SIZE
936
937static int PTRFASTCALL
938big2_byteType(const ENCODING *enc, const char *p)
939{
940  return BIG2_BYTE_TYPE(enc, p);
941}
942
943static int PTRFASTCALL
944big2_byteToAscii(const ENCODING *enc, const char *p)
945{
946  return BIG2_BYTE_TO_ASCII(enc, p);
947}
948
949static int PTRCALL
950big2_charMatches(const ENCODING *enc, const char *p, int c)
951{
952  return BIG2_CHAR_MATCHES(enc, p, c);
953}
954
955static int PTRFASTCALL
956big2_isNameMin(const ENCODING *enc, const char *p)
957{
958  return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
959}
960
961static int PTRFASTCALL
962big2_isNmstrtMin(const ENCODING *enc, const char *p)
963{
964  return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
965}
966
967#undef VTABLE
968#define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
969
970#else /* not XML_MIN_SIZE */
971
/* Without XML_MIN_SIZE, instantiate another copy of the tokenizer for
   big-endian 2-byte code units: PREFIX() now yields big2_* names and the
   per-character macros map onto the BIG2_* helpers.  The multi-byte
   predicates are constant 0 for this instantiation. */
#undef PREFIX
#define PREFIX(ident) big2_ ## ident
#define MINBPC(enc) 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
#define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
#define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
#define IS_NAME_CHAR(enc, p, n) 0
#define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
#define IS_NMSTRT_CHAR(enc, p, n) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)

#define XML_TOK_IMPL_C
#include "xmltok_impl.c"
#undef XML_TOK_IMPL_C

/* Remove the per-instantiation macros again. */
#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR
997
998#endif /* not XML_MIN_SIZE */
999
#ifdef XML_NS

/* Big-endian 2-byte encoding table compiled only when XML_NS is defined.
   The last value of the ENCODING initializer is 1 only when BYTEORDER is
   4321.  The byte type table (asciitab.h + latin1tab.h) is the one
   BIG2_BYTE_TYPE consults for code units whose high byte is 0. */
static const struct normal_encoding big2_encoding_ns = {
  { VTABLE, 2, 0,
#if BYTEORDER == 4321
  1
#else
  0
#endif
  },
  {
#include "asciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(big2_) NULL_VTABLE
};

#endif
1018
/* Big-endian 2-byte encoding table.  The last value of the ENCODING
   initializer is 1 only when BYTEORDER is 4321.  BT_COLON is defined as
   BT_NMSTRT while asciitab.h is included. */
static const struct normal_encoding big2_encoding = {
  { VTABLE, 2, 0,
#if BYTEORDER == 4321
  1
#else
  0
#endif
  },
  {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
  },
  STANDARD_VTABLE(big2_) NULL_VTABLE
};
1035
1036#if BYTEORDER != 1234
1037
#ifdef XML_NS

/* Variant of big2_encoding_ns whose byte type table starts from
   iasciitab.h and whose last ENCODING initializer value is always 1.
   Compiled only when XML_NS is defined and BYTEORDER is not 1234. */
static const struct normal_encoding internal_big2_encoding_ns = {
  { VTABLE, 2, 0, 1 },
  {
#include "iasciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(big2_) NULL_VTABLE
};

#endif
1050
/* Variant of big2_encoding whose byte type table starts from iasciitab.h
   (BT_COLON mapped to BT_NMSTRT) and whose last ENCODING initializer
   value is always 1.  Compiled only when BYTEORDER is not 1234. */
static const struct normal_encoding internal_big2_encoding = {
  { VTABLE, 2, 0, 1 },
  {
#define BT_COLON BT_NMSTRT
#include "iasciitab.h"
#undef BT_COLON
#include "latin1tab.h"
  },
  STANDARD_VTABLE(big2_) NULL_VTABLE
};
1061
1062#endif
1063
1064#undef PREFIX
1065
1066static int FASTCALL
1067streqci(const char *s1, const char *s2)
1068{
1069  for (;;) {
1070    char c1 = *s1++;
1071    char c2 = *s2++;
1072    if (ASCII_a <= c1 && c1 <= ASCII_z)
1073      c1 += ASCII_A - ASCII_a;
1074    if (ASCII_a <= c2 && c2 <= ASCII_z)
1075      /* The following line will never get executed.  streqci() is
1076       * only called from two places, both of which guarantee to put
1077       * upper-case strings into s2.
1078       */
1079      c2 += ASCII_A - ASCII_a; /* LCOV_EXCL_LINE */
1080    if (c1 != c2)
1081      return 0;
1082    if (!c1)
1083      break;
1084  }
1085  return 1;
1086}
1087
/* updatePosition callback that ignores the encoding it is given and
   instead runs normal_updatePosition with utf8_encoding over the range
   [ptr, end), updating *pos. */
static void PTRCALL
initUpdatePosition(const ENCODING *UNUSED_P(enc), const char *ptr,
                   const char *end, POSITION *pos)
{
  normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
}
1094
1095static int
1096toAscii(const ENCODING *enc, const char *ptr, const char *end)
1097{
1098  char buf[1];
1099  char *p = buf;
1100  XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
1101  if (p == buf)
1102    return -1;
1103  else
1104    return buf[0];
1105}
1106
1107static int FASTCALL
1108isSpace(int c)
1109{
1110  switch (c) {
1111  case 0x20:
1112  case 0xD:
1113  case 0xA:
1114  case 0x9:
1115    return 1;
1116  }
1117  return 0;
1118}
1119
1120/* Return 1 if there's just optional white space or there's an S
1121   followed by name=val.
1122*/
1123static int
1124parsePseudoAttribute(const ENCODING *enc,
1125                     const char *ptr,
1126                     const char *end,
1127                     const char **namePtr,
1128                     const char **nameEndPtr,
1129                     const char **valPtr,
1130                     const char **nextTokPtr)
1131{
1132  int c;
1133  char open;
1134  if (ptr == end) {
1135    *namePtr = NULL;
1136    return 1;
1137  }
1138  if (!isSpace(toAscii(enc, ptr, end))) {
1139    *nextTokPtr = ptr;
1140    return 0;
1141  }
1142  do {
1143    ptr += enc->minBytesPerChar;
1144  } while (isSpace(toAscii(enc, ptr, end)));
1145  if (ptr == end) {
1146    *namePtr = NULL;
1147    return 1;
1148  }
1149  *namePtr = ptr;
1150  for (;;) {
1151    c = toAscii(enc, ptr, end);
1152    if (c == -1) {
1153      *nextTokPtr = ptr;
1154      return 0;
1155    }
1156    if (c == ASCII_EQUALS) {
1157      *nameEndPtr = ptr;
1158      break;
1159    }
1160    if (isSpace(c)) {
1161      *nameEndPtr = ptr;
1162      do {
1163        ptr += enc->minBytesPerChar;
1164      } while (isSpace(c = toAscii(enc, ptr, end)));
1165      if (c != ASCII_EQUALS) {
1166        *nextTokPtr = ptr;
1167        return 0;
1168      }
1169      break;
1170    }
1171    ptr += enc->minBytesPerChar;
1172  }
1173  if (ptr == *namePtr) {
1174    *nextTokPtr = ptr;
1175    return 0;
1176  }
1177  ptr += enc->minBytesPerChar;
1178  c = toAscii(enc, ptr, end);
1179  while (isSpace(c)) {
1180    ptr += enc->minBytesPerChar;
1181    c = toAscii(enc, ptr, end);
1182  }
1183  if (c != ASCII_QUOT && c != ASCII_APOS) {
1184    *nextTokPtr = ptr;
1185    return 0;
1186  }
1187  open = (char)c;
1188  ptr += enc->minBytesPerChar;
1189  *valPtr = ptr;
1190  for (;; ptr += enc->minBytesPerChar) {
1191    c = toAscii(enc, ptr, end);
1192    if (c == open)
1193      break;
1194    if (!(ASCII_a <= c && c <= ASCII_z)
1195        && !(ASCII_A <= c && c <= ASCII_Z)
1196        && !(ASCII_0 <= c && c <= ASCII_9)
1197        && c != ASCII_PERIOD
1198        && c != ASCII_MINUS
1199        && c != ASCII_UNDERSCORE) {
1200      *nextTokPtr = ptr;
1201      return 0;
1202    }
1203  }
1204  *nextTokPtr = ptr + enc->minBytesPerChar;
1205  return 1;
1206}
1207
/* Keyword strings compared against pseudo-attribute names and values in
   doParseXmlDecl() via XmlNameMatchesAscii().  They are spelled with
   ASCII_* constants instead of character literals (presumably so the
   bytes are ASCII whatever the compiler's execution character set is;
   confirm against the ASCII_* definitions).
*/

/* Pseudo-attribute name "version". */
static const char KW_version[] = {
  ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'
};

/* Pseudo-attribute name "encoding". */
static const char KW_encoding[] = {
  ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, ASCII_i, ASCII_n, ASCII_g, '\0'
};

/* Pseudo-attribute name "standalone". */
static const char KW_standalone[] = {
  ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, ASCII_l, ASCII_o,
  ASCII_n, ASCII_e, '\0'
};

/* Accepted standalone value "yes". */
static const char KW_yes[] = {
  ASCII_y, ASCII_e, ASCII_s,  '\0'
};

/* Accepted standalone value "no". */
static const char KW_no[] = {
  ASCII_n, ASCII_o,  '\0'
};
1228
1229static int
1230doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *,
1231                                                 const char *,
1232                                                 const char *),
1233               int isGeneralTextEntity,
1234               const ENCODING *enc,
1235               const char *ptr,
1236               const char *end,
1237               const char **badPtr,
1238               const char **versionPtr,
1239               const char **versionEndPtr,
1240               const char **encodingName,
1241               const ENCODING **encoding,
1242               int *standalone)
1243{
1244  const char *val = NULL;
1245  const char *name = NULL;
1246  const char *nameEnd = NULL;
1247  ptr += 5 * enc->minBytesPerChar;
1248  end -= 2 * enc->minBytesPerChar;
1249  if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
1250      || !name) {
1251    *badPtr = ptr;
1252    return 0;
1253  }
1254  if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
1255    if (!isGeneralTextEntity) {
1256      *badPtr = name;
1257      return 0;
1258    }
1259  }
1260  else {
1261    if (versionPtr)
1262      *versionPtr = val;
1263    if (versionEndPtr)
1264      *versionEndPtr = ptr;
1265    if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1266      *badPtr = ptr;
1267      return 0;
1268    }
1269    if (!name) {
1270      if (isGeneralTextEntity) {
1271        /* a TextDecl must have an EncodingDecl */
1272        *badPtr = ptr;
1273        return 0;
1274      }
1275      return 1;
1276    }
1277  }
1278  if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
1279    int c = toAscii(enc, val, end);
1280    if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z)) {
1281      *badPtr = val;
1282      return 0;
1283    }
1284    if (encodingName)
1285      *encodingName = val;
1286    if (encoding)
1287      *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1288    if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1289      *badPtr = ptr;
1290      return 0;
1291    }
1292    if (!name)
1293      return 1;
1294  }
1295  if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
1296      || isGeneralTextEntity) {
1297    *badPtr = name;
1298    return 0;
1299  }
1300  if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
1301    if (standalone)
1302      *standalone = 1;
1303  }
1304  else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
1305    if (standalone)
1306      *standalone = 0;
1307  }
1308  else {
1309    *badPtr = val;
1310    return 0;
1311  }
1312  while (isSpace(toAscii(enc, ptr, end)))
1313    ptr += enc->minBytesPerChar;
1314  if (ptr != end) {
1315    *badPtr = ptr;
1316    return 0;
1317  }
1318  return 1;
1319}
1320
1321static int FASTCALL
1322checkCharRefNumber(int result)
1323{
1324  switch (result >> 8) {
1325  case 0xD8: case 0xD9: case 0xDA: case 0xDB:
1326  case 0xDC: case 0xDD: case 0xDE: case 0xDF:
1327    return -1;
1328  case 0:
1329    if (latin1_encoding.type[result] == BT_NONXML)
1330      return -1;
1331    break;
1332  case 0xFF:
1333    if (result == 0xFFFE || result == 0xFFFF)
1334      return -1;
1335    break;
1336  }
1337  return result;
1338}
1339
1340int FASTCALL
1341XmlUtf8Encode(int c, char *buf)
1342{
1343  enum {
1344    /* minN is minimum legal resulting value for N byte sequence */
1345    min2 = 0x80,
1346    min3 = 0x800,
1347    min4 = 0x10000
1348  };
1349
1350  if (c < 0)
1351    return 0; /* LCOV_EXCL_LINE: this case is always eliminated beforehand */
1352  if (c < min2) {
1353    buf[0] = (char)(c | UTF8_cval1);
1354    return 1;
1355  }
1356  if (c < min3) {
1357    buf[0] = (char)((c >> 6) | UTF8_cval2);
1358    buf[1] = (char)((c & 0x3f) | 0x80);
1359    return 2;
1360  }
1361  if (c < min4) {
1362    buf[0] = (char)((c >> 12) | UTF8_cval3);
1363    buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
1364    buf[2] = (char)((c & 0x3f) | 0x80);
1365    return 3;
1366  }
1367  if (c < 0x110000) {
1368    buf[0] = (char)((c >> 18) | UTF8_cval4);
1369    buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
1370    buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
1371    buf[3] = (char)((c & 0x3f) | 0x80);
1372    return 4;
1373  }
1374  return 0; /* LCOV_EXCL_LINE: this case too is eliminated before calling */
1375}
1376
1377int FASTCALL
1378XmlUtf16Encode(int charNum, unsigned short *buf)
1379{
1380  if (charNum < 0)
1381    return 0;
1382  if (charNum < 0x10000) {
1383    buf[0] = (unsigned short)charNum;
1384    return 1;
1385  }
1386  if (charNum < 0x110000) {
1387    charNum -= 0x10000;
1388    buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
1389    buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
1390    return 2;
1391  }
1392  return 0;
1393}
1394
/* Encoding object for an encoding described by the caller; it is filled
   in by XmlInitUnknownEncoding().
*/
struct unknown_encoding {
  /* Kept as the first member: XmlInitUnknownEncoding() hands out
     &normal.enc and AS_UNKNOWN_ENCODING() casts that pointer back.
     NOTE(review): this relies on enc being the first member of
     struct normal_encoding; confirm at its definition. */
  struct normal_encoding normal;
  /* Callback invoked as convert(userData, p); its int result is used as
     the character value for sequences with no precomputed entry. */
  CONVERTER convert;
  /* Opaque pointer passed back to convert. */
  void *userData;
  /* Per-byte UTF-16 unit; 0 means "ask convert" (see unknown_toUtf16). */
  unsigned short utf16[256];
  /* Per-byte UTF-8: [0] is the length, the following bytes the encoded
     form; length 0 means "ask convert" (see unknown_toUtf8). */
  char utf8[256][4];
};

/* View an ENCODING pointer as the enclosing unknown_encoding object. */
#define AS_UNKNOWN_ENCODING(enc)  ((const struct unknown_encoding *) (enc))
1404
1405int
1406XmlSizeOfUnknownEncoding(void)
1407{
1408  return sizeof(struct unknown_encoding);
1409}
1410
1411static int PTRFASTCALL
1412unknown_isName(const ENCODING *enc, const char *p)
1413{
1414  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1415  int c = uenc->convert(uenc->userData, p);
1416  if (c & ~0xFFFF)
1417    return 0;
1418  return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1419}
1420
1421static int PTRFASTCALL
1422unknown_isNmstrt(const ENCODING *enc, const char *p)
1423{
1424  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1425  int c = uenc->convert(uenc->userData, p);
1426  if (c & ~0xFFFF)
1427    return 0;
1428  return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1429}
1430
1431static int PTRFASTCALL
1432unknown_isInvalid(const ENCODING *enc, const char *p)
1433{
1434  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1435  int c = uenc->convert(uenc->userData, p);
1436  return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1437}
1438
1439static enum XML_Convert_Result PTRCALL
1440unknown_toUtf8(const ENCODING *enc,
1441               const char **fromP, const char *fromLim,
1442               char **toP, const char *toLim)
1443{
1444  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1445  char buf[XML_UTF8_ENCODE_MAX];
1446  for (;;) {
1447    const char *utf8;
1448    int n;
1449    if (*fromP == fromLim)
1450      return XML_CONVERT_COMPLETED;
1451    utf8 = uenc->utf8[(unsigned char)**fromP];
1452    n = *utf8++;
1453    if (n == 0) {
1454      int c = uenc->convert(uenc->userData, *fromP);
1455      n = XmlUtf8Encode(c, buf);
1456      if (n > toLim - *toP)
1457        return XML_CONVERT_OUTPUT_EXHAUSTED;
1458      utf8 = buf;
1459      *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1460                 - (BT_LEAD2 - 2));
1461    }
1462    else {
1463      if (n > toLim - *toP)
1464        return XML_CONVERT_OUTPUT_EXHAUSTED;
1465      (*fromP)++;
1466    }
1467    memcpy(*toP, utf8, n);
1468    *toP += n;
1469  }
1470}
1471
1472static enum XML_Convert_Result PTRCALL
1473unknown_toUtf16(const ENCODING *enc,
1474                const char **fromP, const char *fromLim,
1475                unsigned short **toP, const unsigned short *toLim)
1476{
1477  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1478  while (*fromP < fromLim && *toP < toLim) {
1479    unsigned short c = uenc->utf16[(unsigned char)**fromP];
1480    if (c == 0) {
1481      c = (unsigned short)
1482          uenc->convert(uenc->userData, *fromP);
1483      *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1484                 - (BT_LEAD2 - 2));
1485    }
1486    else
1487      (*fromP)++;
1488    *(*toP)++ = c;
1489  }
1490
1491  if ((*toP == toLim) && (*fromP < fromLim))
1492    return XML_CONVERT_OUTPUT_EXHAUSTED;
1493  else
1494    return XML_CONVERT_COMPLETED;
1495}
1496
1497ENCODING *
1498XmlInitUnknownEncoding(void *mem,
1499                       int *table,
1500                       CONVERTER convert,
1501                       void *userData)
1502{
1503  int i;
1504  struct unknown_encoding *e = (struct unknown_encoding *)mem;
1505  for (i = 0; i < (int)sizeof(struct normal_encoding); i++)
1506    ((char *)mem)[i] = ((char *)&latin1_encoding)[i];
1507  for (i = 0; i < 128; i++)
1508    if (latin1_encoding.type[i] != BT_OTHER
1509        && latin1_encoding.type[i] != BT_NONXML
1510        && table[i] != i)
1511      return 0;
1512  for (i = 0; i < 256; i++) {
1513    int c = table[i];
1514    if (c == -1) {
1515      e->normal.type[i] = BT_MALFORM;
1516      /* This shouldn't really get used. */
1517      e->utf16[i] = 0xFFFF;
1518      e->utf8[i][0] = 1;
1519      e->utf8[i][1] = 0;
1520    }
1521    else if (c < 0) {
1522      if (c < -4)
1523        return 0;
1524      /* Multi-byte sequences need a converter function */
1525      if (!convert)
1526        return 0;
1527      e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
1528      e->utf8[i][0] = 0;
1529      e->utf16[i] = 0;
1530    }
1531    else if (c < 0x80) {
1532      if (latin1_encoding.type[c] != BT_OTHER
1533          && latin1_encoding.type[c] != BT_NONXML
1534          && c != i)
1535        return 0;
1536      e->normal.type[i] = latin1_encoding.type[c];
1537      e->utf8[i][0] = 1;
1538      e->utf8[i][1] = (char)c;
1539      e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
1540    }
1541    else if (checkCharRefNumber(c) < 0) {
1542      e->normal.type[i] = BT_NONXML;
1543      /* This shouldn't really get used. */
1544      e->utf16[i] = 0xFFFF;
1545      e->utf8[i][0] = 1;
1546      e->utf8[i][1] = 0;
1547    }
1548    else {
1549      if (c > 0xFFFF)
1550        return 0;
1551      if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1552        e->normal.type[i] = BT_NMSTRT;
1553      else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1554        e->normal.type[i] = BT_NAME;
1555      else
1556        e->normal.type[i] = BT_OTHER;
1557      e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1558      e->utf16[i] = (unsigned short)c;
1559    }
1560  }
1561  e->userData = userData;
1562  e->convert = convert;
1563  if (convert) {
1564    e->normal.isName2 = unknown_isName;
1565    e->normal.isName3 = unknown_isName;
1566    e->normal.isName4 = unknown_isName;
1567    e->normal.isNmstrt2 = unknown_isNmstrt;
1568    e->normal.isNmstrt3 = unknown_isNmstrt;
1569    e->normal.isNmstrt4 = unknown_isNmstrt;
1570    e->normal.isInvalid2 = unknown_isInvalid;
1571    e->normal.isInvalid3 = unknown_isInvalid;
1572    e->normal.isInvalid4 = unknown_isInvalid;
1573  }
1574  e->normal.enc.utf8Convert = unknown_toUtf8;
1575  e->normal.enc.utf16Convert = unknown_toUtf16;
1576  return &(e->normal.enc);
1577}
1578
/* Indices identifying the built-in encodings.  getEncodingIndex() returns
   the position of a name in its encodingNames array, so ISO_8859_1_ENC
   through UTF_16LE_ENC must stay in the same order as that array; the
   same values index encodingTable in initScan().

   If this enumeration is changed, getEncodingIndex and encodings
   must also be changed. */
enum {
  UNKNOWN_ENC = -1,    /* returned for a name that matches no entry */
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC,
  UTF_8_ENC,
  UTF_16_ENC,
  UTF_16BE_ENC,
  UTF_16LE_ENC,
  /* must match encodingNames up to here */
  NO_ENC               /* returned when no name was supplied (NULL) */
};
1592
/* Upper-case names of the built-in encodings, spelled with ASCII_*
   constants.  getEncodingIndex() compares candidate names against these
   with streqci(), i.e. ignoring ASCII case. */

/* "ISO-8859-1" */
static const char KW_ISO_8859_1[] = {
  ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, ASCII_5, ASCII_9,
  ASCII_MINUS, ASCII_1, '\0'
};
/* "US-ASCII" */
static const char KW_US_ASCII[] = {
  ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, ASCII_C, ASCII_I, ASCII_I,
  '\0'
};
/* "UTF-8" */
static const char KW_UTF_8[] =  {
  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'
};
/* "UTF-16" */
static const char KW_UTF_16[] = {
  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'
};
/* "UTF-16BE" */
static const char KW_UTF_16BE[] = {
  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_B, ASCII_E,
  '\0'
};
/* "UTF-16LE" */
static const char KW_UTF_16LE[] = {
  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_L, ASCII_E,
  '\0'
};
1615
1616static int FASTCALL
1617getEncodingIndex(const char *name)
1618{
1619  static const char * const encodingNames[] = {
1620    KW_ISO_8859_1,
1621    KW_US_ASCII,
1622    KW_UTF_8,
1623    KW_UTF_16,
1624    KW_UTF_16BE,
1625    KW_UTF_16LE,
1626  };
1627  int i;
1628  if (name == NULL)
1629    return NO_ENC;
1630  for (i = 0; i < (int)(sizeof(encodingNames)/sizeof(encodingNames[0])); i++)
1631    if (streqci(name, encodingNames[i]))
1632      return i;
1633  return UNKNOWN_ENC;
1634}
1635
/* For binary compatibility, we store the index of the encoding
   specified at initialization in the isUtf16 member.

   INIT_ENC_INDEX(enc) reads that index back as an int;
   SET_INIT_ENC_INDEX(enc, i) stores i, narrowed to char.  The index
   argument is parenthesized so that an expression argument is cast as a
   whole.
*/

#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1642
1643/* This is what detects the encoding.  encodingTable maps from
1644   encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
1645   the external (protocol) specified encoding; state is
1646   XML_CONTENT_STATE if we're parsing an external text entity, and
1647   XML_PROLOG_STATE otherwise.
1648*/
1649
1650
1651static int
1652initScan(const ENCODING * const *encodingTable,
1653         const INIT_ENCODING *enc,
1654         int state,
1655         const char *ptr,
1656         const char *end,
1657         const char **nextTokPtr)
1658{
1659  const ENCODING **encPtr;
1660
1661  if (ptr >= end)
1662    return XML_TOK_NONE;
1663  encPtr = enc->encPtr;
1664  if (ptr + 1 == end) {
1665    /* only a single byte available for auto-detection */
1666#ifndef XML_DTD /* FIXME */
1667    /* a well-formed document entity must have more than one byte */
1668    if (state != XML_CONTENT_STATE)
1669      return XML_TOK_PARTIAL;
1670#endif
1671    /* so we're parsing an external text entity... */
1672    /* if UTF-16 was externally specified, then we need at least 2 bytes */
1673    switch (INIT_ENC_INDEX(enc)) {
1674    case UTF_16_ENC:
1675    case UTF_16LE_ENC:
1676    case UTF_16BE_ENC:
1677      return XML_TOK_PARTIAL;
1678    }
1679    switch ((unsigned char)*ptr) {
1680    case 0xFE:
1681    case 0xFF:
1682    case 0xEF: /* possibly first byte of UTF-8 BOM */
1683      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1684          && state == XML_CONTENT_STATE)
1685        break;
1686      /* fall through */
1687    case 0x00:
1688    case 0x3C:
1689      return XML_TOK_PARTIAL;
1690    }
1691  }
1692  else {
1693    switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
1694    case 0xFEFF:
1695      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1696          && state == XML_CONTENT_STATE)
1697        break;
1698      *nextTokPtr = ptr + 2;
1699      *encPtr = encodingTable[UTF_16BE_ENC];
1700      return XML_TOK_BOM;
1701    /* 00 3C is handled in the default case */
1702    case 0x3C00:
1703      if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
1704           || INIT_ENC_INDEX(enc) == UTF_16_ENC)
1705          && state == XML_CONTENT_STATE)
1706        break;
1707      *encPtr = encodingTable[UTF_16LE_ENC];
1708      return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1709    case 0xFFFE:
1710      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1711          && state == XML_CONTENT_STATE)
1712        break;
1713      *nextTokPtr = ptr + 2;
1714      *encPtr = encodingTable[UTF_16LE_ENC];
1715      return XML_TOK_BOM;
1716    case 0xEFBB:
1717      /* Maybe a UTF-8 BOM (EF BB BF) */
1718      /* If there's an explicitly specified (external) encoding
1719         of ISO-8859-1 or some flavour of UTF-16
1720         and this is an external text entity,
1721         don't look for the BOM,
1722         because it might be a legal data.
1723      */
1724      if (state == XML_CONTENT_STATE) {
1725        int e = INIT_ENC_INDEX(enc);
1726        if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC
1727            || e == UTF_16LE_ENC || e == UTF_16_ENC)
1728          break;
1729      }
1730      if (ptr + 2 == end)
1731        return XML_TOK_PARTIAL;
1732      if ((unsigned char)ptr[2] == 0xBF) {
1733        *nextTokPtr = ptr + 3;
1734        *encPtr = encodingTable[UTF_8_ENC];
1735        return XML_TOK_BOM;
1736      }
1737      break;
1738    default:
1739      if (ptr[0] == '\0') {
1740        /* 0 isn't a legal data character. Furthermore a document
1741           entity can only start with ASCII characters.  So the only
1742           way this can fail to be big-endian UTF-16 if it it's an
1743           external parsed general entity that's labelled as
1744           UTF-16LE.
1745        */
1746        if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
1747          break;
1748        *encPtr = encodingTable[UTF_16BE_ENC];
1749        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1750      }
1751      else if (ptr[1] == '\0') {
1752        /* We could recover here in the case:
1753            - parsing an external entity
1754            - second byte is 0
1755            - no externally specified encoding
1756            - no encoding declaration
1757           by assuming UTF-16LE.  But we don't, because this would mean when
1758           presented just with a single byte, we couldn't reliably determine
1759           whether we needed further bytes.
1760        */
1761        if (state == XML_CONTENT_STATE)
1762          break;
1763        *encPtr = encodingTable[UTF_16LE_ENC];
1764        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1765      }
1766      break;
1767    }
1768  }
1769  *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
1770  return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1771}
1772
1773
/* First inclusion of xmltok_ns.c.  NS(x) and ns(x) both expand to x
   unchanged, so identifiers that file builds through these macros get no
   suffix.  XML_TOK_NS_C is defined only for the duration of the include
   (NOTE(review): presumably a guard that xmltok_ns.c checks; confirm
   there).  All three macros are removed again afterwards.
*/
#define NS(x) x
#define ns(x) x
#define XML_TOK_NS_C
#include "xmltok_ns.c"
#undef XML_TOK_NS_C
#undef NS
#undef ns
1781
1782#ifdef XML_NS
1783
/* Second inclusion of xmltok_ns.c, compiled only under XML_NS.  This time
   NS(x) appends "NS" and ns(x) appends "_ns" to its argument, so the same
   source yields a second, differently named set of identifiers.
*/
#define NS(x) x ## NS
#define ns(x) x ## _ns

#define XML_TOK_NS_C
#include "xmltok_ns.c"
#undef XML_TOK_NS_C

#undef NS
#undef ns
1793
1794ENCODING *
1795XmlInitUnknownEncodingNS(void *mem,
1796                         int *table,
1797                         CONVERTER convert,
1798                         void *userData)
1799{
1800  ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1801  if (enc)
1802    ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
1803  return enc;
1804}
1805
1806#endif /* XML_NS */
1807