read.c revision 64339d36f8bd4db5025fe2988eda22b491a9219c
1// Copyright (C) 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5*
6*   Copyright (C) 1998-2012, International Business Machines
7*   Corporation and others.  All Rights Reserved.
8*
9*******************************************************************************
10*
11* File read.c
12*
13* Modification History:
14*
15*   Date        Name        Description
16*   05/26/99    stephen     Creation.
17*   5/10/01     Ram         removed ustdio dependency
18*******************************************************************************
19*/
20
21#include "read.h"
22#include "errmsg.h"
23#include "unicode/ustring.h"
24#include "unicode/utf16.h"
25
/* Code points that have syntactic meaning to the tokenizer below. */
#define OPENBRACE    0x007B  /* '{' */
#define CLOSEBRACE   0x007D  /* '}' */
#define COMMA        0x002C  /* ',' */
#define QUOTE        0x0022  /* '"' : delimits quoted strings */
#define ESCAPE       0x005C  /* '\' : introduces an escape sequence */
#define SLASH        0x002F  /* '/' : first character of a comment opener */
#define ASTERISK     0x002A  /* '*' : second character of a block comment opener */
#define SPACE        0x0020  /* ' ' : also inserted between merged string pieces */
#define COLON        0x003A  /* ':' */
#define BADBOM       0xFFFE  /* getNextToken returns TOK_ERROR on this value;
                                presumably a byte-swapped U+FEFF - confirm */
#define CR           0x000D  /* carriage return */
#define LF           0x000A  /* line feed */

/* Current line number of the input.  Set to 1 by resetLineNumber() and
   incremented by isWhitespace(), isNewline() and the quoted-string loop
   of getStringToken() as line terminators are consumed. */
static int32_t lineCount;

/* Protos */
/* Reads a (possibly merged) string token starting with initialChar. */
static enum ETokenType getStringToken(UCHARBUF *buf,
                                      UChar32 initialChar,
                                      struct UString *token,
                                      UErrorCode *status);

/* Character-level helpers; isWhitespace() and isNewline() also update
   lineCount as a side effect. */
static UChar32 getNextChar           (UCHARBUF *buf, UBool skipwhite, struct UString *token, UErrorCode *status);
static void    seekUntilNewline      (UCHARBUF *buf, struct UString *token, UErrorCode *status);
static void    seekUntilEndOfComment (UCHARBUF *buf, struct UString *token, UErrorCode *status);
static UBool   isWhitespace          (UChar32 c);
static UBool   isNewline             (UChar32 c);
52
53U_CFUNC void resetLineNumber() {
54    lineCount = 1;
55}
56
57/* Read and return the next token from the stream.  If the token is of
58   type eString, fill in the token parameter with the token.  If the
59   token is eError, then the status parameter will contain the
60   specific error.  This will be eItemNotFound at the end of file,
61   indicating that all tokens have been returned.  This method will
62   never return eString twice in a row; instead, multiple adjacent
63   string tokens will be merged into one, with no intervening
64   space. */
65U_CFUNC enum ETokenType
66getNextToken(UCHARBUF* buf,
67             struct UString *token,
68             uint32_t *linenumber, /* out: linenumber of token */
69             struct UString *comment,
70             UErrorCode *status) {
71    enum ETokenType result;
72    UChar32         c;
73
74    if (U_FAILURE(*status)) {
75        return TOK_ERROR;
76    }
77
78    /* Skip whitespace */
79    c = getNextChar(buf, TRUE, comment, status);
80
81    if (U_FAILURE(*status)) {
82        return TOK_ERROR;
83    }
84
85    *linenumber = lineCount;
86
87    switch(c) {
88    case BADBOM:
89        return TOK_ERROR;
90    case OPENBRACE:
91        return TOK_OPEN_BRACE;
92    case CLOSEBRACE:
93        return TOK_CLOSE_BRACE;
94    case COMMA:
95        return TOK_COMMA;
96    case U_EOF:
97        return TOK_EOF;
98    case COLON:
99        return TOK_COLON;
100
101    default:
102        result = getStringToken(buf, c, token, status);
103    }
104
105    *linenumber = lineCount;
106    return result;
107}
108
109/* Copy a string token into the given UnicodeString.  Upon entry, we
110   have already read the first character of the string token, which is
111   not a whitespace character (but may be a QUOTE or ESCAPE). This
112   function reads all subsequent characters that belong with this
113   string, and copy them into the token parameter. The other
114   important, and slightly convoluted purpose of this function is to
115   merge adjacent strings.  It looks forward a bit, and if the next
116   non comment, non whitespace item is a string, it reads it in as
117   well.  If two adjacent strings are quoted, they are merged without
118   intervening space.  Otherwise a single SPACE character is
119   inserted. */
120static enum ETokenType getStringToken(UCHARBUF* buf,
121                                      UChar32 initialChar,
122                                      struct UString *token,
123                                      UErrorCode *status) {
124    UBool    lastStringWasQuoted;
125    UChar32  c;
126    UChar    target[3] = { '\0' };
127    UChar    *pTarget   = target;
128    int      len=0;
129    UBool    isFollowingCharEscaped=FALSE;
130    UBool    isNLUnescaped = FALSE;
131    UChar32  prevC=0;
132
133    /* We are guaranteed on entry that initialChar is not a whitespace
134       character. If we are at the EOF, or have some other problem, it
135       doesn't matter; we still want to validly return the initialChar
136       (if nothing else) as a string token. */
137
138    if (U_FAILURE(*status)) {
139        return TOK_ERROR;
140    }
141
142    /* setup */
143    lastStringWasQuoted = FALSE;
144    c = initialChar;
145    ustr_setlen(token, 0, status);
146
147    if (U_FAILURE(*status)) {
148        return TOK_ERROR;
149    }
150
151    for (;;) {
152        if (c == QUOTE) {
153            if (!lastStringWasQuoted && token->fLength > 0) {
154                ustr_ucat(token, SPACE, status);
155
156                if (U_FAILURE(*status)) {
157                    return TOK_ERROR;
158                }
159            }
160
161            lastStringWasQuoted = TRUE;
162
163            for (;;) {
164                c = ucbuf_getc(buf,status);
165
166                /* EOF reached */
167                if (c == U_EOF) {
168                    return TOK_EOF;
169                }
170
171                /* Unterminated quoted strings */
172                if (U_FAILURE(*status)) {
173                    return TOK_ERROR;
174                }
175
176                if (c == QUOTE && !isFollowingCharEscaped) {
177                    break;
178                }
179
180                if (c == ESCAPE  && !isFollowingCharEscaped) {
181                    pTarget = target;
182                    c       = unescape(buf, status);
183
184                    if (c == U_ERR) {
185                        return TOK_ERROR;
186                    }
187                    if(c == CR || c == LF){
188                        isNLUnescaped = TRUE;
189                    }
190                }
191
192                if(c==ESCAPE && !isFollowingCharEscaped){
193                    isFollowingCharEscaped = TRUE;
194                }else{
195                    U_APPEND_CHAR32(c, pTarget,len);
196                    pTarget = target;
197                    ustr_uscat(token, pTarget,len, status);
198                    isFollowingCharEscaped = FALSE;
199                    len=0;
200                    if(c == CR || c == LF){
201                        if(isNLUnescaped == FALSE && prevC!=CR){
202                            lineCount++;
203                        }
204                        isNLUnescaped = FALSE;
205                    }
206                }
207
208                if (U_FAILURE(*status)) {
209                    return TOK_ERROR;
210                }
211                prevC = c;
212            }
213        } else {
214            if (token->fLength > 0) {
215                ustr_ucat(token, SPACE, status);
216
217                if (U_FAILURE(*status)) {
218                    return TOK_ERROR;
219                }
220            }
221
222            if(lastStringWasQuoted){
223                if(getShowWarning()){
224                    warning(lineCount, "Mixing quoted and unquoted strings");
225                }
226                if(isStrict()){
227                    return TOK_ERROR;
228                }
229
230            }
231
232            lastStringWasQuoted = FALSE;
233
234            /* if we reach here we are mixing
235             * quoted and unquoted strings
236             * warn in normal mode and error in
237             * pedantic mode
238             */
239
240            if (c == ESCAPE) {
241                pTarget = target;
242                c       = unescape(buf, status);
243
244                /* EOF reached */
245                if (c == U_EOF) {
246                    return TOK_ERROR;
247                }
248            }
249
250            U_APPEND_CHAR32(c, pTarget,len);
251            pTarget = target;
252            ustr_uscat(token, pTarget,len, status);
253            len=0;
254
255            if (U_FAILURE(*status)) {
256                return TOK_ERROR;
257            }
258
259            for (;;) {
260                /* DON'T skip whitespace */
261                c = getNextChar(buf, FALSE, NULL, status);
262
263                /* EOF reached */
264                if (c == U_EOF) {
265                    ucbuf_ungetc(c, buf);
266                    return TOK_STRING;
267                }
268
269                if (U_FAILURE(*status)) {
270                    return TOK_STRING;
271                }
272
273                if (c == QUOTE
274                        || c == OPENBRACE
275                        || c == CLOSEBRACE
276                        || c == COMMA
277                        || c == COLON) {
278                    ucbuf_ungetc(c, buf);
279                    break;
280                }
281
282                if (isWhitespace(c)) {
283                    break;
284                }
285
286                if (c == ESCAPE) {
287                    pTarget = target;
288                    c       = unescape(buf, status);
289
290                    if (c == U_ERR) {
291                        return TOK_ERROR;
292                    }
293                }
294
295                U_APPEND_CHAR32(c, pTarget,len);
296                pTarget = target;
297                ustr_uscat(token, pTarget,len, status);
298                len=0;
299                if (U_FAILURE(*status)) {
300                    return TOK_ERROR;
301                }
302            }
303        }
304
305        /* DO skip whitespace */
306        c = getNextChar(buf, TRUE, NULL, status);
307
308        if (U_FAILURE(*status)) {
309            return TOK_STRING;
310        }
311
312        if (c == OPENBRACE || c == CLOSEBRACE || c == COMMA || c == COLON) {
313            ucbuf_ungetc(c, buf);
314            return TOK_STRING;
315        }
316    }
317}
318
319/* Retrieve the next character.  If skipwhite is
320   true, whitespace is skipped as well. */
321static UChar32 getNextChar(UCHARBUF* buf,
322                           UBool skipwhite,
323                           struct UString *token,
324                           UErrorCode *status) {
325    UChar32 c, c2;
326
327    if (U_FAILURE(*status)) {
328        return U_EOF;
329    }
330
331    for (;;) {
332        c = ucbuf_getc(buf,status);
333
334        if (c == U_EOF) {
335            return U_EOF;
336        }
337
338        if (skipwhite && isWhitespace(c)) {
339            continue;
340        }
341
342        /* This also handles the get() failing case */
343        if (c != SLASH) {
344            return c;
345        }
346
347        c = ucbuf_getc(buf,status); /* "/c" */
348
349        if (c == U_EOF) {
350            return U_EOF;
351        }
352
353        switch (c) {
354        case SLASH:  /* "//" */
355            seekUntilNewline(buf, NULL, status);
356            break;
357
358        case ASTERISK:  /* " / * " */
359            c2 = ucbuf_getc(buf, status); /* "/ * c" */
360            if(c2 == ASTERISK){  /* "/ * *" */
361                /* parse multi-line comment and store it in token*/
362                seekUntilEndOfComment(buf, token, status);
363            } else {
364                ucbuf_ungetc(c2, buf); /* c2 is the non-asterisk following "/ *".  Include c2  back in buffer.  */
365                seekUntilEndOfComment(buf, NULL, status);
366            }
367            break;
368
369        default:
370            ucbuf_ungetc(c, buf); /* "/c" - put back the c */
371            /* If get() failed this is a NOP */
372            return SLASH;
373        }
374
375    }
376}
377
378static void seekUntilNewline(UCHARBUF* buf,
379                             struct UString *token,
380                             UErrorCode *status) {
381    UChar32 c;
382
383    if (U_FAILURE(*status)) {
384        return;
385    }
386
387    do {
388        c = ucbuf_getc(buf,status);
389        /* add the char to token */
390        if(token!=NULL){
391            ustr_u32cat(token, c, status);
392        }
393    } while (!isNewline(c) && c != U_EOF && *status == U_ZERO_ERROR);
394}
395
396static void seekUntilEndOfComment(UCHARBUF *buf,
397                                  struct UString *token,
398                                  UErrorCode *status) {
399    UChar32  c, d;
400    uint32_t line;
401
402    if (U_FAILURE(*status)) {
403        return;
404    }
405
406    line = lineCount;
407
408    do {
409        c = ucbuf_getc(buf, status);
410
411        if (c == ASTERISK) {
412            d = ucbuf_getc(buf, status);
413
414            if (d != SLASH) {
415                ucbuf_ungetc(d, buf);
416            } else {
417                break;
418            }
419        }
420        /* add the char to token */
421        if(token!=NULL){
422            ustr_u32cat(token, c, status);
423        }
424        /* increment the lineCount */
425        isNewline(c);
426
427    } while (c != U_EOF && *status == U_ZERO_ERROR);
428
429    if (c == U_EOF) {
430        *status = U_INVALID_FORMAT_ERROR;
431        error(line, "unterminated comment detected");
432    }
433}
434
435U_CFUNC UChar32 unescape(UCHARBUF *buf, UErrorCode *status) {
436    if (U_FAILURE(*status)) {
437        return U_EOF;
438    }
439
440    /* We expect to be called after the ESCAPE has been seen, but
441     * u_fgetcx needs an ESCAPE to do its magic. */
442    ucbuf_ungetc(ESCAPE, buf);
443
444    return ucbuf_getcx32(buf, status);
445}
446
447static UBool isWhitespace(UChar32 c) {
448    switch (c) {
449        /* ' ', '\t', '\n', '\r', 0x2029, 0xFEFF */
450    case 0x000A:
451    case 0x2029:
452        lineCount++;
453    case 0x000D:
454    case 0x0020:
455    case 0x0009:
456    case 0xFEFF:
457        return TRUE;
458
459    default:
460        return FALSE;
461    }
462}
463
464static UBool isNewline(UChar32 c) {
465    switch (c) {
466        /* '\n', '\r', 0x2029 */
467    case 0x000A:
468    case 0x2029:
469        lineCount++;
470    case 0x000D:
471        return TRUE;
472
473    default:
474        return FALSE;
475    }
476}
477