1/*
2*******************************************************************************
3*
4*   Copyright (C) 1998-2012, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*
9* File read.c
10*
11* Modification History:
12*
13*   Date        Name        Description
14*   05/26/99    stephen     Creation.
15*   5/10/01     Ram         removed ustdio dependency
16*******************************************************************************
17*/
18
19#include "read.h"
20#include "errmsg.h"
21#include "unicode/ustring.h"
22#include "unicode/utf16.h"
23
/* Code points of the characters the tokenizer in this file tests for. */
#define OPENBRACE    0x007B  /* '{' */
#define CLOSEBRACE   0x007D  /* '}' */
#define COMMA        0x002C  /* ',' */
#define QUOTE        0x0022  /* '"' - delimits quoted strings */
#define ESCAPE       0x005C  /* '\' - introduces an escape sequence */
#define SLASH        0x002F  /* '/' - first character of a comment opener */
#define ASTERISK     0x002A  /* '*' - second character of a block comment opener */
#define SPACE        0x0020  /* ' ' - inserted between merged string pieces */
#define COLON        0x003A  /* ':' */
#define BADBOM       0xFFFE  /* getNextToken() returns TOK_ERROR for this value;
                                presumably a byte-swapped U+FEFF byte order mark */
#define CR           0x000D  /* carriage return */
#define LF           0x000A  /* line feed */

/* Line counter for the input being tokenized.  Set to 1 by
   resetLineNumber() and incremented by isWhitespace(), isNewline() and
   getStringToken() as line terminators are consumed. */
static int32_t lineCount;
38
/* Protos */

/* Collects a string token whose first character (initialChar) has
   already been read, merging adjacent string pieces into token. */
static enum ETokenType getStringToken(UCHARBUF *buf,
                                      UChar32 initialChar,
                                      struct UString *token,
                                      UErrorCode *status);

/* Returns the next character that is not part of a comment; also skips whitespace when skipwhite is true. */
static UChar32 getNextChar           (UCHARBUF *buf, UBool skipwhite, struct UString *token, UErrorCode *status);
/* Consumes input up to and including the next CR, LF or U+2029 (or until EOF). */
static void    seekUntilNewline      (UCHARBUF *buf, struct UString *token, UErrorCode *status);
/* Consumes input up to and including the terminator of a block comment. */
static void    seekUntilEndOfComment (UCHARBUF *buf, struct UString *token, UErrorCode *status);
/* Character classifiers.  Both increment lineCount as a side effect when c is LF or U+2029. */
static UBool   isWhitespace          (UChar32 c);
static UBool   isNewline             (UChar32 c);
50
51U_CFUNC void resetLineNumber() {
52    lineCount = 1;
53}
54
55/* Read and return the next token from the stream.  If the token is of
56   type eString, fill in the token parameter with the token.  If the
57   token is eError, then the status parameter will contain the
58   specific error.  This will be eItemNotFound at the end of file,
59   indicating that all tokens have been returned.  This method will
60   never return eString twice in a row; instead, multiple adjacent
61   string tokens will be merged into one, with no intervening
62   space. */
63U_CFUNC enum ETokenType
64getNextToken(UCHARBUF* buf,
65             struct UString *token,
66             uint32_t *linenumber, /* out: linenumber of token */
67             struct UString *comment,
68             UErrorCode *status) {
69    enum ETokenType result;
70    UChar32         c;
71
72    if (U_FAILURE(*status)) {
73        return TOK_ERROR;
74    }
75
76    /* Skip whitespace */
77    c = getNextChar(buf, TRUE, comment, status);
78
79    if (U_FAILURE(*status)) {
80        return TOK_ERROR;
81    }
82
83    *linenumber = lineCount;
84
85    switch(c) {
86    case BADBOM:
87        return TOK_ERROR;
88    case OPENBRACE:
89        return TOK_OPEN_BRACE;
90    case CLOSEBRACE:
91        return TOK_CLOSE_BRACE;
92    case COMMA:
93        return TOK_COMMA;
94    case U_EOF:
95        return TOK_EOF;
96    case COLON:
97        return TOK_COLON;
98
99    default:
100        result = getStringToken(buf, c, token, status);
101    }
102
103    *linenumber = lineCount;
104    return result;
105}
106
107/* Copy a string token into the given UnicodeString.  Upon entry, we
108   have already read the first character of the string token, which is
109   not a whitespace character (but may be a QUOTE or ESCAPE). This
110   function reads all subsequent characters that belong with this
111   string, and copy them into the token parameter. The other
112   important, and slightly convoluted purpose of this function is to
113   merge adjacent strings.  It looks forward a bit, and if the next
114   non comment, non whitespace item is a string, it reads it in as
115   well.  If two adjacent strings are quoted, they are merged without
116   intervening space.  Otherwise a single SPACE character is
117   inserted. */
118static enum ETokenType getStringToken(UCHARBUF* buf,
119                                      UChar32 initialChar,
120                                      struct UString *token,
121                                      UErrorCode *status) {
122    UBool    lastStringWasQuoted;
123    UChar32  c;
124    UChar    target[3] = { '\0' };
125    UChar    *pTarget   = target;
126    int      len=0;
127    UBool    isFollowingCharEscaped=FALSE;
128    UBool    isNLUnescaped = FALSE;
129    UChar32  prevC=0;
130
131    /* We are guaranteed on entry that initialChar is not a whitespace
132       character. If we are at the EOF, or have some other problem, it
133       doesn't matter; we still want to validly return the initialChar
134       (if nothing else) as a string token. */
135
136    if (U_FAILURE(*status)) {
137        return TOK_ERROR;
138    }
139
140    /* setup */
141    lastStringWasQuoted = FALSE;
142    c = initialChar;
143    ustr_setlen(token, 0, status);
144
145    if (U_FAILURE(*status)) {
146        return TOK_ERROR;
147    }
148
149    for (;;) {
150        if (c == QUOTE) {
151            if (!lastStringWasQuoted && token->fLength > 0) {
152                ustr_ucat(token, SPACE, status);
153
154                if (U_FAILURE(*status)) {
155                    return TOK_ERROR;
156                }
157            }
158
159            lastStringWasQuoted = TRUE;
160
161            for (;;) {
162                c = ucbuf_getc(buf,status);
163
164                /* EOF reached */
165                if (c == U_EOF) {
166                    return TOK_EOF;
167                }
168
169                /* Unterminated quoted strings */
170                if (U_FAILURE(*status)) {
171                    return TOK_ERROR;
172                }
173
174                if (c == QUOTE && !isFollowingCharEscaped) {
175                    break;
176                }
177
178                if (c == ESCAPE  && !isFollowingCharEscaped) {
179                    pTarget = target;
180                    c       = unescape(buf, status);
181
182                    if (c == U_ERR) {
183                        return TOK_ERROR;
184                    }
185                    if(c == CR || c == LF){
186                        isNLUnescaped = TRUE;
187                    }
188                }
189
190                if(c==ESCAPE && !isFollowingCharEscaped){
191                    isFollowingCharEscaped = TRUE;
192                }else{
193                    U_APPEND_CHAR32(c, pTarget,len);
194                    pTarget = target;
195                    ustr_uscat(token, pTarget,len, status);
196                    isFollowingCharEscaped = FALSE;
197                    len=0;
198                    if(c == CR || c == LF){
199                        if(isNLUnescaped == FALSE && prevC!=CR){
200                            lineCount++;
201                        }
202                        isNLUnescaped = FALSE;
203                    }
204                }
205
206                if (U_FAILURE(*status)) {
207                    return TOK_ERROR;
208                }
209                prevC = c;
210            }
211        } else {
212            if (token->fLength > 0) {
213                ustr_ucat(token, SPACE, status);
214
215                if (U_FAILURE(*status)) {
216                    return TOK_ERROR;
217                }
218            }
219
220            if(lastStringWasQuoted){
221                if(getShowWarning()){
222                    warning(lineCount, "Mixing quoted and unquoted strings");
223                }
224                if(isStrict()){
225                    return TOK_ERROR;
226                }
227
228            }
229
230            lastStringWasQuoted = FALSE;
231
232            /* if we reach here we are mixing
233             * quoted and unquoted strings
234             * warn in normal mode and error in
235             * pedantic mode
236             */
237
238            if (c == ESCAPE) {
239                pTarget = target;
240                c       = unescape(buf, status);
241
242                /* EOF reached */
243                if (c == U_EOF) {
244                    return TOK_ERROR;
245                }
246            }
247
248            U_APPEND_CHAR32(c, pTarget,len);
249            pTarget = target;
250            ustr_uscat(token, pTarget,len, status);
251            len=0;
252
253            if (U_FAILURE(*status)) {
254                return TOK_ERROR;
255            }
256
257            for (;;) {
258                /* DON'T skip whitespace */
259                c = getNextChar(buf, FALSE, NULL, status);
260
261                /* EOF reached */
262                if (c == U_EOF) {
263                    ucbuf_ungetc(c, buf);
264                    return TOK_STRING;
265                }
266
267                if (U_FAILURE(*status)) {
268                    return TOK_STRING;
269                }
270
271                if (c == QUOTE
272                        || c == OPENBRACE
273                        || c == CLOSEBRACE
274                        || c == COMMA
275                        || c == COLON) {
276                    ucbuf_ungetc(c, buf);
277                    break;
278                }
279
280                if (isWhitespace(c)) {
281                    break;
282                }
283
284                if (c == ESCAPE) {
285                    pTarget = target;
286                    c       = unescape(buf, status);
287
288                    if (c == U_ERR) {
289                        return TOK_ERROR;
290                    }
291                }
292
293                U_APPEND_CHAR32(c, pTarget,len);
294                pTarget = target;
295                ustr_uscat(token, pTarget,len, status);
296                len=0;
297                if (U_FAILURE(*status)) {
298                    return TOK_ERROR;
299                }
300            }
301        }
302
303        /* DO skip whitespace */
304        c = getNextChar(buf, TRUE, NULL, status);
305
306        if (U_FAILURE(*status)) {
307            return TOK_STRING;
308        }
309
310        if (c == OPENBRACE || c == CLOSEBRACE || c == COMMA || c == COLON) {
311            ucbuf_ungetc(c, buf);
312            return TOK_STRING;
313        }
314    }
315}
316
317/* Retrieve the next character.  If skipwhite is
318   true, whitespace is skipped as well. */
319static UChar32 getNextChar(UCHARBUF* buf,
320                           UBool skipwhite,
321                           struct UString *token,
322                           UErrorCode *status) {
323    UChar32 c, c2;
324
325    if (U_FAILURE(*status)) {
326        return U_EOF;
327    }
328
329    for (;;) {
330        c = ucbuf_getc(buf,status);
331
332        if (c == U_EOF) {
333            return U_EOF;
334        }
335
336        if (skipwhite && isWhitespace(c)) {
337            continue;
338        }
339
340        /* This also handles the get() failing case */
341        if (c != SLASH) {
342            return c;
343        }
344
345        c = ucbuf_getc(buf,status); /* "/c" */
346
347        if (c == U_EOF) {
348            return U_EOF;
349        }
350
351        switch (c) {
352        case SLASH:  /* "//" */
353            seekUntilNewline(buf, NULL, status);
354            break;
355
356        case ASTERISK:  /* " / * " */
357            c2 = ucbuf_getc(buf, status); /* "/ * c" */
358            if(c2 == ASTERISK){  /* "/ * *" */
359                /* parse multi-line comment and store it in token*/
360                seekUntilEndOfComment(buf, token, status);
361            } else {
362                ucbuf_ungetc(c2, buf); /* c2 is the non-asterisk following "/ *".  Include c2  back in buffer.  */
363                seekUntilEndOfComment(buf, NULL, status);
364            }
365            break;
366
367        default:
368            ucbuf_ungetc(c, buf); /* "/c" - put back the c */
369            /* If get() failed this is a NOP */
370            return SLASH;
371        }
372
373    }
374}
375
376static void seekUntilNewline(UCHARBUF* buf,
377                             struct UString *token,
378                             UErrorCode *status) {
379    UChar32 c;
380
381    if (U_FAILURE(*status)) {
382        return;
383    }
384
385    do {
386        c = ucbuf_getc(buf,status);
387        /* add the char to token */
388        if(token!=NULL){
389            ustr_u32cat(token, c, status);
390        }
391    } while (!isNewline(c) && c != U_EOF && *status == U_ZERO_ERROR);
392}
393
394static void seekUntilEndOfComment(UCHARBUF *buf,
395                                  struct UString *token,
396                                  UErrorCode *status) {
397    UChar32  c, d;
398    uint32_t line;
399
400    if (U_FAILURE(*status)) {
401        return;
402    }
403
404    line = lineCount;
405
406    do {
407        c = ucbuf_getc(buf, status);
408
409        if (c == ASTERISK) {
410            d = ucbuf_getc(buf, status);
411
412            if (d != SLASH) {
413                ucbuf_ungetc(d, buf);
414            } else {
415                break;
416            }
417        }
418        /* add the char to token */
419        if(token!=NULL){
420            ustr_u32cat(token, c, status);
421        }
422        /* increment the lineCount */
423        isNewline(c);
424
425    } while (c != U_EOF && *status == U_ZERO_ERROR);
426
427    if (c == U_EOF) {
428        *status = U_INVALID_FORMAT_ERROR;
429        error(line, "unterminated comment detected");
430    }
431}
432
433U_CFUNC UChar32 unescape(UCHARBUF *buf, UErrorCode *status) {
434    if (U_FAILURE(*status)) {
435        return U_EOF;
436    }
437
438    /* We expect to be called after the ESCAPE has been seen, but
439     * u_fgetcx needs an ESCAPE to do its magic. */
440    ucbuf_ungetc(ESCAPE, buf);
441
442    return ucbuf_getcx32(buf, status);
443}
444
445static UBool isWhitespace(UChar32 c) {
446    switch (c) {
447        /* ' ', '\t', '\n', '\r', 0x2029, 0xFEFF */
448    case 0x000A:
449    case 0x2029:
450        lineCount++;
451    case 0x000D:
452    case 0x0020:
453    case 0x0009:
454    case 0xFEFF:
455        return TRUE;
456
457    default:
458        return FALSE;
459    }
460}
461
462static UBool isNewline(UChar32 c) {
463    switch (c) {
464        /* '\n', '\r', 0x2029 */
465    case 0x000A:
466    case 0x2029:
467        lineCount++;
468    case 0x000D:
469        return TRUE;
470
471    default:
472        return FALSE;
473    }
474}
475