1/*
2*******************************************************************************
3*
4*   Copyright (C) 1998-2009, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*
9* File read.c
10*
11* Modification History:
12*
13*   Date        Name        Description
14*   05/26/99    stephen     Creation.
15*   5/10/01     Ram         removed ustdio dependency
16*******************************************************************************
17*/
18
19#include "read.h"
20#include "errmsg.h"
21#include "unicode/ustring.h"
22
/* Code points that the tokenizer treats specially. */
#define OPENBRACE    0x007B  /* left curly brace: its own token */
#define CLOSEBRACE   0x007D  /* right curly brace: its own token */
#define COMMA        0x002C  /* comma: its own token */
#define QUOTE        0x0022  /* double quote: delimits quoted strings */
#define ESCAPE       0x005C  /* backslash: introduces an escape sequence */
#define SLASH        0x002F  /* forward slash: may start a comment */
#define ASTERISK     0x002A  /* asterisk: second character of a block comment opener */
#define SPACE        0x0020  /* inserted between merged string segments */
#define COLON        0x003A  /* colon: its own token */
#define BADBOM       0xFFFE  /* getNextToken() returns TOK_ERROR on this value;
                                presumably a byte-swapped BOM - TODO confirm */
#define CR           0x000D  /* carriage return */
#define LF           0x000A  /* line feed */

/* Current line number of the input.  Set to 1 by resetLineNumber() and
   incremented by isWhitespace(), isNewline() and getStringToken() as
   line terminators are consumed. */
static int32_t lineCount;
37
/* Protos */

/* Reads a (possibly merged) string token whose first character has
   already been consumed and is passed as initialChar. */
static enum ETokenType getStringToken(UCHARBUF *buf,
                                      UChar32 initialChar,
                                      struct UString *token,
                                      UErrorCode *status);

/* Character-level helpers: comment skipping and whitespace/newline
   classification.  isWhitespace() and isNewline() also update lineCount. */
static UChar32 getNextChar           (UCHARBUF *buf, UBool skipwhite, struct UString *token, UErrorCode *status);
static void    seekUntilNewline      (UCHARBUF *buf, struct UString *token, UErrorCode *status);
static void    seekUntilEndOfComment (UCHARBUF *buf, struct UString *token, UErrorCode *status);
static UBool   isWhitespace          (UChar32 c);
static UBool   isNewline             (UChar32 c);
49
50void resetLineNumber() {
51    lineCount = 1;
52}
53
54/* Read and return the next token from the stream.  If the token is of
55   type eString, fill in the token parameter with the token.  If the
56   token is eError, then the status parameter will contain the
57   specific error.  This will be eItemNotFound at the end of file,
58   indicating that all tokens have been returned.  This method will
59   never return eString twice in a row; instead, multiple adjacent
60   string tokens will be merged into one, with no intervening
61   space. */
62enum ETokenType getNextToken(UCHARBUF* buf,
63                             struct UString *token,
64                             uint32_t *linenumber, /* out: linenumber of token */
65                             struct UString *comment,
66                             UErrorCode *status) {
67    enum ETokenType result;
68    UChar32         c;
69
70    if (U_FAILURE(*status)) {
71        return TOK_ERROR;
72    }
73
74    /* Skip whitespace */
75    c = getNextChar(buf, TRUE, comment, status);
76
77    if (U_FAILURE(*status)) {
78        return TOK_ERROR;
79    }
80
81    *linenumber = lineCount;
82
83    switch(c) {
84    case BADBOM:
85        return TOK_ERROR;
86    case OPENBRACE:
87        return TOK_OPEN_BRACE;
88    case CLOSEBRACE:
89        return TOK_CLOSE_BRACE;
90    case COMMA:
91        return TOK_COMMA;
92    case U_EOF:
93        return TOK_EOF;
94    case COLON:
95        return TOK_COLON;
96
97    default:
98        result = getStringToken(buf, c, token, status);
99    }
100
101    *linenumber = lineCount;
102    return result;
103}
104
105/* Copy a string token into the given UnicodeString.  Upon entry, we
106   have already read the first character of the string token, which is
107   not a whitespace character (but may be a QUOTE or ESCAPE). This
108   function reads all subsequent characters that belong with this
109   string, and copy them into the token parameter. The other
110   important, and slightly convoluted purpose of this function is to
111   merge adjacent strings.  It looks forward a bit, and if the next
112   non comment, non whitespace item is a string, it reads it in as
113   well.  If two adjacent strings are quoted, they are merged without
114   intervening space.  Otherwise a single SPACE character is
115   inserted. */
116static enum ETokenType getStringToken(UCHARBUF* buf,
117                                      UChar32 initialChar,
118                                      struct UString *token,
119                                      UErrorCode *status) {
120    UBool    lastStringWasQuoted;
121    UChar32  c;
122    UChar    target[3] = { '\0' };
123    UChar    *pTarget   = target;
124    int      len=0;
125    UBool    isFollowingCharEscaped=FALSE;
126    UBool    isNLUnescaped = FALSE;
127    UChar32  prevC=0;
128
129    /* We are guaranteed on entry that initialChar is not a whitespace
130       character. If we are at the EOF, or have some other problem, it
131       doesn't matter; we still want to validly return the initialChar
132       (if nothing else) as a string token. */
133
134    if (U_FAILURE(*status)) {
135        return TOK_ERROR;
136    }
137
138    /* setup */
139    lastStringWasQuoted = FALSE;
140    c = initialChar;
141    ustr_setlen(token, 0, status);
142
143    if (U_FAILURE(*status)) {
144        return TOK_ERROR;
145    }
146
147    for (;;) {
148        if (c == QUOTE) {
149            if (!lastStringWasQuoted && token->fLength > 0) {
150                ustr_ucat(token, SPACE, status);
151
152                if (U_FAILURE(*status)) {
153                    return TOK_ERROR;
154                }
155            }
156
157            lastStringWasQuoted = TRUE;
158
159            for (;;) {
160                c = ucbuf_getc(buf,status);
161
162                /* EOF reached */
163                if (c == U_EOF) {
164                    return TOK_EOF;
165                }
166
167                /* Unterminated quoted strings */
168                if (U_FAILURE(*status)) {
169                    return TOK_ERROR;
170                }
171
172                if (c == QUOTE && !isFollowingCharEscaped) {
173                    break;
174                }
175
176                if (c == ESCAPE  && !isFollowingCharEscaped) {
177                    pTarget = target;
178                    c       = unescape(buf, status);
179
180                    if (c == U_ERR) {
181                        return TOK_ERROR;
182                    }
183                    if(c == CR || c == LF){
184                        isNLUnescaped = TRUE;
185                    }
186                }
187
188                if(c==ESCAPE && !isFollowingCharEscaped){
189                    isFollowingCharEscaped = TRUE;
190                }else{
191                    U_APPEND_CHAR32(c, pTarget,len);
192                    pTarget = target;
193                    ustr_uscat(token, pTarget,len, status);
194                    isFollowingCharEscaped = FALSE;
195                    len=0;
196                    if(c == CR || c == LF){
197                        if(isNLUnescaped == FALSE && prevC!=CR){
198                            lineCount++;
199                        }
200                        isNLUnescaped = FALSE;
201                    }
202                }
203
204                if (U_FAILURE(*status)) {
205                    return TOK_ERROR;
206                }
207                prevC = c;
208            }
209        } else {
210            if (token->fLength > 0) {
211                ustr_ucat(token, SPACE, status);
212
213                if (U_FAILURE(*status)) {
214                    return TOK_ERROR;
215                }
216            }
217
218            if(lastStringWasQuoted){
219                if(getShowWarning()){
220                    warning(lineCount, "Mixing quoted and unquoted strings");
221                }
222                if(isStrict()){
223                    return TOK_ERROR;
224                }
225
226            }
227
228            lastStringWasQuoted = FALSE;
229
230            /* if we reach here we are mixing
231             * quoted and unquoted strings
232             * warn in normal mode and error in
233             * pedantic mode
234             */
235
236            if (c == ESCAPE) {
237                pTarget = target;
238                c       = unescape(buf, status);
239
240                /* EOF reached */
241                if (c == U_EOF) {
242                    return TOK_ERROR;
243                }
244            }
245
246            U_APPEND_CHAR32(c, pTarget,len);
247            pTarget = target;
248            ustr_uscat(token, pTarget,len, status);
249            len=0;
250
251            if (U_FAILURE(*status)) {
252                return TOK_ERROR;
253            }
254
255            for (;;) {
256                /* DON'T skip whitespace */
257                c = getNextChar(buf, FALSE, NULL, status);
258
259                /* EOF reached */
260                if (c == U_EOF) {
261                    ucbuf_ungetc(c, buf);
262                    return TOK_STRING;
263                }
264
265                if (U_FAILURE(*status)) {
266                    return TOK_STRING;
267                }
268
269                if (c == QUOTE
270                        || c == OPENBRACE
271                        || c == CLOSEBRACE
272                        || c == COMMA
273                        || c == COLON) {
274                    ucbuf_ungetc(c, buf);
275                    break;
276                }
277
278                if (isWhitespace(c)) {
279                    break;
280                }
281
282                if (c == ESCAPE) {
283                    pTarget = target;
284                    c       = unescape(buf, status);
285
286                    if (c == U_ERR) {
287                        return TOK_ERROR;
288                    }
289                }
290
291                U_APPEND_CHAR32(c, pTarget,len);
292                pTarget = target;
293                ustr_uscat(token, pTarget,len, status);
294                len=0;
295                if (U_FAILURE(*status)) {
296                    return TOK_ERROR;
297                }
298            }
299        }
300
301        /* DO skip whitespace */
302        c = getNextChar(buf, TRUE, NULL, status);
303
304        if (U_FAILURE(*status)) {
305            return TOK_STRING;
306        }
307
308        if (c == OPENBRACE || c == CLOSEBRACE || c == COMMA || c == COLON) {
309            ucbuf_ungetc(c, buf);
310            return TOK_STRING;
311        }
312    }
313}
314
315/* Retrieve the next character.  If skipwhite is
316   true, whitespace is skipped as well. */
317static UChar32 getNextChar(UCHARBUF* buf,
318                           UBool skipwhite,
319                           struct UString *token,
320                           UErrorCode *status) {
321    UChar32 c, c2;
322
323    if (U_FAILURE(*status)) {
324        return U_EOF;
325    }
326
327    for (;;) {
328        c = ucbuf_getc(buf,status);
329
330        if (c == U_EOF) {
331            return U_EOF;
332        }
333
334        if (skipwhite && isWhitespace(c)) {
335            continue;
336        }
337
338        /* This also handles the get() failing case */
339        if (c != SLASH) {
340            return c;
341        }
342
343        c = ucbuf_getc(buf,status); /* "/c" */
344
345        if (c == U_EOF) {
346            return U_EOF;
347        }
348
349        switch (c) {
350        case SLASH:  /* "//" */
351            seekUntilNewline(buf, NULL, status);
352            break;
353
354        case ASTERISK:  /* " / * " */
355            c2 = ucbuf_getc(buf, status); /* "/ * c" */
356            if(c2 == ASTERISK){  /* "/ * *" */
357                /* parse multi-line comment and store it in token*/
358                seekUntilEndOfComment(buf, token, status);
359            } else {
360                ucbuf_ungetc(c2, buf); /* c2 is the non-asterisk following "/ *".  Include c2  back in buffer.  */
361                seekUntilEndOfComment(buf, NULL, status);
362            }
363            break;
364
365        default:
366            ucbuf_ungetc(c, buf); /* "/c" - put back the c */
367            /* If get() failed this is a NOP */
368            return SLASH;
369        }
370
371    }
372}
373
374static void seekUntilNewline(UCHARBUF* buf,
375                             struct UString *token,
376                             UErrorCode *status) {
377    UChar32 c;
378
379    if (U_FAILURE(*status)) {
380        return;
381    }
382
383    do {
384        c = ucbuf_getc(buf,status);
385        /* add the char to token */
386        if(token!=NULL){
387            ustr_u32cat(token, c, status);
388        }
389    } while (!isNewline(c) && c != U_EOF && *status == U_ZERO_ERROR);
390}
391
392static void seekUntilEndOfComment(UCHARBUF *buf,
393                                  struct UString *token,
394                                  UErrorCode *status) {
395    UChar32  c, d;
396    uint32_t line;
397
398    if (U_FAILURE(*status)) {
399        return;
400    }
401
402    line = lineCount;
403
404    do {
405        c = ucbuf_getc(buf, status);
406
407        if (c == ASTERISK) {
408            d = ucbuf_getc(buf, status);
409
410            if (d != SLASH) {
411                ucbuf_ungetc(d, buf);
412            } else {
413                break;
414            }
415        }
416        /* add the char to token */
417        if(token!=NULL){
418            ustr_u32cat(token, c, status);
419        }
420        /* increment the lineCount */
421        isNewline(c);
422
423    } while (c != U_EOF && *status == U_ZERO_ERROR);
424
425    if (c == U_EOF) {
426        *status = U_INVALID_FORMAT_ERROR;
427        error(line, "unterminated comment detected");
428    }
429}
430
431UChar32 unescape(UCHARBUF *buf,
432                 UErrorCode *status) {
433    if (U_FAILURE(*status)) {
434        return U_EOF;
435    }
436
437    /* We expect to be called after the ESCAPE has been seen, but
438     * u_fgetcx needs an ESCAPE to do its magic. */
439    ucbuf_ungetc(ESCAPE, buf);
440
441    return ucbuf_getcx32(buf, status);
442}
443
444static UBool isWhitespace(UChar32 c) {
445    switch (c) {
446        /* ' ', '\t', '\n', '\r', 0x2029, 0xFEFF */
447    case 0x000A:
448    case 0x2029:
449        lineCount++;
450    case 0x000D:
451    case 0x0020:
452    case 0x0009:
453    case 0xFEFF:
454        return TRUE;
455
456    default:
457        return FALSE;
458    }
459}
460
461static UBool isNewline(UChar32 c) {
462    switch (c) {
463        /* '\n', '\r', 0x2029 */
464    case 0x000A:
465    case 0x2029:
466        lineCount++;
467    case 0x000D:
468        return TRUE;
469
470    default:
471        return FALSE;
472    }
473}
474