1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4**********************************************************************
5* Copyright (c) 2004-2011, International Business Machines
6* Corporation and others.  All Rights Reserved.
7**********************************************************************
8* Author: Alan Liu
9* Created: March 22 2004
10* Since: ICU 3.0
11**********************************************************************
12*/
13#include "tokiter.h"
14#include "textfile.h"
15#include "patternprops.h"
16#include "util.h"
17#include "uprops.h"
18
19TokenIterator::TokenIterator(TextFile* r) {
20    reader = r;
21    done = haveLine = FALSE;
22    pos = lastpos = -1;
23}
24
25TokenIterator::~TokenIterator() {
26}
27
28UBool TokenIterator::next(UnicodeString& token, UErrorCode& ec) {
29    if (done || U_FAILURE(ec)) {
30        return FALSE;
31    }
32    token.truncate(0);
33    for (;;) {
34        if (!haveLine) {
35            if (!reader->readLineSkippingComments(line, ec)) {
36                done = TRUE;
37                return FALSE;
38            }
39            haveLine = TRUE;
40            pos = 0;
41        }
42        lastpos = pos;
43        if (!nextToken(token, ec)) {
44            haveLine = FALSE;
45            if (U_FAILURE(ec)) return FALSE;
46            continue;
47        }
48        return TRUE;
49    }
50}
51
52int32_t TokenIterator::getLineNumber() const {
53    return reader->getLineNumber();
54}
55
56/**
57 * Read the next token from 'this->line' and append it to 'token'.
58 * Tokens are separated by Pattern_White_Space.  Tokens may also be
59 * delimited by double or single quotes.  The closing quote must match
60 * the opening quote.  If a '#' is encountered, the rest of the line
61 * is ignored, unless it is backslash-escaped or within quotes.
62 * @param token the token is appended to this StringBuffer
63 * @param ec input-output error code
64 * @return TRUE if a valid token is found, or FALSE if the end
65 * of the line is reached or an error occurs
66 */
67UBool TokenIterator::nextToken(UnicodeString& token, UErrorCode& ec) {
68    ICU_Utility::skipWhitespace(line, pos, TRUE);
69    if (pos == line.length()) {
70        return FALSE;
71    }
72    UChar c = line.charAt(pos++);
73    UChar quote = 0;
74    switch (c) {
75    case 34/*'"'*/:
76    case 39/*'\\'*/:
77        quote = c;
78        break;
79    case 35/*'#'*/:
80        return FALSE;
81    default:
82        token.append(c);
83        break;
84    }
85    while (pos < line.length()) {
86        c = line.charAt(pos); // 16-bit ok
87        if (c == 92/*'\\'*/) {
88            UChar32 c32 = line.unescapeAt(pos);
89            if (c32 < 0) {
90                ec = U_MALFORMED_UNICODE_ESCAPE;
91                return FALSE;
92            }
93            token.append(c32);
94        } else if ((quote != 0 && c == quote) ||
95                   (quote == 0 && PatternProps::isWhiteSpace(c))) {
96            ++pos;
97            return TRUE;
98        } else if (quote == 0 && c == '#') {
99            return TRUE; // do NOT increment
100        } else {
101            token.append(c);
102            ++pos;
103        }
104    }
105    if (quote != 0) {
106        ec = U_UNTERMINATED_QUOTE;
107        return FALSE;
108    }
109    return TRUE;
110}
111