1ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
2ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru**********************************************************************
3b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho* Copyright (c) 2004-2011, International Business Machines
4ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* Corporation and others.  All Rights Reserved.
5ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru**********************************************************************
6ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* Author: Alan Liu
7ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* Created: March 22 2004
8ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* Since: ICU 3.0
9ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru**********************************************************************
10ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*/
11ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "tokiter.h"
12ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "textfile.h"
13b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#include "patternprops.h"
14ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "util.h"
15ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "uprops.h"
16ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
17ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruTokenIterator::TokenIterator(TextFile* r) {
18ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    reader = r;
19ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    done = haveLine = FALSE;
20ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    pos = lastpos = -1;
21ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
22ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
23ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruTokenIterator::~TokenIterator() {
24ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
25ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
26ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool TokenIterator::next(UnicodeString& token, UErrorCode& ec) {
27ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (done || U_FAILURE(ec)) {
28ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return FALSE;
29ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
30ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    token.truncate(0);
31ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    for (;;) {
32ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if (!haveLine) {
33ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if (!reader->readLineSkippingComments(line, ec)) {
34ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                done = TRUE;
35ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                return FALSE;
36ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
37ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            haveLine = TRUE;
38ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            pos = 0;
39ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
40ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        lastpos = pos;
41ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if (!nextToken(token, ec)) {
42ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            haveLine = FALSE;
43ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if (U_FAILURE(ec)) return FALSE;
44ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            continue;
45ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
46ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return TRUE;
47ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
48ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
49ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
50ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t TokenIterator::getLineNumber() const {
51ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return reader->getLineNumber();
52ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
53ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
54ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/**
55ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Read the next token from 'this->line' and append it to 'token'.
56b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Tokens are separated by Pattern_White_Space.  Tokens may also be
57ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * delimited by double or single quotes.  The closing quote must match
58ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * the opening quote.  If a '#' is encountered, the rest of the line
59ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * is ignored, unless it is backslash-escaped or within quotes.
60ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @param token the token is appended to this StringBuffer
61ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @param ec input-output error code
62ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @return TRUE if a valid token is found, or FALSE if the end
63ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * of the line is reached or an error occurs
64ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
65ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool TokenIterator::nextToken(UnicodeString& token, UErrorCode& ec) {
66ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    ICU_Utility::skipWhitespace(line, pos, TRUE);
67ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (pos == line.length()) {
68ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return FALSE;
69ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
70ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UChar c = line.charAt(pos++);
71ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UChar quote = 0;
72ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    switch (c) {
73ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    case 34/*'"'*/:
74ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    case 39/*'\\'*/:
75ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        quote = c;
76ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        break;
77ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    case 35/*'#'*/:
78ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return FALSE;
79ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    default:
80ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        token.append(c);
81ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        break;
82ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
83ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    while (pos < line.length()) {
84ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        c = line.charAt(pos); // 16-bit ok
85ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if (c == 92/*'\\'*/) {
86ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            UChar32 c32 = line.unescapeAt(pos);
87ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if (c32 < 0) {
88ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                ec = U_MALFORMED_UNICODE_ESCAPE;
89ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                return FALSE;
90ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
91ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            token.append(c32);
92ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else if ((quote != 0 && c == quote) ||
93b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                   (quote == 0 && PatternProps::isWhiteSpace(c))) {
94ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            ++pos;
95ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return TRUE;
96ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else if (quote == 0 && c == '#') {
97ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return TRUE; // do NOT increment
98ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else {
99ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            token.append(c);
100ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            ++pos;
101ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
102ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
103ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (quote != 0) {
104ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        ec = U_UNTERMINATED_QUOTE;
105ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return FALSE;
106ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
107ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return TRUE;
108ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
109