1/*
2**********************************************************************
3*   Copyright (c) 2001-2011, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5**********************************************************************
6*   Date        Name        Description
7*   11/19/2001  aliu        Creation.
8**********************************************************************
9*/
10
11#include "unicode/unimatch.h"
12#include "unicode/utf16.h"
13#include "patternprops.h"
14#include "util.h"
15
16// Define UChar constants using hex for EBCDIC compatibility
17
18static const UChar BACKSLASH  = 0x005C; /*\*/
19static const UChar UPPER_U    = 0x0055; /*U*/
20static const UChar LOWER_U    = 0x0075; /*u*/
21static const UChar APOSTROPHE = 0x0027; // '\''
22static const UChar SPACE      = 0x0020; // ' '
23
24// "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
25static const UChar DIGITS[] = {
26    48,49,50,51,52,53,54,55,56,57,
27    65,66,67,68,69,70,71,72,73,74,
28    75,76,77,78,79,80,81,82,83,84,
29    85,86,87,88,89,90
30};
31
32U_NAMESPACE_BEGIN
33
34UnicodeString& ICU_Utility::appendNumber(UnicodeString& result, int32_t n,
35                                     int32_t radix, int32_t minDigits) {
36    if (radix < 2 || radix > 36) {
37        // Bogus radix
38        return result.append((UChar)63/*?*/);
39    }
40    // Handle negatives
41    if (n < 0) {
42        n = -n;
43        result.append((UChar)45/*-*/);
44    }
45    // First determine the number of digits
46    int32_t nn = n;
47    int32_t r = 1;
48    while (nn >= radix) {
49        nn /= radix;
50        r *= radix;
51        --minDigits;
52    }
53    // Now generate the digits
54    while (--minDigits > 0) {
55        result.append(DIGITS[0]);
56    }
57    while (r > 0) {
58        int32_t digit = n / r;
59        result.append(DIGITS[digit]);
60        n -= digit * r;
61        r /= radix;
62    }
63    return result;
64}
65
66/**
67 * Return true if the character is NOT printable ASCII.
68 */
69UBool ICU_Utility::isUnprintable(UChar32 c) {
70    return !(c >= 0x20 && c <= 0x7E);
71}
72
73/**
74 * Escape unprintable characters using \uxxxx notation for U+0000 to
75 * U+FFFF and \Uxxxxxxxx for U+10000 and above.  If the character is
76 * printable ASCII, then do nothing and return FALSE.  Otherwise,
77 * append the escaped notation and return TRUE.
78 */
79UBool ICU_Utility::escapeUnprintable(UnicodeString& result, UChar32 c) {
80    if (isUnprintable(c)) {
81        result.append(BACKSLASH);
82        if (c & ~0xFFFF) {
83            result.append(UPPER_U);
84            result.append(DIGITS[0xF&(c>>28)]);
85            result.append(DIGITS[0xF&(c>>24)]);
86            result.append(DIGITS[0xF&(c>>20)]);
87            result.append(DIGITS[0xF&(c>>16)]);
88        } else {
89            result.append(LOWER_U);
90        }
91        result.append(DIGITS[0xF&(c>>12)]);
92        result.append(DIGITS[0xF&(c>>8)]);
93        result.append(DIGITS[0xF&(c>>4)]);
94        result.append(DIGITS[0xF&c]);
95        return TRUE;
96    }
97    return FALSE;
98}
99
100/**
101 * Returns the index of a character, ignoring quoted text.
102 * For example, in the string "abc'hide'h", the 'h' in "hide" will not be
103 * found by a search for 'h'.
104 */
105// FOR FUTURE USE.  DISABLE FOR NOW for coverage reasons.
106/*
107int32_t ICU_Utility::quotedIndexOf(const UnicodeString& text,
108                               int32_t start, int32_t limit,
109                               UChar charToFind) {
110    for (int32_t i=start; i<limit; ++i) {
111        UChar c = text.charAt(i);
112        if (c == BACKSLASH) {
113            ++i;
114        } else if (c == APOSTROPHE) {
115            while (++i < limit
116                   && text.charAt(i) != APOSTROPHE) {}
117        } else if (c == charToFind) {
118            return i;
119        }
120    }
121    return -1;
122}
123*/
124
125/**
126 * Skip over a sequence of zero or more white space characters at pos.
127 * @param advance if true, advance pos to the first non-white-space
128 * character at or after pos, or str.length(), if there is none.
129 * Otherwise leave pos unchanged.
130 * @return the index of the first non-white-space character at or
131 * after pos, or str.length(), if there is none.
132 */
133int32_t ICU_Utility::skipWhitespace(const UnicodeString& str, int32_t& pos,
134                                    UBool advance) {
135    int32_t p = pos;
136    const UChar* s = str.getBuffer();
137    p = (int32_t)(PatternProps::skipWhiteSpace(s + p, str.length() - p) - s);
138    if (advance) {
139        pos = p;
140    }
141    return p;
142}
143
144/**
145 * Skip over Pattern_White_Space in a Replaceable.
146 * Skipping may be done in the forward or
147 * reverse direction.  In either case, the leftmost index will be
148 * inclusive, and the rightmost index will be exclusive.  That is,
149 * given a range defined as [start, limit), the call
150 * skipWhitespace(text, start, limit) will advance start past leading
151 * whitespace, whereas the call skipWhitespace(text, limit, start),
152 * will back up limit past trailing whitespace.
153 * @param text the text to be analyzed
154 * @param pos either the start or limit of a range of 'text', to skip
155 * leading or trailing whitespace, respectively
156 * @param stop either the limit or start of a range of 'text', to skip
157 * leading or trailing whitespace, respectively
158 * @return the new start or limit, depending on what was passed in to
159 * 'pos'
160 */
161//?FOR FUTURE USE.  DISABLE FOR NOW for coverage reasons.
162//?int32_t ICU_Utility::skipWhitespace(const Replaceable& text,
163//?                                    int32_t pos, int32_t stop) {
164//?    UChar32 c;
165//?    UBool isForward = (stop >= pos);
166//?
167//?    if (!isForward) {
168//?        --pos; // pos is a limit, so back up by one
169//?    }
170//?
171//?    while (pos != stop &&
172//?           PatternProps::isWhiteSpace(c = text.char32At(pos))) {
173//?        if (isForward) {
174//?            pos += U16_LENGTH(c);
175//?        } else {
176//?            pos -= U16_LENGTH(c);
177//?        }
178//?    }
179//?
180//?    if (!isForward) {
181//?        ++pos; // make pos back into a limit
182//?    }
183//?
184//?    return pos;
185//?}
186
187/**
188 * Parse a single non-whitespace character 'ch', optionally
189 * preceded by whitespace.
190 * @param id the string to be parsed
191 * @param pos INPUT-OUTPUT parameter.  On input, pos[0] is the
192 * offset of the first character to be parsed.  On output, pos[0]
193 * is the index after the last parsed character.  If the parse
194 * fails, pos[0] will be unchanged.
195 * @param ch the non-whitespace character to be parsed.
196 * @return true if 'ch' is seen preceded by zero or more
197 * whitespace characters.
198 */
199UBool ICU_Utility::parseChar(const UnicodeString& id, int32_t& pos, UChar ch) {
200    int32_t start = pos;
201    skipWhitespace(id, pos, TRUE);
202    if (pos == id.length() ||
203        id.charAt(pos) != ch) {
204        pos = start;
205        return FALSE;
206    }
207    ++pos;
208    return TRUE;
209}
210
211/**
212 * Parse a pattern string within the given Replaceable and a parsing
213 * pattern.  Characters are matched literally and case-sensitively
214 * except for the following special characters:
215 *
216 * ~  zero or more Pattern_White_Space chars
217 *
218 * If end of pattern is reached with all matches along the way,
219 * pos is advanced to the first unparsed index and returned.
220 * Otherwise -1 is returned.
221 * @param pat pattern that controls parsing
222 * @param text text to be parsed, starting at index
223 * @param index offset to first character to parse
224 * @param limit offset after last character to parse
225 * @return index after last parsed character, or -1 on parse failure.
226 */
227int32_t ICU_Utility::parsePattern(const UnicodeString& pat,
228                                  const Replaceable& text,
229                                  int32_t index,
230                                  int32_t limit) {
231    int32_t ipat = 0;
232
233    // empty pattern matches immediately
234    if (ipat == pat.length()) {
235        return index;
236    }
237
238    UChar32 cpat = pat.char32At(ipat);
239
240    while (index < limit) {
241        UChar32 c = text.char32At(index);
242
243        // parse \s*
244        if (cpat == 126 /*~*/) {
245            if (PatternProps::isWhiteSpace(c)) {
246                index += U16_LENGTH(c);
247                continue;
248            } else {
249                if (++ipat == pat.length()) {
250                    return index; // success; c unparsed
251                }
252                // fall thru; process c again with next cpat
253            }
254        }
255
256        // parse literal
257        else if (c == cpat) {
258            index += U16_LENGTH(c);
259            ipat += U16_LENGTH(cpat);
260            if (ipat == pat.length()) {
261                return index; // success; c parsed
262            }
263            // fall thru; get next cpat
264        }
265
266        // match failure of literal
267        else {
268            return -1;
269        }
270
271        cpat = pat.char32At(ipat);
272    }
273
274    return -1; // text ended before end of pat
275}
276
277/**
278 * Append a character to a rule that is being built up.  To flush
279 * the quoteBuf to rule, make one final call with isLiteral == TRUE.
280 * If there is no final character, pass in (UChar32)-1 as c.
281 * @param rule the string to append the character to
282 * @param c the character to append, or (UChar32)-1 if none.
283 * @param isLiteral if true, then the given character should not be
284 * quoted or escaped.  Usually this means it is a syntactic element
285 * such as > or $
286 * @param escapeUnprintable if true, then unprintable characters
287 * should be escaped using \uxxxx or \Uxxxxxxxx.  These escapes will
288 * appear outside of quotes.
289 * @param quoteBuf a buffer which is used to build up quoted
290 * substrings.  The caller should initially supply an empty buffer,
291 * and thereafter should not modify the buffer.  The buffer should be
292 * cleared out by, at the end, calling this method with a literal
293 * character.
294 */
295void ICU_Utility::appendToRule(UnicodeString& rule,
296                               UChar32 c,
297                               UBool isLiteral,
298                               UBool escapeUnprintable,
299                               UnicodeString& quoteBuf) {
300    // If we are escaping unprintables, then escape them outside
301    // quotes.  \u and \U are not recognized within quotes.  The same
302    // logic applies to literals, but literals are never escaped.
303    if (isLiteral ||
304        (escapeUnprintable && ICU_Utility::isUnprintable(c))) {
305        if (quoteBuf.length() > 0) {
306            // We prefer backslash APOSTROPHE to double APOSTROPHE
307            // (more readable, less similar to ") so if there are
308            // double APOSTROPHEs at the ends, we pull them outside
309            // of the quote.
310
311            // If the first thing in the quoteBuf is APOSTROPHE
312            // (doubled) then pull it out.
313            while (quoteBuf.length() >= 2 &&
314                   quoteBuf.charAt(0) == APOSTROPHE &&
315                   quoteBuf.charAt(1) == APOSTROPHE) {
316                rule.append(BACKSLASH).append(APOSTROPHE);
317                quoteBuf.remove(0, 2);
318            }
319            // If the last thing in the quoteBuf is APOSTROPHE
320            // (doubled) then remove and count it and add it after.
321            int32_t trailingCount = 0;
322            while (quoteBuf.length() >= 2 &&
323                   quoteBuf.charAt(quoteBuf.length()-2) == APOSTROPHE &&
324                   quoteBuf.charAt(quoteBuf.length()-1) == APOSTROPHE) {
325                quoteBuf.truncate(quoteBuf.length()-2);
326                ++trailingCount;
327            }
328            if (quoteBuf.length() > 0) {
329                rule.append(APOSTROPHE);
330                rule.append(quoteBuf);
331                rule.append(APOSTROPHE);
332                quoteBuf.truncate(0);
333            }
334            while (trailingCount-- > 0) {
335                rule.append(BACKSLASH).append(APOSTROPHE);
336            }
337        }
338        if (c != (UChar32)-1) {
339            /* Since spaces are ignored during parsing, they are
340             * emitted only for readability.  We emit one here
341             * only if there isn't already one at the end of the
342             * rule.
343             */
344            if (c == SPACE) {
345                int32_t len = rule.length();
346                if (len > 0 && rule.charAt(len-1) != c) {
347                    rule.append(c);
348                }
349            } else if (!escapeUnprintable || !ICU_Utility::escapeUnprintable(rule, c)) {
350                rule.append(c);
351            }
352        }
353    }
354
355    // Escape ' and '\' and don't begin a quote just for them
356    else if (quoteBuf.length() == 0 &&
357             (c == APOSTROPHE || c == BACKSLASH)) {
358        rule.append(BACKSLASH);
359        rule.append(c);
360    }
361
362    // Specials (printable ascii that isn't [0-9a-zA-Z]) and
363    // whitespace need quoting.  Also append stuff to quotes if we are
364    // building up a quoted substring already.
365    else if (quoteBuf.length() > 0 ||
366             (c >= 0x0021 && c <= 0x007E &&
367              !((c >= 0x0030/*'0'*/ && c <= 0x0039/*'9'*/) ||
368                (c >= 0x0041/*'A'*/ && c <= 0x005A/*'Z'*/) ||
369                (c >= 0x0061/*'a'*/ && c <= 0x007A/*'z'*/))) ||
370             PatternProps::isWhiteSpace(c)) {
371        quoteBuf.append(c);
372        // Double ' within a quote
373        if (c == APOSTROPHE) {
374            quoteBuf.append(c);
375        }
376    }
377
378    // Otherwise just append
379    else {
380        rule.append(c);
381    }
382}
383
384void ICU_Utility::appendToRule(UnicodeString& rule,
385                               const UnicodeString& text,
386                               UBool isLiteral,
387                               UBool escapeUnprintable,
388                               UnicodeString& quoteBuf) {
389    for (int32_t i=0; i<text.length(); ++i) {
390        appendToRule(rule, text[i], isLiteral, escapeUnprintable, quoteBuf);
391    }
392}
393
394/**
395 * Given a matcher reference, which may be null, append its
396 * pattern as a literal to the given rule.
397 */
398void ICU_Utility::appendToRule(UnicodeString& rule,
399                               const UnicodeMatcher* matcher,
400                               UBool escapeUnprintable,
401                               UnicodeString& quoteBuf) {
402    if (matcher != NULL) {
403        UnicodeString pat;
404        appendToRule(rule, matcher->toPattern(pat, escapeUnprintable),
405                     TRUE, escapeUnprintable, quoteBuf);
406    }
407}
408
409U_NAMESPACE_END
410