1/*
2**********************************************************************
3*   Copyright (c) 2001-2008, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5**********************************************************************
6*   Date        Name        Description
7*   11/19/2001  aliu        Creation.
8**********************************************************************
9*/
10
11#include "util.h"
12#include "unicode/unimatch.h"
13#include "unicode/uniset.h"
14
15// Define UChar constants using hex for EBCDIC compatibility
16
17static const UChar BACKSLASH  = 0x005C; /*\*/
18static const UChar UPPER_U    = 0x0055; /*U*/
19static const UChar LOWER_U    = 0x0075; /*u*/
20static const UChar APOSTROPHE = 0x0027; // '\''
21static const UChar SPACE      = 0x0020; // ' '
22
23// "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
24static const UChar DIGITS[] = {
25    48,49,50,51,52,53,54,55,56,57,
26    65,66,67,68,69,70,71,72,73,74,
27    75,76,77,78,79,80,81,82,83,84,
28    85,86,87,88,89,90
29};
30
31U_NAMESPACE_BEGIN
32
33UnicodeString& ICU_Utility::appendNumber(UnicodeString& result, int32_t n,
34                                     int32_t radix, int32_t minDigits) {
35    if (radix < 2 || radix > 36) {
36        // Bogus radix
37        return result.append((UChar)63/*?*/);
38    }
39    // Handle negatives
40    if (n < 0) {
41        n = -n;
42        result.append((UChar)45/*-*/);
43    }
44    // First determine the number of digits
45    int32_t nn = n;
46    int32_t r = 1;
47    while (nn >= radix) {
48        nn /= radix;
49        r *= radix;
50        --minDigits;
51    }
52    // Now generate the digits
53    while (--minDigits > 0) {
54        result.append(DIGITS[0]);
55    }
56    while (r > 0) {
57        int32_t digit = n / r;
58        result.append(DIGITS[digit]);
59        n -= digit * r;
60        r /= radix;
61    }
62    return result;
63}
64
65/**
66 * Return true if the character is NOT printable ASCII.
67 */
68UBool ICU_Utility::isUnprintable(UChar32 c) {
69    return !(c >= 0x20 && c <= 0x7E);
70}
71
72/**
73 * Escape unprintable characters using \uxxxx notation for U+0000 to
74 * U+FFFF and \Uxxxxxxxx for U+10000 and above.  If the character is
75 * printable ASCII, then do nothing and return FALSE.  Otherwise,
76 * append the escaped notation and return TRUE.
77 */
78UBool ICU_Utility::escapeUnprintable(UnicodeString& result, UChar32 c) {
79    if (isUnprintable(c)) {
80        result.append(BACKSLASH);
81        if (c & ~0xFFFF) {
82            result.append(UPPER_U);
83            result.append(DIGITS[0xF&(c>>28)]);
84            result.append(DIGITS[0xF&(c>>24)]);
85            result.append(DIGITS[0xF&(c>>20)]);
86            result.append(DIGITS[0xF&(c>>16)]);
87        } else {
88            result.append(LOWER_U);
89        }
90        result.append(DIGITS[0xF&(c>>12)]);
91        result.append(DIGITS[0xF&(c>>8)]);
92        result.append(DIGITS[0xF&(c>>4)]);
93        result.append(DIGITS[0xF&c]);
94        return TRUE;
95    }
96    return FALSE;
97}
98
99/**
100 * Returns the index of a character, ignoring quoted text.
101 * For example, in the string "abc'hide'h", the 'h' in "hide" will not be
102 * found by a search for 'h'.
103 */
104// FOR FUTURE USE.  DISABLE FOR NOW for coverage reasons.
105/*
106int32_t ICU_Utility::quotedIndexOf(const UnicodeString& text,
107                               int32_t start, int32_t limit,
108                               UChar charToFind) {
109    for (int32_t i=start; i<limit; ++i) {
110        UChar c = text.charAt(i);
111        if (c == BACKSLASH) {
112            ++i;
113        } else if (c == APOSTROPHE) {
114            while (++i < limit
115                   && text.charAt(i) != APOSTROPHE) {}
116        } else if (c == charToFind) {
117            return i;
118        }
119    }
120    return -1;
121}
122*/
123
124/**
125 * Skip over a sequence of zero or more white space characters at pos.
126 * @param advance if true, advance pos to the first non-white-space
127 * character at or after pos, or str.length(), if there is none.
128 * Otherwise leave pos unchanged.
129 * @return the index of the first non-white-space character at or
130 * after pos, or str.length(), if there is none.
131 */
132int32_t ICU_Utility::skipWhitespace(const UnicodeString& str, int32_t& pos,
133                                    UBool advance) {
134    int32_t p = pos;
135    while (p < str.length()) {
136        UChar32 c = str.char32At(p);
137        if (!uprv_isRuleWhiteSpace(c)) {
138            break;
139        }
140        p += UTF_CHAR_LENGTH(c);
141    }
142    if (advance) {
143        pos = p;
144    }
145    return p;
146}
147
148/**
149 * Skip over whitespace in a Replaceable.  Whitespace is defined by
150 * uprv_isRuleWhiteSpace().  Skipping may be done in the forward or
151 * reverse direction.  In either case, the leftmost index will be
152 * inclusive, and the rightmost index will be exclusive.  That is,
153 * given a range defined as [start, limit), the call
154 * skipWhitespace(text, start, limit) will advance start past leading
155 * whitespace, whereas the call skipWhitespace(text, limit, start),
156 * will back up limit past trailing whitespace.
157 * @param text the text to be analyzed
158 * @param pos either the start or limit of a range of 'text', to skip
159 * leading or trailing whitespace, respectively
160 * @param stop either the limit or start of a range of 'text', to skip
161 * leading or trailing whitespace, respectively
162 * @return the new start or limit, depending on what was passed in to
163 * 'pos'
164 */
165//?FOR FUTURE USE.  DISABLE FOR NOW for coverage reasons.
166//?int32_t ICU_Utility::skipWhitespace(const Replaceable& text,
167//?                                    int32_t pos, int32_t stop) {
168//?    UChar32 c;
169//?    UBool isForward = (stop >= pos);
170//?
171//?    if (!isForward) {
172//?        --pos; // pos is a limit, so back up by one
173//?    }
174//?
175//?    while (pos != stop &&
176//?           uprv_isRuleWhiteSpace(c = text.char32At(pos))) {
177//?        if (isForward) {
178//?            pos += UTF_CHAR_LENGTH(c);
179//?        } else {
180//?            pos -= UTF_CHAR_LENGTH(c);
181//?        }
182//?    }
183//?
184//?    if (!isForward) {
185//?        ++pos; // make pos back into a limit
186//?    }
187//?
188//?    return pos;
189//?}
190
191/**
192 * Parse a single non-whitespace character 'ch', optionally
193 * preceded by whitespace.
194 * @param id the string to be parsed
195 * @param pos INPUT-OUTPUT parameter.  On input, pos[0] is the
196 * offset of the first character to be parsed.  On output, pos[0]
197 * is the index after the last parsed character.  If the parse
198 * fails, pos[0] will be unchanged.
199 * @param ch the non-whitespace character to be parsed.
200 * @return true if 'ch' is seen preceded by zero or more
201 * whitespace characters.
202 */
203UBool ICU_Utility::parseChar(const UnicodeString& id, int32_t& pos, UChar ch) {
204    int32_t start = pos;
205    skipWhitespace(id, pos, TRUE);
206    if (pos == id.length() ||
207        id.charAt(pos) != ch) {
208        pos = start;
209        return FALSE;
210    }
211    ++pos;
212    return TRUE;
213}
214
215/**
216 * Parse a pattern string within the given Replaceable and a parsing
217 * pattern.  Characters are matched literally and case-sensitively
218 * except for the following special characters:
219 *
220 * ~  zero or more uprv_isRuleWhiteSpace chars
221 *
222 * If end of pattern is reached with all matches along the way,
223 * pos is advanced to the first unparsed index and returned.
224 * Otherwise -1 is returned.
225 * @param pat pattern that controls parsing
226 * @param text text to be parsed, starting at index
227 * @param index offset to first character to parse
228 * @param limit offset after last character to parse
229 * @return index after last parsed character, or -1 on parse failure.
230 */
231int32_t ICU_Utility::parsePattern(const UnicodeString& pat,
232                                  const Replaceable& text,
233                                  int32_t index,
234                                  int32_t limit) {
235    int32_t ipat = 0;
236
237    // empty pattern matches immediately
238    if (ipat == pat.length()) {
239        return index;
240    }
241
242    UChar32 cpat = pat.char32At(ipat);
243
244    while (index < limit) {
245        UChar32 c = text.char32At(index);
246
247        // parse \s*
248        if (cpat == 126 /*~*/) {
249            if (uprv_isRuleWhiteSpace(c)) {
250                index += UTF_CHAR_LENGTH(c);
251                continue;
252            } else {
253                if (++ipat == pat.length()) {
254                    return index; // success; c unparsed
255                }
256                // fall thru; process c again with next cpat
257            }
258        }
259
260        // parse literal
261        else if (c == cpat) {
262            index += UTF_CHAR_LENGTH(c);
263            ipat += UTF_CHAR_LENGTH(cpat);
264            if (ipat == pat.length()) {
265                return index; // success; c parsed
266            }
267            // fall thru; get next cpat
268        }
269
270        // match failure of literal
271        else {
272            return -1;
273        }
274
275        cpat = pat.char32At(ipat);
276    }
277
278    return -1; // text ended before end of pat
279}
280
281/**
282 * Append a character to a rule that is being built up.  To flush
283 * the quoteBuf to rule, make one final call with isLiteral == TRUE.
284 * If there is no final character, pass in (UChar32)-1 as c.
285 * @param rule the string to append the character to
286 * @param c the character to append, or (UChar32)-1 if none.
287 * @param isLiteral if true, then the given character should not be
288 * quoted or escaped.  Usually this means it is a syntactic element
289 * such as > or $
290 * @param escapeUnprintable if true, then unprintable characters
291 * should be escaped using \uxxxx or \Uxxxxxxxx.  These escapes will
292 * appear outside of quotes.
293 * @param quoteBuf a buffer which is used to build up quoted
294 * substrings.  The caller should initially supply an empty buffer,
295 * and thereafter should not modify the buffer.  The buffer should be
296 * cleared out by, at the end, calling this method with a literal
297 * character.
298 */
299void ICU_Utility::appendToRule(UnicodeString& rule,
300                               UChar32 c,
301                               UBool isLiteral,
302                               UBool escapeUnprintable,
303                               UnicodeString& quoteBuf) {
304    // If we are escaping unprintables, then escape them outside
305    // quotes.  \u and \U are not recognized within quotes.  The same
306    // logic applies to literals, but literals are never escaped.
307    if (isLiteral ||
308        (escapeUnprintable && ICU_Utility::isUnprintable(c))) {
309        if (quoteBuf.length() > 0) {
310            // We prefer backslash APOSTROPHE to double APOSTROPHE
311            // (more readable, less similar to ") so if there are
312            // double APOSTROPHEs at the ends, we pull them outside
313            // of the quote.
314
315            // If the first thing in the quoteBuf is APOSTROPHE
316            // (doubled) then pull it out.
317            while (quoteBuf.length() >= 2 &&
318                   quoteBuf.charAt(0) == APOSTROPHE &&
319                   quoteBuf.charAt(1) == APOSTROPHE) {
320                rule.append(BACKSLASH).append(APOSTROPHE);
321                quoteBuf.remove(0, 2);
322            }
323            // If the last thing in the quoteBuf is APOSTROPHE
324            // (doubled) then remove and count it and add it after.
325            int32_t trailingCount = 0;
326            while (quoteBuf.length() >= 2 &&
327                   quoteBuf.charAt(quoteBuf.length()-2) == APOSTROPHE &&
328                   quoteBuf.charAt(quoteBuf.length()-1) == APOSTROPHE) {
329                quoteBuf.truncate(quoteBuf.length()-2);
330                ++trailingCount;
331            }
332            if (quoteBuf.length() > 0) {
333                rule.append(APOSTROPHE);
334                rule.append(quoteBuf);
335                rule.append(APOSTROPHE);
336                quoteBuf.truncate(0);
337            }
338            while (trailingCount-- > 0) {
339                rule.append(BACKSLASH).append(APOSTROPHE);
340            }
341        }
342        if (c != (UChar32)-1) {
343            /* Since spaces are ignored during parsing, they are
344             * emitted only for readability.  We emit one here
345             * only if there isn't already one at the end of the
346             * rule.
347             */
348            if (c == SPACE) {
349                int32_t len = rule.length();
350                if (len > 0 && rule.charAt(len-1) != c) {
351                    rule.append(c);
352                }
353            } else if (!escapeUnprintable || !ICU_Utility::escapeUnprintable(rule, c)) {
354                rule.append(c);
355            }
356        }
357    }
358
359    // Escape ' and '\' and don't begin a quote just for them
360    else if (quoteBuf.length() == 0 &&
361             (c == APOSTROPHE || c == BACKSLASH)) {
362        rule.append(BACKSLASH);
363        rule.append(c);
364    }
365
366    // Specials (printable ascii that isn't [0-9a-zA-Z]) and
367    // whitespace need quoting.  Also append stuff to quotes if we are
368    // building up a quoted substring already.
369    else if (quoteBuf.length() > 0 ||
370             (c >= 0x0021 && c <= 0x007E &&
371              !((c >= 0x0030/*'0'*/ && c <= 0x0039/*'9'*/) ||
372                (c >= 0x0041/*'A'*/ && c <= 0x005A/*'Z'*/) ||
373                (c >= 0x0061/*'a'*/ && c <= 0x007A/*'z'*/))) ||
374             uprv_isRuleWhiteSpace(c)) {
375        quoteBuf.append(c);
376        // Double ' within a quote
377        if (c == APOSTROPHE) {
378            quoteBuf.append(c);
379        }
380    }
381
382    // Otherwise just append
383    else {
384        rule.append(c);
385    }
386}
387
388void ICU_Utility::appendToRule(UnicodeString& rule,
389                               const UnicodeString& text,
390                               UBool isLiteral,
391                               UBool escapeUnprintable,
392                               UnicodeString& quoteBuf) {
393    for (int32_t i=0; i<text.length(); ++i) {
394        appendToRule(rule, text[i], isLiteral, escapeUnprintable, quoteBuf);
395    }
396}
397
398/**
399 * Given a matcher reference, which may be null, append its
400 * pattern as a literal to the given rule.
401 */
402void ICU_Utility::appendToRule(UnicodeString& rule,
403                               const UnicodeMatcher* matcher,
404                               UBool escapeUnprintable,
405                               UnicodeString& quoteBuf) {
406    if (matcher != NULL) {
407        UnicodeString pat;
408        appendToRule(rule, matcher->toPattern(pat, escapeUnprintable),
409                     TRUE, escapeUnprintable, quoteBuf);
410    }
411}
412
413U_NAMESPACE_END
414
415U_CAPI UBool U_EXPORT2
416uprv_isRuleWhiteSpace(UChar32 c) {
417    /* "white space" in the sense of ICU rule parsers
418       This is a FIXED LIST that is NOT DEPENDENT ON UNICODE PROPERTIES.
419       See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/
420       U+0009..U+000D, U+0020, U+0085, U+200E..U+200F, and U+2028..U+2029
421       Equivalent to test for Pattern_White_Space Unicode property.
422    */
423    return (c >= 0x0009 && c <= 0x2029 &&
424            (c <= 0x000D || c == 0x0020 || c == 0x0085 ||
425             c == 0x200E || c == 0x200F || c >= 0x2028));
426}
427
428U_CAPI U_NAMESPACE_QUALIFIER UnicodeSet* U_EXPORT2
429uprv_openRuleWhiteSpaceSet(UErrorCode* ec) {
430    if(U_FAILURE(*ec)) {
431        return NULL;
432    }
433    // create a set with the Pattern_White_Space characters,
434    // without a pattern for fewer code dependencies
435    U_NAMESPACE_QUALIFIER UnicodeSet *set=new U_NAMESPACE_QUALIFIER UnicodeSet(9, 0xd);
436    // Check for new failure.
437    if (set == NULL) {
438        *ec = U_MEMORY_ALLOCATION_ERROR;
439        return NULL;
440    }
441    set->UnicodeSet::add(0x20).add(0x85).add(0x200e, 0x200f).add(0x2028, 0x2029);
442    return set;
443}
444
445//eof
446