1/*
2*******************************************************************************
3*
4*   Copyright (C) 2000-2010, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*   file name:  uparse.h
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2000apr18
14*   created by: Markus W. Scherer
15*
16*   This file provides a parser for files that are delimited by one single
17*   character like ';' or TAB. Example: the Unicode Character Properties files
18*   like UnicodeData.txt are semicolon-delimited.
19*/
20
21#ifndef __UPARSE_H__
22#define __UPARSE_H__
23
24#include "unicode/utypes.h"
25
26/**
27 * Is c an invariant-character whitespace?
28 * @param c invariant character
29 */
30#define U_IS_INV_WHITESPACE(c) ((c)==' ' || (c)=='\t' || (c)=='\r' || (c)=='\n')
31
32U_CDECL_BEGIN
33
34/**
35 * Skip space ' ' and TAB '\t' characters.
36 *
37 * @param s Pointer to characters.
38 * @return Pointer to first character at or after s that is not a space or TAB.
39 */
40U_CAPI const char * U_EXPORT2
41u_skipWhitespace(const char *s);
42
43/**
44 * Trim whitespace (including line endings) from the end of the string.
45 *
46 * @param s Pointer to the string.
47 * @return Pointer to the new end of the string.
48 */
49U_CAPI char * U_EXPORT2
50u_rtrim(char *s);
51
52/** Function type for u_parseDelimitedFile(). */
53typedef void U_CALLCONV
54UParseLineFn(void *context,
55              char *fields[][2],
56              int32_t fieldCount,
57              UErrorCode *pErrorCode);
58
59/**
60 * Parser for files that are similar to UnicodeData.txt:
61 * This function opens the file and reads it line by line. It skips empty lines
62 * and comment lines that start with a '#'.
63 * All other lines are separated into fields with one delimiter character
64 * (semicolon for Unicode Properties files) between two fields. The last field in
65 * a line does not need to be terminated with a delimiter.
66 *
67 * For each line, after segmenting it, a line function is called.
68 * It gets passed the array of field start and limit pointers that is
69 * passed into this parser and filled by it for each line.
70 * For each field i of the line, the start pointer in fields[i][0]
71 * points to the beginning of the field, while the limit pointer in fields[i][1]
72 * points behind the field, i.e., to the delimiter or the line end.
73 *
74 * The context parameter of the line function is
75 * the same as the one for the parse function.
76 *
77 * The line function may modify the contents of the fields including the
78 * limit characters.
79 *
80 * If the file cannot be opened, or there is a parsing error or a field function
81 * sets *pErrorCode, then the parser returns with *pErrorCode set to an error code.
82 */
83U_CAPI void U_EXPORT2
84u_parseDelimitedFile(const char *filename, char delimiter,
85                     char *fields[][2], int32_t fieldCount,
86                     UParseLineFn *lineFn, void *context,
87                     UErrorCode *pErrorCode);
88
89/**
90 * Parse a string of code points like 0061 0308 0300.
91 * s must end with either ';' or NUL.
92 *
93 * @return Number of code points.
94 */
95U_CAPI int32_t U_EXPORT2
96u_parseCodePoints(const char *s,
97                  uint32_t *dest, int32_t destCapacity,
98                  UErrorCode *pErrorCode);
99
100/**
101 * Parse a list of code points like 0061 0308 0300
102 * into a UChar * string.
103 * s must end with either ';' or NUL.
104 *
105 * Set the first code point in *pFirst.
106 *
107 * @param s Input char * string.
108 * @param dest Output string buffer.
109 * @param destCapacity Capacity of dest in numbers of UChars.
110 * @param pFirst If pFirst!=NULL the *pFirst will be set to the first
111 *               code point in the string.
112 * @param pErrorCode ICU error code.
113 * @return The length of the string in numbers of UChars.
114 */
115U_CAPI int32_t U_EXPORT2
116u_parseString(const char *s,
117              UChar *dest, int32_t destCapacity,
118              uint32_t *pFirst,
119              UErrorCode *pErrorCode);
120
121/**
122 * Parse a code point range like
123 * 0085 or
124 * 4E00..9FA5.
125 *
126 * s must contain such a range and end with either ';' or NUL.
127 *
128 * @return Length of code point range, end-start+1
129 */
130U_CAPI int32_t U_EXPORT2
131u_parseCodePointRange(const char *s,
132                      uint32_t *pStart, uint32_t *pEnd,
133                      UErrorCode *pErrorCode);
134
135/**
136 * Same as u_parseCodePointRange() but the range may be terminated by
137 * any character. The position of the terminating character is returned via
138 * the *terminator output parameter.
139 */
140U_CAPI int32_t U_EXPORT2
141u_parseCodePointRangeAnyTerminator(const char *s,
142                                   uint32_t *pStart, uint32_t *pEnd,
143                                   const char **terminator,
144                                   UErrorCode *pErrorCode);
145
146U_CAPI int32_t U_EXPORT2
147u_parseUTF8(const char *source, int32_t sLen, char *dest, int32_t destCapacity, UErrorCode *status);
148
149U_CDECL_END
150
151#endif
152