1/*
2*******************************************************************************
3*
4*   Copyright (C) 2000-2012, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*   file name:  uparse.c
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2000apr18
14*   created by: Markus W. Scherer
15*
16*   This file provides a parser for files that are delimited by one single
17*   character like ';' or TAB. Example: the Unicode Character Properties files
18*   like UnicodeData.txt are semicolon-delimited.
19*/
20
21#include "unicode/utypes.h"
22#include "unicode/uchar.h"
23#include "unicode/ustring.h"
24#include "unicode/utf16.h"
25#include "cstring.h"
26#include "filestrm.h"
27#include "uparse.h"
28#include "ustr_imp.h"
29
30#include <stdio.h>
31
32U_CAPI const char * U_EXPORT2
33u_skipWhitespace(const char *s) {
34    while(U_IS_INV_WHITESPACE(*s)) {
35        ++s;
36    }
37    return s;
38}
39
40U_CAPI char * U_EXPORT2
41u_rtrim(char *s) {
42    char *end=uprv_strchr(s, 0);
43    while(s<end && U_IS_INV_WHITESPACE(*(end-1))) {
44        *--end = 0;
45    }
46    return end;
47}
48
/*
 * Recognize a "# @missing:" prefix.
 * The four tokens '#', '@', "missing" and ':' may each be preceded by any
 * amount of white space (as skipped by u_skipWhitespace()).
 * If all four are found in this order, return a pointer to the first
 * non-whitespace character after the colon.
 * Otherwise return s unchanged, so that a caller can detect the prefix by
 * comparing the result with its argument.
 * Unicode 5.0 adds such lines in some data files to document
 * default property values.
 */
static const char *
getMissingLimit(const char *s) {
    const char *p;

    p=u_skipWhitespace(s);
    if(*p!='#') {
        return s;
    }

    p=u_skipWhitespace(p+1);
    if(*p!='@') {
        return s;
    }

    p=u_skipWhitespace(p+1);
    if(strncmp(p, "missing", 7)!=0) {
        return s;
    }

    /* 7 is the length of "missing" */
    p=u_skipWhitespace(p+7);
    if(*p!=':') {
        return s;
    }

    return u_skipWhitespace(p+1);
}
71
72U_CAPI void U_EXPORT2
73u_parseDelimitedFile(const char *filename, char delimiter,
74                     char *fields[][2], int32_t fieldCount,
75                     UParseLineFn *lineFn, void *context,
76                     UErrorCode *pErrorCode) {
77    FileStream *file;
78    char line[300];
79    char *start, *limit;
80    int32_t i, length;
81
82    if(U_FAILURE(*pErrorCode)) {
83        return;
84    }
85
86    if(fields==NULL || lineFn==NULL || fieldCount<=0) {
87        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
88        return;
89    }
90
91    if(filename==NULL || *filename==0 || (*filename=='-' && filename[1]==0)) {
92        filename=NULL;
93        file=T_FileStream_stdin();
94    } else {
95        file=T_FileStream_open(filename, "r");
96    }
97    if(file==NULL) {
98        *pErrorCode=U_FILE_ACCESS_ERROR;
99        return;
100    }
101
102    while(T_FileStream_readLine(file, line, sizeof(line))!=NULL) {
103        /* remove trailing newline characters */
104        length=(int32_t)(u_rtrim(line)-line);
105
106        /*
107         * detect a line with # @missing:
108         * start parsing after that, or else from the beginning of the line
109         * set the default warning for @missing lines
110         */
111        start=(char *)getMissingLimit(line);
112        if(start==line) {
113            *pErrorCode=U_ZERO_ERROR;
114        } else {
115            *pErrorCode=U_USING_DEFAULT_WARNING;
116        }
117
118        /* skip this line if it is empty or a comment */
119        if(*start==0 || *start=='#') {
120            continue;
121        }
122
123        /* remove in-line comments */
124        limit=uprv_strchr(start, '#');
125        if(limit!=NULL) {
126            /* get white space before the pound sign */
127            while(limit>start && U_IS_INV_WHITESPACE(*(limit-1))) {
128                --limit;
129            }
130
131            /* truncate the line */
132            *limit=0;
133        }
134
135        /* skip lines with only whitespace */
136        if(u_skipWhitespace(start)[0]==0) {
137            continue;
138        }
139
140        /* for each field, call the corresponding field function */
141        for(i=0; i<fieldCount; ++i) {
142            /* set the limit pointer of this field */
143            limit=start;
144            while(*limit!=delimiter && *limit!=0) {
145                ++limit;
146            }
147
148            /* set the field start and limit in the fields array */
149            fields[i][0]=start;
150            fields[i][1]=limit;
151
152            /* set start to the beginning of the next field, if any */
153            start=limit;
154            if(*start!=0) {
155                ++start;
156            } else if(i+1<fieldCount) {
157                *pErrorCode=U_PARSE_ERROR;
158                limit=line+length;
159                i=fieldCount;
160                break;
161            }
162        }
163
164        /* error in a field function? */
165        if(U_FAILURE(*pErrorCode)) {
166            break;
167        }
168
169        /* call the field function */
170        lineFn(context, fields, fieldCount, pErrorCode);
171        if(U_FAILURE(*pErrorCode)) {
172            break;
173        }
174    }
175
176    if(filename!=NULL) {
177        T_FileStream_close(file);
178    }
179}
180
181/*
182 * parse a list of code points
183 * store them as a UTF-32 string in dest[destCapacity]
184 * return the number of code points
185 */
186U_CAPI int32_t U_EXPORT2
187u_parseCodePoints(const char *s,
188                  uint32_t *dest, int32_t destCapacity,
189                  UErrorCode *pErrorCode) {
190    char *end;
191    uint32_t value;
192    int32_t count;
193
194    if(U_FAILURE(*pErrorCode)) {
195        return 0;
196    }
197    if(s==NULL || destCapacity<0 || (destCapacity>0 && dest==NULL)) {
198        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
199        return 0;
200    }
201
202    count=0;
203    for(;;) {
204        s=u_skipWhitespace(s);
205        if(*s==';' || *s==0) {
206            return count;
207        }
208
209        /* read one code point */
210        value=(uint32_t)uprv_strtoul(s, &end, 16);
211        if(end<=s || (!U_IS_INV_WHITESPACE(*end) && *end!=';' && *end!=0) || value>=0x110000) {
212            *pErrorCode=U_PARSE_ERROR;
213            return 0;
214        }
215
216        /* append it to the destination array */
217        if(count<destCapacity) {
218            dest[count++]=value;
219        } else {
220            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
221        }
222
223        /* go to the following characters */
224        s=end;
225    }
226}
227
228/*
229 * parse a list of code points
230 * store them as a string in dest[destCapacity]
231 * set the first code point in *pFirst
232 * @return The length of the string in numbers of UChars.
233 */
234U_CAPI int32_t U_EXPORT2
235u_parseString(const char *s,
236              UChar *dest, int32_t destCapacity,
237              uint32_t *pFirst,
238              UErrorCode *pErrorCode) {
239    char *end;
240    uint32_t value;
241    int32_t destLength;
242
243    if(U_FAILURE(*pErrorCode)) {
244        return 0;
245    }
246    if(s==NULL || destCapacity<0 || (destCapacity>0 && dest==NULL)) {
247        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
248        return 0;
249    }
250
251    if(pFirst!=NULL) {
252        *pFirst=0xffffffff;
253    }
254
255    destLength=0;
256    for(;;) {
257        s=u_skipWhitespace(s);
258        if(*s==';' || *s==0) {
259            if(destLength<destCapacity) {
260                dest[destLength]=0;
261            } else if(destLength==destCapacity) {
262                *pErrorCode=U_STRING_NOT_TERMINATED_WARNING;
263            } else {
264                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
265            }
266            return destLength;
267        }
268
269        /* read one code point */
270        value=(uint32_t)uprv_strtoul(s, &end, 16);
271        if(end<=s || (!U_IS_INV_WHITESPACE(*end) && *end!=';' && *end!=0) || value>=0x110000) {
272            *pErrorCode=U_PARSE_ERROR;
273            return 0;
274        }
275
276        /* store the first code point */
277        if(pFirst!=NULL) {
278            *pFirst=value;
279            pFirst=NULL;
280        }
281
282        /* append it to the destination array */
283        if((destLength+U16_LENGTH(value))<=destCapacity) {
284            U16_APPEND_UNSAFE(dest, destLength, value);
285        } else {
286            destLength+=U16_LENGTH(value);
287        }
288
289        /* go to the following characters */
290        s=end;
291    }
292}
293
294/* read a range like start or start..end */
295U_CAPI int32_t U_EXPORT2
296u_parseCodePointRangeAnyTerminator(const char *s,
297                                   uint32_t *pStart, uint32_t *pEnd,
298                                   const char **terminator,
299                                   UErrorCode *pErrorCode) {
300    char *end;
301    uint32_t value;
302
303    if(U_FAILURE(*pErrorCode)) {
304        return 0;
305    }
306    if(s==NULL || pStart==NULL || pEnd==NULL) {
307        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
308        return 0;
309    }
310
311    /* read the start code point */
312    s=u_skipWhitespace(s);
313    value=(uint32_t)uprv_strtoul(s, &end, 16);
314    if(end<=s || value>=0x110000) {
315        *pErrorCode=U_PARSE_ERROR;
316        return 0;
317    }
318    *pStart=*pEnd=value;
319
320    /* is there a "..end"? */
321    s=u_skipWhitespace(end);
322    if(*s!='.' || s[1]!='.') {
323        *terminator=end;
324        return 1;
325    }
326    s=u_skipWhitespace(s+2);
327
328    /* read the end code point */
329    value=(uint32_t)uprv_strtoul(s, &end, 16);
330    if(end<=s || value>=0x110000) {
331        *pErrorCode=U_PARSE_ERROR;
332        return 0;
333    }
334    *pEnd=value;
335
336    /* is this a valid range? */
337    if(value<*pStart) {
338        *pErrorCode=U_PARSE_ERROR;
339        return 0;
340    }
341
342    *terminator=end;
343    return value-*pStart+1;
344}
345
346U_CAPI int32_t U_EXPORT2
347u_parseCodePointRange(const char *s,
348                      uint32_t *pStart, uint32_t *pEnd,
349                      UErrorCode *pErrorCode) {
350    const char *terminator;
351    int32_t rangeLength=
352        u_parseCodePointRangeAnyTerminator(s, pStart, pEnd, &terminator, pErrorCode);
353    if(U_SUCCESS(*pErrorCode)) {
354        terminator=u_skipWhitespace(terminator);
355        if(*terminator!=';' && *terminator!=0) {
356            *pErrorCode=U_PARSE_ERROR;
357            return 0;
358        }
359    }
360    return rangeLength;
361}
362
363U_CAPI int32_t U_EXPORT2
364u_parseUTF8(const char *source, int32_t sLen, char *dest, int32_t destCapacity, UErrorCode *status) {
365    const char *read = source;
366    int32_t i = 0;
367    unsigned int value = 0;
368    if(sLen == -1) {
369        sLen = (int32_t)strlen(source);
370    }
371
372    while(read < source+sLen) {
373        sscanf(read, "%2x", &value);
374        if(i < destCapacity) {
375            dest[i] = (char)value;
376        }
377        i++;
378        read += 2;
379    }
380    return u_terminateChars(dest, destCapacity, i, status);
381}
382