uparse.c revision 51cfa1a9a96cad34675a6415fe86dfdf3f525bb6
1/*
2*******************************************************************************
3*
4*   Copyright (C) 2000-2007, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*   file name:  uparse.c
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2000apr18
14*   created by: Markus W. Scherer
15*
16*   This file provides a parser for files that are delimited by one single
17*   character like ';' or TAB. Example: the Unicode Character Properties files
18*   like UnicodeData.txt are semicolon-delimited.
19*/
20
21#include "unicode/utypes.h"
22#include "cstring.h"
23#include "filestrm.h"
24#include "uparse.h"
25#include "unicode/uchar.h"
26#include "unicode/ustring.h"
27#include "ustr_imp.h"
28
29#include <stdio.h>
30
31U_CAPI const char * U_EXPORT2
32u_skipWhitespace(const char *s) {
33    while(*s==' ' || *s=='\t') {
34        ++s;
35    }
36    return s;
37}
38
/*
 * Recognize a "# @missing:" prefix.
 *
 * The tokens '#', '@', "missing" and ':' must appear in this order; each of
 * them may be preceded by any number of spaces and tabs.
 * If the whole prefix is present, return a pointer to the first character
 * after the ':' that is not a space or tab.
 * Otherwise return s unchanged.
 *
 * Unicode 5.0 adds such lines in some data files to document
 * default property values.
 */
static const char *
getMissingLimit(const char *s) {
    const char *p;

    p=u_skipWhitespace(s);
    if(*p!='#') {
        return s;
    }

    p=u_skipWhitespace(p+1);
    if(*p!='@') {
        return s;
    }

    p=u_skipWhitespace(p+1);
    if(0!=strncmp(p, "missing", 7)) {
        return s;
    }

    /* 7 is the length of "missing" */
    p=u_skipWhitespace(p+7);
    if(*p!=':') {
        return s;
    }

    return u_skipWhitespace(p+1);
}
61
62U_CAPI void U_EXPORT2
63u_parseDelimitedFile(const char *filename, char delimiter,
64                     char *fields[][2], int32_t fieldCount,
65                     UParseLineFn *lineFn, void *context,
66                     UErrorCode *pErrorCode) {
67    FileStream *file;
68    char line[300];
69    char *start, *limit;
70    int32_t i, length;
71
72    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
73        return;
74    }
75
76    if(fields==NULL || lineFn==NULL || fieldCount<=0) {
77        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
78        return;
79    }
80
81    if(filename==NULL || *filename==0 || (*filename=='-' && filename[1]==0)) {
82        filename=NULL;
83        file=T_FileStream_stdin();
84    } else {
85        file=T_FileStream_open(filename, "r");
86    }
87    if(file==NULL) {
88        *pErrorCode=U_FILE_ACCESS_ERROR;
89        return;
90    }
91
92    while(T_FileStream_readLine(file, line, sizeof(line))!=NULL) {
93        length=(int32_t)uprv_strlen(line);
94
95        /* remove trailing newline characters */
96        while(length>0 && (line[length-1]=='\r' || line[length-1]=='\n')) {
97            line[--length]=0;
98        }
99
100        /*
101         * detect a line with # @missing:
102         * start parsing after that, or else from the beginning of the line
103         * set the default warning for @missing lines
104         */
105        start=(char *)getMissingLimit(line);
106        if(start==line) {
107            *pErrorCode=U_ZERO_ERROR;
108        } else {
109            *pErrorCode=U_USING_DEFAULT_WARNING;
110        }
111
112        /* skip this line if it is empty or a comment */
113        if(*start==0 || *start=='#') {
114            continue;
115        }
116
117        /* remove in-line comments */
118        limit=uprv_strchr(start, '#');
119        if(limit!=NULL) {
120            /* get white space before the pound sign */
121            while(limit>start && (*(limit-1)==' ' || *(limit-1)=='\t')) {
122                --limit;
123            }
124
125            /* truncate the line */
126            *limit=0;
127        }
128
129        /* skip lines with only whitespace */
130        if(u_skipWhitespace(start)[0]==0) {
131            continue;
132        }
133
134        /* for each field, call the corresponding field function */
135        for(i=0; i<fieldCount; ++i) {
136            /* set the limit pointer of this field */
137            limit=start;
138            while(*limit!=delimiter && *limit!=0) {
139                ++limit;
140            }
141
142            /* set the field start and limit in the fields array */
143            fields[i][0]=start;
144            fields[i][1]=limit;
145
146            /* set start to the beginning of the next field, if any */
147            start=limit;
148            if(*start!=0) {
149                ++start;
150            } else if(i+1<fieldCount) {
151                *pErrorCode=U_PARSE_ERROR;
152                limit=line+length;
153                i=fieldCount;
154                break;
155            }
156        }
157
158        /* error in a field function? */
159        if(U_FAILURE(*pErrorCode)) {
160            break;
161        }
162
163        /* call the field function */
164        lineFn(context, fields, fieldCount, pErrorCode);
165        if(U_FAILURE(*pErrorCode)) {
166            break;
167        }
168    }
169
170    if(filename!=NULL) {
171        T_FileStream_close(file);
172    }
173}
174
175/*
176 * parse a list of code points
177 * store them as a UTF-32 string in dest[destCapacity]
178 * return the number of code points
179 */
180U_CAPI int32_t U_EXPORT2
181u_parseCodePoints(const char *s,
182                  uint32_t *dest, int32_t destCapacity,
183                  UErrorCode *pErrorCode) {
184    char *end;
185    uint32_t value;
186    int32_t count;
187
188    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
189        return 0;
190    }
191    if(s==NULL || destCapacity<0 || (destCapacity>0 && dest==NULL)) {
192        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
193        return 0;
194    }
195
196    count=0;
197    for(;;) {
198        s=u_skipWhitespace(s);
199        if(*s==';' || *s==0) {
200            return count;
201        }
202
203        /* read one code point */
204        value=(uint32_t)uprv_strtoul(s, &end, 16);
205        if(end<=s || (*end!=' ' && *end!='\t' && *end!=';' && *end!=0) || value>=0x110000) {
206            *pErrorCode=U_PARSE_ERROR;
207            return 0;
208        }
209
210        /* append it to the destination array */
211        if(count<destCapacity) {
212            dest[count++]=value;
213        } else {
214            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
215        }
216
217        /* go to the following characters */
218        s=end;
219    }
220}
221
222/*
223 * parse a list of code points
224 * store them as a string in dest[destCapacity]
225 * set the first code point in *pFirst
226 * @return The length of the string in numbers of UChars.
227 */
228U_CAPI int32_t U_EXPORT2
229u_parseString(const char *s,
230              UChar *dest, int32_t destCapacity,
231              uint32_t *pFirst,
232              UErrorCode *pErrorCode) {
233    char *end;
234    uint32_t value;
235    int32_t destLength;
236
237    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
238        return 0;
239    }
240    if(s==NULL || destCapacity<0 || (destCapacity>0 && dest==NULL)) {
241        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
242    }
243
244    if(pFirst!=NULL) {
245        *pFirst=0xffffffff;
246    }
247
248    destLength=0;
249    for(;;) {
250        s=u_skipWhitespace(s);
251        if(*s==';' || *s==0) {
252            if(destLength<destCapacity) {
253                dest[destLength]=0;
254            } else if(destLength==destCapacity) {
255                *pErrorCode=U_STRING_NOT_TERMINATED_WARNING;
256            } else {
257                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
258            }
259            return destLength;
260        }
261
262        /* read one code point */
263        value=(uint32_t)uprv_strtoul(s, &end, 16);
264        if(end<=s || (*end!=' ' && *end!='\t' && *end!=';' && *end!=0) || value>=0x110000) {
265            *pErrorCode=U_PARSE_ERROR;
266            return 0;
267        }
268
269        /* store the first code point */
270        if(destLength==0 && pFirst!=NULL) {
271            *pFirst=value;
272        }
273
274        /* append it to the destination array */
275        if((destLength+UTF_CHAR_LENGTH(value))<=destCapacity) {
276            UTF_APPEND_CHAR_UNSAFE(dest, destLength, value);
277        } else {
278            destLength+=UTF_CHAR_LENGTH(value);
279        }
280
281        /* go to the following characters */
282        s=end;
283    }
284}
285
286/* read a range like start or start..end */
287U_CAPI int32_t U_EXPORT2
288u_parseCodePointRange(const char *s,
289                      uint32_t *pStart, uint32_t *pEnd,
290                      UErrorCode *pErrorCode) {
291    char *end;
292    uint32_t value;
293
294    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
295        return 0;
296    }
297    if(s==NULL || pStart==NULL || pEnd==NULL) {
298        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
299        return 0;
300    }
301
302    s=u_skipWhitespace(s);
303    if(*s==';' || *s==0) {
304        *pErrorCode=U_PARSE_ERROR;
305        return 0;
306    }
307
308    /* read the start code point */
309    value=(uint32_t)uprv_strtoul(s, &end, 16);
310    if(end<=s || (*end!=' ' && *end!='\t' && *end!='.' && *end!=';') || value>=0x110000) {
311        *pErrorCode=U_PARSE_ERROR;
312        return 0;
313    }
314    *pStart=*pEnd=value;
315
316    /* is there a "..end"? */
317    s=u_skipWhitespace(end);
318    if(*s==';' || *s==0) {
319        return 1;
320    }
321
322    if(*s!='.' || s[1]!='.') {
323        *pErrorCode=U_PARSE_ERROR;
324        return 0;
325    }
326    s+=2;
327
328    /* read the end code point */
329    value=(uint32_t)uprv_strtoul(s, &end, 16);
330    if(end<=s || (*end!=' ' && *end!='\t' && *end!=';') || value>=0x110000) {
331        *pErrorCode=U_PARSE_ERROR;
332        return 0;
333    }
334    *pEnd=value;
335
336    /* is this a valid range? */
337    if(value<*pStart) {
338        *pErrorCode=U_PARSE_ERROR;
339        return 0;
340    }
341
342    /* no garbage after that? */
343    s=u_skipWhitespace(end);
344    if(*s==';' || *s==0) {
345        return value-*pStart+1;
346    } else {
347        *pErrorCode=U_PARSE_ERROR;
348        return 0;
349    }
350}
351
352U_CAPI int32_t U_EXPORT2
353u_parseUTF8(const char *source, int32_t sLen, char *dest, int32_t destCapacity, UErrorCode *status) {
354    const char *read = source;
355    int32_t i = 0;
356    unsigned int value = 0;
357    if(sLen == -1) {
358        sLen = (int32_t)strlen(source);
359    }
360
361    while(read < source+sLen) {
362        sscanf(read, "%2x", &value);
363        if(i < destCapacity) {
364            dest[i] = (char)value;
365        }
366        i++;
367        read += 2;
368    }
369    return u_terminateChars(dest, destCapacity, i, status);
370}
371