/*
*******************************************************************************
*
*   Copyright (C) 2000-2012, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  uparse.c
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2000apr18
*   created by: Markus W. Scherer
*
*   This file provides a parser for files that are delimited by one single
*   character like ';' or TAB. Example: the Unicode Character Properties files
*   like UnicodeData.txt are semicolon-delimited.
*/
205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "unicode/utypes.h"
225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "unicode/uchar.h"
235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "unicode/ustring.h"
245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "unicode/utf16.h"
255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "cstring.h"
265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "filestrm.h"
27bb1529ce867d8845a77ec7cdf3e3003ef1771a40Ben Murdoch#include "uparse.h"
28bb1529ce867d8845a77ec7cdf3e3003ef1771a40Ben Murdoch#include "ustr_imp.h"
295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
30bb1529ce867d8845a77ec7cdf3e3003ef1771a40Ben Murdoch#include <stdio.h>
31a36e5920737c6adbddd3e43b760e5de8431db6e0Torne (Richard Coles)
32bb1529ce867d8845a77ec7cdf3e3003ef1771a40Ben MurdochU_CAPI const char * U_EXPORT2
335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)u_skipWhitespace(const char *s) {
345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    while(U_IS_INV_WHITESPACE(*s)) {
355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        ++s;
365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return s;
387d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)}
397d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)
40c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)U_CAPI char * U_EXPORT2
41c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)u_rtrim(char *s) {
42c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)    char *end=uprv_strchr(s, 0);
435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    while(s<end && U_IS_INV_WHITESPACE(*(end-1))) {
44a36e5920737c6adbddd3e43b760e5de8431db6e0Torne (Richard Coles)        *--end = 0;
45a36e5920737c6adbddd3e43b760e5de8431db6e0Torne (Richard Coles)    }
465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return end;
475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
/*
 * Recognize a "# @missing:" line prefix.
 *
 * The four tokens '#', '@', "missing" and ':' may each be preceded by any
 * amount of white space (a poor man's regex).
 * Unicode 5.0 adds such lines in some data files to document
 * default property values.
 *
 * @param s NUL-terminated line
 * @return pointer to the first non-whitespace character after the prefix
 *         if the line starts with it, otherwise the original pointer s
 */
static const char *
getMissingLimit(const char *s) {
    const char *p=u_skipWhitespace(s);

    if(*p!='#') {
        return s;
    }

    p=u_skipWhitespace(p+1);
    if(*p!='@') {
        return s;
    }

    p=u_skipWhitespace(p+1);
    if(strncmp(p, "missing", 7)!=0) {
        return s;
    }

    p=u_skipWhitespace(p+7);
    if(*p!=':') {
        return s;
    }

    /* full prefix matched: continue after the colon */
    return u_skipWhitespace(p+1);
}
715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)U_CAPI void U_EXPORT2
735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)u_parseDelimitedFile(const char *filename, char delimiter,
74c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)                     char *fields[][2], int32_t fieldCount,
755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                     UParseLineFn *lineFn, void *context,
76c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)                     UErrorCode *pErrorCode) {
775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    FileStream *file;
78c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)    char line[300];
795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    char *start, *limit;
805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int32_t i, length;
815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if(U_FAILURE(*pErrorCode)) {
83c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)        return;
845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if(fields==NULL || lineFn==NULL || fieldCount<=0) {
87bb1529ce867d8845a77ec7cdf3e3003ef1771a40Ben Murdoch        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
88bb1529ce867d8845a77ec7cdf3e3003ef1771a40Ben Murdoch        return;
894e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles)    }
904e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles)
914e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles)    if(filename==NULL || *filename==0 || (*filename=='-' && filename[1]==0)) {
924e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles)        filename=NULL;
934e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles)        file=T_FileStream_stdin();
94bb1529ce867d8845a77ec7cdf3e3003ef1771a40Ben Murdoch    } else {
955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        file=T_FileStream_open(filename, "r");
965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
97a36e5920737c6adbddd3e43b760e5de8431db6e0Torne (Richard Coles)    if(file==NULL) {
985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        *pErrorCode=U_FILE_ACCESS_ERROR;
99a36e5920737c6adbddd3e43b760e5de8431db6e0Torne (Richard Coles)        return;
100    }
101
102    while(T_FileStream_readLine(file, line, sizeof(line))!=NULL) {
103        /* remove trailing newline characters */
104        length=(int32_t)(u_rtrim(line)-line);
105
106        /*
107         * detect a line with # @missing:
108         * start parsing after that, or else from the beginning of the line
109         * set the default warning for @missing lines
110         */
111        start=(char *)getMissingLimit(line);
112        if(start==line) {
113            *pErrorCode=U_ZERO_ERROR;
114        } else {
115            *pErrorCode=U_USING_DEFAULT_WARNING;
116        }
117
118        /* skip this line if it is empty or a comment */
119        if(*start==0 || *start=='#') {
120            continue;
121        }
122
123        /* remove in-line comments */
124        limit=uprv_strchr(start, '#');
125        if(limit!=NULL) {
126            /* get white space before the pound sign */
127            while(limit>start && U_IS_INV_WHITESPACE(*(limit-1))) {
128                --limit;
129            }
130
131            /* truncate the line */
132            *limit=0;
133        }
134
135        /* skip lines with only whitespace */
136        if(u_skipWhitespace(start)[0]==0) {
137            continue;
138        }
139
140        /* for each field, call the corresponding field function */
141        for(i=0; i<fieldCount; ++i) {
142            /* set the limit pointer of this field */
143            limit=start;
144            while(*limit!=delimiter && *limit!=0) {
145                ++limit;
146            }
147
148            /* set the field start and limit in the fields array */
149            fields[i][0]=start;
150            fields[i][1]=limit;
151
152            /* set start to the beginning of the next field, if any */
153            start=limit;
154            if(*start!=0) {
155                ++start;
156            } else if(i+1<fieldCount) {
157                *pErrorCode=U_PARSE_ERROR;
158                limit=line+length;
159                i=fieldCount;
160                break;
161            }
162        }
163
164        /* error in a field function? */
165        if(U_FAILURE(*pErrorCode)) {
166            break;
167        }
168
169        /* call the field function */
170        lineFn(context, fields, fieldCount, pErrorCode);
171        if(U_FAILURE(*pErrorCode)) {
172            break;
173        }
174    }
175
176    if(filename!=NULL) {
177        T_FileStream_close(file);
178    }
179}
180
181/*
182 * parse a list of code points
183 * store them as a UTF-32 string in dest[destCapacity]
184 * return the number of code points
185 */
186U_CAPI int32_t U_EXPORT2
187u_parseCodePoints(const char *s,
188                  uint32_t *dest, int32_t destCapacity,
189                  UErrorCode *pErrorCode) {
190    char *end;
191    uint32_t value;
192    int32_t count;
193
194    if(U_FAILURE(*pErrorCode)) {
195        return 0;
196    }
197    if(s==NULL || destCapacity<0 || (destCapacity>0 && dest==NULL)) {
198        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
199        return 0;
200    }
201
202    count=0;
203    for(;;) {
204        s=u_skipWhitespace(s);
205        if(*s==';' || *s==0) {
206            return count;
207        }
208
209        /* read one code point */
210        value=(uint32_t)uprv_strtoul(s, &end, 16);
211        if(end<=s || (!U_IS_INV_WHITESPACE(*end) && *end!=';' && *end!=0) || value>=0x110000) {
212            *pErrorCode=U_PARSE_ERROR;
213            return 0;
214        }
215
216        /* append it to the destination array */
217        if(count<destCapacity) {
218            dest[count++]=value;
219        } else {
220            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
221        }
222
223        /* go to the following characters */
224        s=end;
225    }
226}
227
228/*
229 * parse a list of code points
230 * store them as a string in dest[destCapacity]
231 * set the first code point in *pFirst
232 * @return The length of the string in numbers of UChars.
233 */
234U_CAPI int32_t U_EXPORT2
235u_parseString(const char *s,
236              UChar *dest, int32_t destCapacity,
237              uint32_t *pFirst,
238              UErrorCode *pErrorCode) {
239    char *end;
240    uint32_t value;
241    int32_t destLength;
242
243    if(U_FAILURE(*pErrorCode)) {
244        return 0;
245    }
246    if(s==NULL || destCapacity<0 || (destCapacity>0 && dest==NULL)) {
247        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
248        return 0;
249    }
250
251    if(pFirst!=NULL) {
252        *pFirst=0xffffffff;
253    }
254
255    destLength=0;
256    for(;;) {
257        s=u_skipWhitespace(s);
258        if(*s==';' || *s==0) {
259            if(destLength<destCapacity) {
260                dest[destLength]=0;
261            } else if(destLength==destCapacity) {
262                *pErrorCode=U_STRING_NOT_TERMINATED_WARNING;
263            } else {
264                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
265            }
266            return destLength;
267        }
268
269        /* read one code point */
270        value=(uint32_t)uprv_strtoul(s, &end, 16);
271        if(end<=s || (!U_IS_INV_WHITESPACE(*end) && *end!=';' && *end!=0) || value>=0x110000) {
272            *pErrorCode=U_PARSE_ERROR;
273            return 0;
274        }
275
276        /* store the first code point */
277        if(pFirst!=NULL) {
278            *pFirst=value;
279            pFirst=NULL;
280        }
281
282        /* append it to the destination array */
283        if((destLength+U16_LENGTH(value))<=destCapacity) {
284            U16_APPEND_UNSAFE(dest, destLength, value);
285        } else {
286            destLength+=U16_LENGTH(value);
287        }
288
289        /* go to the following characters */
290        s=end;
291    }
292}
293
294/* read a range like start or start..end */
295U_CAPI int32_t U_EXPORT2
296u_parseCodePointRangeAnyTerminator(const char *s,
297                                   uint32_t *pStart, uint32_t *pEnd,
298                                   const char **terminator,
299                                   UErrorCode *pErrorCode) {
300    char *end;
301    uint32_t value;
302
303    if(U_FAILURE(*pErrorCode)) {
304        return 0;
305    }
306    if(s==NULL || pStart==NULL || pEnd==NULL) {
307        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
308        return 0;
309    }
310
311    /* read the start code point */
312    s=u_skipWhitespace(s);
313    value=(uint32_t)uprv_strtoul(s, &end, 16);
314    if(end<=s || value>=0x110000) {
315        *pErrorCode=U_PARSE_ERROR;
316        return 0;
317    }
318    *pStart=*pEnd=value;
319
320    /* is there a "..end"? */
321    s=u_skipWhitespace(end);
322    if(*s!='.' || s[1]!='.') {
323        *terminator=end;
324        return 1;
325    }
326    s=u_skipWhitespace(s+2);
327
328    /* read the end code point */
329    value=(uint32_t)uprv_strtoul(s, &end, 16);
330    if(end<=s || value>=0x110000) {
331        *pErrorCode=U_PARSE_ERROR;
332        return 0;
333    }
334    *pEnd=value;
335
336    /* is this a valid range? */
337    if(value<*pStart) {
338        *pErrorCode=U_PARSE_ERROR;
339        return 0;
340    }
341
342    *terminator=end;
343    return value-*pStart+1;
344}
345
346U_CAPI int32_t U_EXPORT2
347u_parseCodePointRange(const char *s,
348                      uint32_t *pStart, uint32_t *pEnd,
349                      UErrorCode *pErrorCode) {
350    const char *terminator;
351    int32_t rangeLength=
352        u_parseCodePointRangeAnyTerminator(s, pStart, pEnd, &terminator, pErrorCode);
353    if(U_SUCCESS(*pErrorCode)) {
354        terminator=u_skipWhitespace(terminator);
355        if(*terminator!=';' && *terminator!=0) {
356            *pErrorCode=U_PARSE_ERROR;
357            return 0;
358        }
359    }
360    return rangeLength;
361}
362
363U_CAPI int32_t U_EXPORT2
364u_parseUTF8(const char *source, int32_t sLen, char *dest, int32_t destCapacity, UErrorCode *status) {
365    const char *read = source;
366    int32_t i = 0;
367    unsigned int value = 0;
368    if(sLen == -1) {
369        sLen = (int32_t)strlen(source);
370    }
371
372    while(read < source+sLen) {
373        sscanf(read, "%2x", &value);
374        if(i < destCapacity) {
375            dest[i] = (char)value;
376        }
377        i++;
378        read += 2;
379    }
380    return u_terminateChars(dest, destCapacity, i, status);
381}
382