/* uparse.c revision 51cfa1a9a96cad34675a6415fe86dfdf3f525bb6 */
1/* 2******************************************************************************* 3* 4* Copyright (C) 2000-2007, International Business Machines 5* Corporation and others. All Rights Reserved. 6* 7******************************************************************************* 8* file name: uparse.c 9* encoding: US-ASCII 10* tab size: 8 (not used) 11* indentation:4 12* 13* created on: 2000apr18 14* created by: Markus W. Scherer 15* 16* This file provides a parser for files that are delimited by one single 17* character like ';' or TAB. Example: the Unicode Character Properties files 18* like UnicodeData.txt are semicolon-delimited. 19*/ 20 21#include "unicode/utypes.h" 22#include "cstring.h" 23#include "filestrm.h" 24#include "uparse.h" 25#include "unicode/uchar.h" 26#include "unicode/ustring.h" 27#include "ustr_imp.h" 28 29#include <stdio.h> 30 31U_CAPI const char * U_EXPORT2 32u_skipWhitespace(const char *s) { 33 while(*s==' ' || *s=='\t') { 34 ++s; 35 } 36 return s; 37} 38 39/* 40 * If the string starts with # @missing: then return the pointer to the 41 * following non-whitespace character. 42 * Otherwise return the original pointer. 43 * Unicode 5.0 adds such lines in some data files to document 44 * default property values. 45 * Poor man's regex for variable amounts of white space. 
46 */ 47static const char * 48getMissingLimit(const char *s) { 49 const char *s0=s; 50 if( 51 *(s=u_skipWhitespace(s))=='#' && 52 *(s=u_skipWhitespace(s+1))=='@' && 53 0==strncmp((s=u_skipWhitespace(s+1)), "missing", 7) && 54 *(s=u_skipWhitespace(s+7))==':' 55 ) { 56 return u_skipWhitespace(s+1); 57 } else { 58 return s0; 59 } 60} 61 62U_CAPI void U_EXPORT2 63u_parseDelimitedFile(const char *filename, char delimiter, 64 char *fields[][2], int32_t fieldCount, 65 UParseLineFn *lineFn, void *context, 66 UErrorCode *pErrorCode) { 67 FileStream *file; 68 char line[300]; 69 char *start, *limit; 70 int32_t i, length; 71 72 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 73 return; 74 } 75 76 if(fields==NULL || lineFn==NULL || fieldCount<=0) { 77 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 78 return; 79 } 80 81 if(filename==NULL || *filename==0 || (*filename=='-' && filename[1]==0)) { 82 filename=NULL; 83 file=T_FileStream_stdin(); 84 } else { 85 file=T_FileStream_open(filename, "r"); 86 } 87 if(file==NULL) { 88 *pErrorCode=U_FILE_ACCESS_ERROR; 89 return; 90 } 91 92 while(T_FileStream_readLine(file, line, sizeof(line))!=NULL) { 93 length=(int32_t)uprv_strlen(line); 94 95 /* remove trailing newline characters */ 96 while(length>0 && (line[length-1]=='\r' || line[length-1]=='\n')) { 97 line[--length]=0; 98 } 99 100 /* 101 * detect a line with # @missing: 102 * start parsing after that, or else from the beginning of the line 103 * set the default warning for @missing lines 104 */ 105 start=(char *)getMissingLimit(line); 106 if(start==line) { 107 *pErrorCode=U_ZERO_ERROR; 108 } else { 109 *pErrorCode=U_USING_DEFAULT_WARNING; 110 } 111 112 /* skip this line if it is empty or a comment */ 113 if(*start==0 || *start=='#') { 114 continue; 115 } 116 117 /* remove in-line comments */ 118 limit=uprv_strchr(start, '#'); 119 if(limit!=NULL) { 120 /* get white space before the pound sign */ 121 while(limit>start && (*(limit-1)==' ' || *(limit-1)=='\t')) { 122 --limit; 123 } 124 125 /* 
truncate the line */ 126 *limit=0; 127 } 128 129 /* skip lines with only whitespace */ 130 if(u_skipWhitespace(start)[0]==0) { 131 continue; 132 } 133 134 /* for each field, call the corresponding field function */ 135 for(i=0; i<fieldCount; ++i) { 136 /* set the limit pointer of this field */ 137 limit=start; 138 while(*limit!=delimiter && *limit!=0) { 139 ++limit; 140 } 141 142 /* set the field start and limit in the fields array */ 143 fields[i][0]=start; 144 fields[i][1]=limit; 145 146 /* set start to the beginning of the next field, if any */ 147 start=limit; 148 if(*start!=0) { 149 ++start; 150 } else if(i+1<fieldCount) { 151 *pErrorCode=U_PARSE_ERROR; 152 limit=line+length; 153 i=fieldCount; 154 break; 155 } 156 } 157 158 /* error in a field function? */ 159 if(U_FAILURE(*pErrorCode)) { 160 break; 161 } 162 163 /* call the field function */ 164 lineFn(context, fields, fieldCount, pErrorCode); 165 if(U_FAILURE(*pErrorCode)) { 166 break; 167 } 168 } 169 170 if(filename!=NULL) { 171 T_FileStream_close(file); 172 } 173} 174 175/* 176 * parse a list of code points 177 * store them as a UTF-32 string in dest[destCapacity] 178 * return the number of code points 179 */ 180U_CAPI int32_t U_EXPORT2 181u_parseCodePoints(const char *s, 182 uint32_t *dest, int32_t destCapacity, 183 UErrorCode *pErrorCode) { 184 char *end; 185 uint32_t value; 186 int32_t count; 187 188 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 189 return 0; 190 } 191 if(s==NULL || destCapacity<0 || (destCapacity>0 && dest==NULL)) { 192 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 193 return 0; 194 } 195 196 count=0; 197 for(;;) { 198 s=u_skipWhitespace(s); 199 if(*s==';' || *s==0) { 200 return count; 201 } 202 203 /* read one code point */ 204 value=(uint32_t)uprv_strtoul(s, &end, 16); 205 if(end<=s || (*end!=' ' && *end!='\t' && *end!=';' && *end!=0) || value>=0x110000) { 206 *pErrorCode=U_PARSE_ERROR; 207 return 0; 208 } 209 210 /* append it to the destination array */ 211 if(count<destCapacity) { 212 
dest[count++]=value; 213 } else { 214 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 215 } 216 217 /* go to the following characters */ 218 s=end; 219 } 220} 221 222/* 223 * parse a list of code points 224 * store them as a string in dest[destCapacity] 225 * set the first code point in *pFirst 226 * @return The length of the string in numbers of UChars. 227 */ 228U_CAPI int32_t U_EXPORT2 229u_parseString(const char *s, 230 UChar *dest, int32_t destCapacity, 231 uint32_t *pFirst, 232 UErrorCode *pErrorCode) { 233 char *end; 234 uint32_t value; 235 int32_t destLength; 236 237 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 238 return 0; 239 } 240 if(s==NULL || destCapacity<0 || (destCapacity>0 && dest==NULL)) { 241 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 242 } 243 244 if(pFirst!=NULL) { 245 *pFirst=0xffffffff; 246 } 247 248 destLength=0; 249 for(;;) { 250 s=u_skipWhitespace(s); 251 if(*s==';' || *s==0) { 252 if(destLength<destCapacity) { 253 dest[destLength]=0; 254 } else if(destLength==destCapacity) { 255 *pErrorCode=U_STRING_NOT_TERMINATED_WARNING; 256 } else { 257 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 258 } 259 return destLength; 260 } 261 262 /* read one code point */ 263 value=(uint32_t)uprv_strtoul(s, &end, 16); 264 if(end<=s || (*end!=' ' && *end!='\t' && *end!=';' && *end!=0) || value>=0x110000) { 265 *pErrorCode=U_PARSE_ERROR; 266 return 0; 267 } 268 269 /* store the first code point */ 270 if(destLength==0 && pFirst!=NULL) { 271 *pFirst=value; 272 } 273 274 /* append it to the destination array */ 275 if((destLength+UTF_CHAR_LENGTH(value))<=destCapacity) { 276 UTF_APPEND_CHAR_UNSAFE(dest, destLength, value); 277 } else { 278 destLength+=UTF_CHAR_LENGTH(value); 279 } 280 281 /* go to the following characters */ 282 s=end; 283 } 284} 285 286/* read a range like start or start..end */ 287U_CAPI int32_t U_EXPORT2 288u_parseCodePointRange(const char *s, 289 uint32_t *pStart, uint32_t *pEnd, 290 UErrorCode *pErrorCode) { 291 char *end; 292 uint32_t value; 293 294 
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 295 return 0; 296 } 297 if(s==NULL || pStart==NULL || pEnd==NULL) { 298 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 299 return 0; 300 } 301 302 s=u_skipWhitespace(s); 303 if(*s==';' || *s==0) { 304 *pErrorCode=U_PARSE_ERROR; 305 return 0; 306 } 307 308 /* read the start code point */ 309 value=(uint32_t)uprv_strtoul(s, &end, 16); 310 if(end<=s || (*end!=' ' && *end!='\t' && *end!='.' && *end!=';') || value>=0x110000) { 311 *pErrorCode=U_PARSE_ERROR; 312 return 0; 313 } 314 *pStart=*pEnd=value; 315 316 /* is there a "..end"? */ 317 s=u_skipWhitespace(end); 318 if(*s==';' || *s==0) { 319 return 1; 320 } 321 322 if(*s!='.' || s[1]!='.') { 323 *pErrorCode=U_PARSE_ERROR; 324 return 0; 325 } 326 s+=2; 327 328 /* read the end code point */ 329 value=(uint32_t)uprv_strtoul(s, &end, 16); 330 if(end<=s || (*end!=' ' && *end!='\t' && *end!=';') || value>=0x110000) { 331 *pErrorCode=U_PARSE_ERROR; 332 return 0; 333 } 334 *pEnd=value; 335 336 /* is this a valid range? */ 337 if(value<*pStart) { 338 *pErrorCode=U_PARSE_ERROR; 339 return 0; 340 } 341 342 /* no garbage after that? */ 343 s=u_skipWhitespace(end); 344 if(*s==';' || *s==0) { 345 return value-*pStart+1; 346 } else { 347 *pErrorCode=U_PARSE_ERROR; 348 return 0; 349 } 350} 351 352U_CAPI int32_t U_EXPORT2 353u_parseUTF8(const char *source, int32_t sLen, char *dest, int32_t destCapacity, UErrorCode *status) { 354 const char *read = source; 355 int32_t i = 0; 356 unsigned int value = 0; 357 if(sLen == -1) { 358 sLen = (int32_t)strlen(source); 359 } 360 361 while(read < source+sLen) { 362 sscanf(read, "%2x", &value); 363 if(i < destCapacity) { 364 dest[i] = (char)value; 365 } 366 i++; 367 read += 2; 368 } 369 return u_terminateChars(dest, destCapacity, i, status); 370} 371