1b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/* 2b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru******************************************************************************* 3b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* 450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho* Copyright (C) 2000-2010, International Business Machines 5b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Corporation and others. All Rights Reserved. 6b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* 7b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru******************************************************************************* 8b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* file name: uparse.h 9b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* encoding: US-ASCII 10b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* tab size: 8 (not used) 11b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* indentation:4 12b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* 13b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* created on: 2000apr18 14b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* created by: Markus W. Scherer 15b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* 16b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* This file provides a parser for files that are delimited by one single 17b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* character like ';' or TAB. Example: the Unicode Character Properties files 18b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* like UnicodeData.txt are semicolon-delimited. 19b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 20b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 21b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#ifndef __UPARSE_H__ 22b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#define __UPARSE_H__ 23b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 24b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/utypes.h" 25b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2627f654740f2a26ad62a5c155af9199af9e69b889claireho/** 2727f654740f2a26ad62a5c155af9199af9e69b889claireho * Is c an invariant-character whitespace? 2827f654740f2a26ad62a5c155af9199af9e69b889claireho * @param c invariant character 2927f654740f2a26ad62a5c155af9199af9e69b889claireho */ 3027f654740f2a26ad62a5c155af9199af9e69b889claireho#define U_IS_INV_WHITESPACE(c) ((c)==' ' || (c)=='\t' || (c)=='\r' || (c)=='\n') 3127f654740f2a26ad62a5c155af9199af9e69b889claireho 32b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_CDECL_BEGIN 33b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 34b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 35b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Skip space ' ' and TAB '\t' characters. 36b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 37b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @param s Pointer to characters. 38b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @return Pointer to first character at or after s that is not a space or TAB. 39b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 40b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_CAPI const char * U_EXPORT2 41b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruu_skipWhitespace(const char *s); 42b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho/** 4450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * Trim whitespace (including line endings) from the end of the string. 4550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * 4650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * @param s Pointer to the string. 4750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * @return Pointer to the new end of the string. 4850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho */ 4950294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoU_CAPI char * U_EXPORT2 5050294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehou_rtrim(char *s); 5150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 52b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** Function type for u_parseDelimitedFile(). */ 53b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querutypedef void U_CALLCONV 54b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUParseLineFn(void *context, 55b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru char *fields[][2], 56b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t fieldCount, 57b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *pErrorCode); 58b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 59b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 60b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Parser for files that are similar to UnicodeData.txt: 61b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * This function opens the file and reads it line by line. It skips empty lines 62b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * and comment lines that start with a '#'. 63b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * All other lines are separated into fields with one delimiter character 64b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * (semicolon for Unicode Properties files) between two fields. The last field in 65b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * a line does not need to be terminated with a delimiter. 66b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 67b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * For each line, after segmenting it, a line function is called. 68b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * It gets passed the array of field start and limit pointers that is 69b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * passed into this parser and filled by it for each line. 70b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * For each field i of the line, the start pointer in fields[i][0] 71b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * points to the beginning of the field, while the limit pointer in fields[i][1] 72b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * points behind the field, i.e., to the delimiter or the line end. 73b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 74b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * The context parameter of the line function is 75b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * the same as the one for the parse function. 76b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 77b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * The line function may modify the contents of the fields including the 78b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * limit characters. 79b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 80b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * If the file cannot be opened, or there is a parsing error or a field function 81b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * sets *pErrorCode, then the parser returns with *pErrorCode set to an error code. 82b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 83b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_CAPI void U_EXPORT2 84b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruu_parseDelimitedFile(const char *filename, char delimiter, 85b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru char *fields[][2], int32_t fieldCount, 86b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UParseLineFn *lineFn, void *context, 87b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *pErrorCode); 88b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 89b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 90b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Parse a string of code points like 0061 0308 0300. 91b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * s must end with either ';' or NUL. 92b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 93b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @return Number of code points. 94b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 95b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_CAPI int32_t U_EXPORT2 96b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruu_parseCodePoints(const char *s, 97b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uint32_t *dest, int32_t destCapacity, 98b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *pErrorCode); 99b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 100b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 101b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Parse a list of code points like 0061 0308 0300 102b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * into a UChar * string. 103b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * s must end with either ';' or NUL. 104b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 105b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Set the first code point in *pFirst. 106b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 107b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @param s Input char * string. 108b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @param dest Output string buffer. 109b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @param destCapacity Capacity of dest in numbers of UChars. 110b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @param pFirst If pFirst!=NULL the *pFirst will be set to the first 111b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * code point in the string. 112b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @param pErrorCode ICU error code. 113b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @return The length of the string in numbers of UChars. 114b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 115b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_CAPI int32_t U_EXPORT2 116b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruu_parseString(const char *s, 117b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar *dest, int32_t destCapacity, 118b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uint32_t *pFirst, 119b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *pErrorCode); 120b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 121b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 122b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Parse a code point range like 123b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 0085 or 124b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 4E00..9FA5. 125b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 126b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * s must contain such a range and end with either ';' or NUL. 127b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 128b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @return Length of code point range, end-start+1 129b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 130b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_CAPI int32_t U_EXPORT2 131b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruu_parseCodePointRange(const char *s, 132b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uint32_t *pStart, uint32_t *pEnd, 133b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *pErrorCode); 134b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 13550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho/** 13650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * Same as u_parseCodePointRange() but the range may be terminated by 13750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * any character. The position of the terminating character is returned via 13850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * the *terminator output parameter. 13950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho */ 14050294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoU_CAPI int32_t U_EXPORT2 14150294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehou_parseCodePointRangeAnyTerminator(const char *s, 14250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uint32_t *pStart, uint32_t *pEnd, 14350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const char **terminator, 14450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UErrorCode *pErrorCode); 145b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 146b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_CAPI int32_t U_EXPORT2 147b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruu_parseUTF8(const char *source, int32_t sLen, char *dest, int32_t destCapacity, UErrorCode *status); 148b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 149b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_CDECL_END 150b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 151b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif 152