1/* 2******************************************************************************* 3* 4* Copyright (C) 2009-2014, International Business Machines 5* Corporation and others. All Rights Reserved. 6* 7******************************************************************************* 8* file name: gennorm2.cpp 9* encoding: US-ASCII 10* tab size: 8 (not used) 11* indentation:4 12* 13* created on: 2009nov25 14* created by: Markus W. Scherer 15* 16* This program reads text files that define Unicode normalization, 17* parses them, and builds a binary data file. 18*/ 19 20#include "unicode/utypes.h" 21#include "n2builder.h" 22 23#include <stdio.h> 24#include <stdlib.h> 25#include <string.h> 26#include "unicode/errorcode.h" 27#include "unicode/localpointer.h" 28#include "unicode/putil.h" 29#include "unicode/uchar.h" 30#include "unicode/unistr.h" 31#include "charstr.h" 32#include "normalizer2impl.h" 33#include "toolutil.h" 34#include "uoptions.h" 35#include "uparse.h" 36 37#if UCONFIG_NO_NORMALIZATION 38#include "unewdata.h" 39#endif 40 41U_NAMESPACE_BEGIN 42 43UBool beVerbose=FALSE, haveCopyright=TRUE; 44 45U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose); 46 47#if !UCONFIG_NO_NORMALIZATION 48void parseFile(FILE *f, Normalizer2DataBuilder &builder); 49#endif 50 51/* -------------------------------------------------------------------------- */ 52 53enum { 54 HELP_H, 55 HELP_QUESTION_MARK, 56 VERBOSE, 57 COPYRIGHT, 58 SOURCEDIR, 59 OUTPUT_FILENAME, 60 UNICODE_VERSION, 61 WRITE_C_SOURCE, 62 OPT_FAST 63}; 64 65static UOption options[]={ 66 UOPTION_HELP_H, 67 UOPTION_HELP_QUESTION_MARK, 68 UOPTION_VERBOSE, 69 UOPTION_COPYRIGHT, 70 UOPTION_SOURCEDIR, 71 UOPTION_DEF("output", 'o', UOPT_REQUIRES_ARG), 72 UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG), 73 UOPTION_DEF("csource", '\1', UOPT_NO_ARG), 74 UOPTION_DEF("fast", '\1', UOPT_NO_ARG) 75}; 76 77extern "C" int 78main(int argc, char* argv[]) { 79 U_MAIN_INIT_ARGS(argc, argv); 80 81 /* preset then read command line options */ 82 options[SOURCEDIR].value=""; 83 argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[HELP_H]), options); 84 85 /* error handling, printing usage message */ 86 if(argc<0) { 87 fprintf(stderr, 88 "error in command line argument \"%s\"\n", 89 argv[-argc]); 90 } 91 if(!options[OUTPUT_FILENAME].doesOccur) { 92 argc=-1; 93 } 94 if( argc<2 || 95 options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur 96 ) { 97 /* 98 * Broken into chunks because the C89 standard says the minimum 99 * required supported string length is 509 bytes. 100 */ 101 fprintf(stderr, 102 "Usage: %s [-options] infiles+ -o outputfilename\n" 103 "\n" 104 "Reads the infiles with normalization data and\n" 105 "creates a binary or C source file (outputfilename) with the data.\n" 106 "\n", 107 argv[0]); 108 fprintf(stderr, 109 "Options:\n" 110 "\t-h or -? or --help this usage text\n" 111 "\t-v or --verbose verbose output\n" 112 "\t-c or --copyright include a copyright notice\n" 113 "\t-u or --unicode Unicode version, followed by the version like 5.2.0\n"); 114 fprintf(stderr, 115 "\t-s or --sourcedir source directory, followed by the path\n" 116 "\t-o or --output output filename\n" 117 "\t --csource writes a C source file with initializers\n"); 118 fprintf(stderr, 119 "\t --fast optimize the data for fast normalization,\n" 120 "\t which might increase its size (Writes fully decomposed\n" 121 "\t regular mappings instead of delta mappings.\n" 122 "\t You should measure the runtime speed to make sure that\n" 123 "\t this is a good trade-off.)\n"); 124 return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; 125 } 126 127 beVerbose=options[VERBOSE].doesOccur; 128 haveCopyright=options[COPYRIGHT].doesOccur; 129 130 IcuToolErrorCode errorCode("gennorm2/main()"); 131 132#if UCONFIG_NO_NORMALIZATION 133 134 fprintf(stderr, 135 "gennorm2 writes a dummy binary data file " 136 "because UCONFIG_NO_NORMALIZATION is set, \n" 137 "see icu/source/common/unicode/uconfig.h\n"); 138 udata_createDummy(NULL, NULL, options[OUTPUT_FILENAME].value, errorCode); 139 // Should not return an error since this is the expected behaviour if UCONFIG_NO_NORMALIZATION is on. 140 // return U_UNSUPPORTED_ERROR; 141 return 0; 142 143#else 144 145 LocalPointer<Normalizer2DataBuilder> builder(new Normalizer2DataBuilder(errorCode), errorCode); 146 errorCode.assertSuccess(); 147 148 if(options[UNICODE_VERSION].doesOccur) { 149 builder->setUnicodeVersion(options[UNICODE_VERSION].value); 150 } 151 152 if(options[OPT_FAST].doesOccur) { 153 builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST); 154 } 155 156 // prepare the filename beginning with the source dir 157 CharString filename(options[SOURCEDIR].value, errorCode); 158 int32_t pathLength=filename.length(); 159 if( pathLength>0 && 160 filename[pathLength-1]!=U_FILE_SEP_CHAR && 161 filename[pathLength-1]!=U_FILE_ALT_SEP_CHAR 162 ) { 163 filename.append(U_FILE_SEP_CHAR, errorCode); 164 pathLength=filename.length(); 165 } 166 167 for(int i=1; i<argc; ++i) { 168 printf("gennorm2: processing %s\n", argv[i]); 169 filename.append(argv[i], errorCode); 170 LocalStdioFilePointer f(fopen(filename.data(), "r")); 171 if(f==NULL) { 172 fprintf(stderr, "gennorm2 error: unable to open %s\n", filename.data()); 173 exit(U_FILE_ACCESS_ERROR); 174 } 175 builder->setOverrideHandling(Normalizer2DataBuilder::OVERRIDE_PREVIOUS); 176 parseFile(f.getAlias(), *builder); 177 filename.truncate(pathLength); 178 } 179 180 if(options[WRITE_C_SOURCE].doesOccur) { 181 builder->writeCSourceFile(options[OUTPUT_FILENAME].value); 182 } else { 183 builder->writeBinaryFile(options[OUTPUT_FILENAME].value); 184 } 185 186 return errorCode.get(); 187 188#endif 189} 190 191#if !UCONFIG_NO_NORMALIZATION 192 193void parseFile(FILE *f, Normalizer2DataBuilder &builder) { 194 IcuToolErrorCode errorCode("gennorm2/parseFile()"); 195 char line[300]; 196 uint32_t startCP, endCP; 197 while(NULL!=fgets(line, (int)sizeof(line), f)) { 198 char *comment=(char *)strchr(line, '#'); 199 if(comment!=NULL) { 200 *comment=0; 201 } 202 u_rtrim(line); 203 if(line[0]==0) { 204 continue; // skip empty and comment-only lines 205 } 206 if(line[0]=='*') { 207 const char *s=u_skipWhitespace(line+1); 208 if(0==strncmp(s, "Unicode", 7)) { 209 s=u_skipWhitespace(s+7); 210 builder.setUnicodeVersion(s); 211 } 212 continue; // reserved syntax 213 } 214 const char *delimiter; 215 int32_t rangeLength= 216 u_parseCodePointRangeAnyTerminator(line, &startCP, &endCP, &delimiter, errorCode); 217 if(errorCode.isFailure()) { 218 fprintf(stderr, "gennorm2 error: parsing code point range from %s\n", line); 219 exit(errorCode.reset()); 220 } 221 delimiter=u_skipWhitespace(delimiter); 222 if(*delimiter==':') { 223 const char *s=u_skipWhitespace(delimiter+1); 224 char *end; 225 unsigned long value=strtoul(s, &end, 10); 226 if(end<=s || *u_skipWhitespace(end)!=0 || value>=0xff) { 227 fprintf(stderr, "gennorm2 error: parsing ccc from %s\n", line); 228 exit(U_PARSE_ERROR); 229 } 230 for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) { 231 builder.setCC(c, (uint8_t)value); 232 } 233 continue; 234 } 235 if(*delimiter=='-') { 236 if(*u_skipWhitespace(delimiter+1)!=0) { 237 fprintf(stderr, "gennorm2 error: parsing remove-mapping %s\n", line); 238 exit(U_PARSE_ERROR); 239 } 240 for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) { 241 builder.removeMapping(c); 242 } 243 continue; 244 } 245 if(*delimiter=='=' || *delimiter=='>') { 246 UChar uchars[Normalizer2Impl::MAPPING_LENGTH_MASK]; 247 int32_t length=u_parseString(delimiter+1, uchars, UPRV_LENGTHOF(uchars), NULL, errorCode); 248 if(errorCode.isFailure()) { 249 fprintf(stderr, "gennorm2 error: parsing mapping string from %s\n", line); 250 exit(errorCode.reset()); 251 } 252 UnicodeString mapping(FALSE, uchars, length); 253 if(*delimiter=='=') { 254 if(rangeLength!=1) { 255 fprintf(stderr, 256 "gennorm2 error: round-trip mapping for more than 1 code point on %s\n", 257 line); 258 exit(U_PARSE_ERROR); 259 } 260 builder.setRoundTripMapping((UChar32)startCP, mapping); 261 } else { 262 for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) { 263 builder.setOneWayMapping(c, mapping); 264 } 265 } 266 continue; 267 } 268 fprintf(stderr, "gennorm2 error: unrecognized data line %s\n", line); 269 exit(U_PARSE_ERROR); 270 } 271} 272 273#endif // !UCONFIG_NO_NORMALIZATION 274 275U_NAMESPACE_END 276 277/* 278 * Hey, Emacs, please set the following: 279 * 280 * Local Variables: 281 * indent-tabs-mode: nil 282 * End: 283 * 284 */ 285