1// © 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html 3/* 4******************************************************************************* 5* 6* Copyright (C) 2009-2014, International Business Machines 7* Corporation and others. All Rights Reserved. 8* 9******************************************************************************* 10* file name: gennorm2.cpp 11* encoding: UTF-8 12* tab size: 8 (not used) 13* indentation:4 14* 15* created on: 2009nov25 16* created by: Markus W. Scherer 17* 18* This program reads text files that define Unicode normalization, 19* parses them, and builds a binary data file. 20*/ 21 22#include "unicode/utypes.h" 23#include "n2builder.h" 24 25#include <fstream> 26#include <stdio.h> 27#include <stdlib.h> 28#include <string> 29#include <string.h> 30#include "unicode/errorcode.h" 31#include "unicode/localpointer.h" 32#include "unicode/putil.h" 33#include "unicode/uchar.h" 34#include "unicode/unistr.h" 35#include "charstr.h" 36#include "normalizer2impl.h" 37#include "toolutil.h" 38#include "uoptions.h" 39#include "uparse.h" 40 41#if UCONFIG_NO_NORMALIZATION 42#include "unewdata.h" 43#endif 44 45U_NAMESPACE_BEGIN 46 47UBool beVerbose=FALSE, haveCopyright=TRUE; 48 49#if !UCONFIG_NO_NORMALIZATION 50void parseFile(std::ifstream &f, Normalizer2DataBuilder &builder); 51#endif 52 53/* -------------------------------------------------------------------------- */ 54 55enum { 56 HELP_H, 57 HELP_QUESTION_MARK, 58 VERBOSE, 59 COPYRIGHT, 60 SOURCEDIR, 61 OUTPUT_FILENAME, 62 UNICODE_VERSION, 63 WRITE_C_SOURCE, 64 WRITE_COMBINED_DATA, 65 OPT_FAST 66}; 67 68static UOption options[]={ 69 UOPTION_HELP_H, 70 UOPTION_HELP_QUESTION_MARK, 71 UOPTION_VERBOSE, 72 UOPTION_COPYRIGHT, 73 UOPTION_SOURCEDIR, 74 UOPTION_DEF("output", 'o', UOPT_REQUIRES_ARG), 75 UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG), 76 UOPTION_DEF("csource", '\1', UOPT_NO_ARG), 77 UOPTION_DEF("combined", '\1', UOPT_NO_ARG), 78 UOPTION_DEF("fast", '\1', UOPT_NO_ARG) 79}; 80 81extern "C" int 82main(int argc, char* argv[]) { 83 U_MAIN_INIT_ARGS(argc, argv); 84 85 /* preset then read command line options */ 86 options[SOURCEDIR].value=""; 87 argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[HELP_H]), options); 88 89 /* error handling, printing usage message */ 90 if(argc<0) { 91 fprintf(stderr, 92 "error in command line argument \"%s\"\n", 93 argv[-argc]); 94 } 95 if(!options[OUTPUT_FILENAME].doesOccur) { 96 argc=-1; 97 } 98 if( argc<2 || 99 options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur 100 ) { 101 fprintf(stderr, 102 "Usage: %s [-options] infiles+ -o outputfilename\n" 103 "\n" 104 "Reads the infiles with normalization data and\n" 105 "creates a binary file, or a C source file (--csource), with the data,\n" 106 "or writes a data file with the combined data (--combined).\n" 107 "See http://userguide.icu-project.org/transforms/normalization#TOC-Data-File-Syntax\n" 108 "\n" 109 "Alternate usage: %s [-options] a.txt b.txt minus p.txt q.txt -o outputfilename\n" 110 "\n" 111 "Computes the difference of (a, b) minus (p, q) and writes the diff data\n" 112 "in input-file syntax to the outputfilename.\n" 113 "It is then possible to build (p, q, diff) to get the same data as (a, b).\n" 114 "(Useful for computing minimal incremental mapping data files.)\n" 115 "\n", 116 argv[0], argv[0]); 117 fprintf(stderr, 118 "Options:\n" 119 "\t-h or -? or --help this usage text\n" 120 "\t-v or --verbose verbose output\n" 121 "\t-c or --copyright include a copyright notice\n" 122 "\t-u or --unicode Unicode version, followed by the version like 5.2.0\n"); 123 fprintf(stderr, 124 "\t-s or --sourcedir source directory, followed by the path\n" 125 "\t-o or --output output filename\n" 126 "\t --csource writes a C source file with initializers\n" 127 "\t --combined writes a .txt file (input-file syntax) with the\n" 128 "\t combined data from all of the input files\n"); 129 fprintf(stderr, 130 "\t --fast optimize the data for fast normalization,\n" 131 "\t which might increase its size (Writes fully decomposed\n" 132 "\t regular mappings instead of delta mappings.\n" 133 "\t You should measure the runtime speed to make sure that\n" 134 "\t this is a good trade-off.)\n"); 135 return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; 136 } 137 138 beVerbose=options[VERBOSE].doesOccur; 139 haveCopyright=options[COPYRIGHT].doesOccur; 140 141 IcuToolErrorCode errorCode("gennorm2/main()"); 142 143#if UCONFIG_NO_NORMALIZATION 144 145 fprintf(stderr, 146 "gennorm2 writes a dummy binary data file " 147 "because UCONFIG_NO_NORMALIZATION is set, \n" 148 "see icu/source/common/unicode/uconfig.h\n"); 149 udata_createDummy(NULL, NULL, options[OUTPUT_FILENAME].value, errorCode); 150 // Should not return an error since this is the expected behaviour if UCONFIG_NO_NORMALIZATION is on. 151 // return U_UNSUPPORTED_ERROR; 152 return 0; 153 154#else 155 156 LocalPointer<Normalizer2DataBuilder> b1(new Normalizer2DataBuilder(errorCode), errorCode); 157 LocalPointer<Normalizer2DataBuilder> b2; 158 LocalPointer<Normalizer2DataBuilder> diff; 159 Normalizer2DataBuilder *builder = b1.getAlias(); 160 errorCode.assertSuccess(); 161 162 if(options[UNICODE_VERSION].doesOccur) { 163 builder->setUnicodeVersion(options[UNICODE_VERSION].value); 164 } 165 166 if(options[OPT_FAST].doesOccur) { 167 builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST); 168 } 169 170 // prepare the filename beginning with the source dir 171 CharString filename(options[SOURCEDIR].value, errorCode); 172 int32_t pathLength=filename.length(); 173 if( pathLength>0 && 174 filename[pathLength-1]!=U_FILE_SEP_CHAR && 175 filename[pathLength-1]!=U_FILE_ALT_SEP_CHAR 176 ) { 177 filename.append(U_FILE_SEP_CHAR, errorCode); 178 pathLength=filename.length(); 179 } 180 181 bool doMinus = false; 182 for(int i=1; i<argc; ++i) { 183 printf("gennorm2: processing %s\n", argv[i]); 184 if(strcmp(argv[i], "minus") == 0) { 185 if(doMinus) { 186 fprintf(stderr, "gennorm2 error: only one 'minus' can be specified\n"); 187 exit(U_ILLEGAL_ARGUMENT_ERROR); 188 } 189 // Data from previous input files has been collected in b1. 190 // Collect data from further input files in b2. 191 b2.adoptInsteadAndCheckErrorCode(new Normalizer2DataBuilder(errorCode), errorCode); 192 diff.adoptInsteadAndCheckErrorCode(new Normalizer2DataBuilder(errorCode), errorCode); 193 errorCode.assertSuccess(); 194 builder = b2.getAlias(); 195 if(options[UNICODE_VERSION].doesOccur) { 196 builder->setUnicodeVersion(options[UNICODE_VERSION].value); 197 } 198 if(options[OPT_FAST].doesOccur) { 199 builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST); 200 } 201 doMinus = true; 202 continue; 203 } 204 filename.append(argv[i], errorCode); 205 std::ifstream f(filename.data()); 206 if(f.fail()) { 207 fprintf(stderr, "gennorm2 error: unable to open %s\n", filename.data()); 208 exit(U_FILE_ACCESS_ERROR); 209 } 210 builder->setOverrideHandling(Normalizer2DataBuilder::OVERRIDE_PREVIOUS); 211 parseFile(f, *builder); 212 filename.truncate(pathLength); 213 } 214 215 if(doMinus) { 216 Normalizer2DataBuilder::computeDiff(*b1, *b2, *diff); 217 diff->writeDataFile(options[OUTPUT_FILENAME].value, /* writeRemoved= */ true); 218 } else if(options[WRITE_COMBINED_DATA].doesOccur) { 219 builder->writeDataFile(options[OUTPUT_FILENAME].value, /* writeRemoved= */ false); 220 } else if(options[WRITE_C_SOURCE].doesOccur) { 221 builder->writeCSourceFile(options[OUTPUT_FILENAME].value); 222 } else { 223 builder->writeBinaryFile(options[OUTPUT_FILENAME].value); 224 } 225 226 return errorCode.get(); 227 228#endif 229} 230 231#if !UCONFIG_NO_NORMALIZATION 232 233void parseFile(std::ifstream &f, Normalizer2DataBuilder &builder) { 234 IcuToolErrorCode errorCode("gennorm2/parseFile()"); 235 std::string lineString; 236 uint32_t startCP, endCP; 237 while(std::getline(f, lineString)) { 238 if (lineString.empty()) { 239 continue; // skip empty lines. 240 } 241#if (U_CPLUSPLUS_VERSION >= 11) 242 char *line = &lineString.front(); 243#else 244 char *line = &lineString.at(0); 245#endif 246 char *comment=(char *)strchr(line, '#'); 247 if(comment!=NULL) { 248 *comment=0; 249 } 250 u_rtrim(line); 251 if(line[0]==0) { 252 continue; // skip empty and comment-only lines 253 } 254 if(line[0]=='*') { 255 const char *s=u_skipWhitespace(line+1); 256 if(0==strncmp(s, "Unicode", 7)) { 257 s=u_skipWhitespace(s+7); 258 builder.setUnicodeVersion(s); 259 } 260 continue; // reserved syntax 261 } 262 const char *delimiter; 263 int32_t rangeLength= 264 u_parseCodePointRangeAnyTerminator(line, &startCP, &endCP, &delimiter, errorCode); 265 if(errorCode.isFailure()) { 266 fprintf(stderr, "gennorm2 error: parsing code point range from %s\n", line); 267 exit(errorCode.reset()); 268 } 269 delimiter=u_skipWhitespace(delimiter); 270 if(*delimiter==':') { 271 const char *s=u_skipWhitespace(delimiter+1); 272 char *end; 273 unsigned long value=strtoul(s, &end, 10); 274 if(end<=s || *u_skipWhitespace(end)!=0 || value>=0xff) { 275 fprintf(stderr, "gennorm2 error: parsing ccc from %s\n", line); 276 exit(U_PARSE_ERROR); 277 } 278 for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) { 279 builder.setCC(c, (uint8_t)value); 280 } 281 continue; 282 } 283 if(*delimiter=='-') { 284 if(*u_skipWhitespace(delimiter+1)!=0) { 285 fprintf(stderr, "gennorm2 error: parsing remove-mapping %s\n", line); 286 exit(U_PARSE_ERROR); 287 } 288 for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) { 289 builder.removeMapping(c); 290 } 291 continue; 292 } 293 if(*delimiter=='=' || *delimiter=='>') { 294 UChar uchars[Normalizer2Impl::MAPPING_LENGTH_MASK]; 295 int32_t length=u_parseString(delimiter+1, uchars, UPRV_LENGTHOF(uchars), NULL, errorCode); 296 if(errorCode.isFailure()) { 297 fprintf(stderr, "gennorm2 error: parsing mapping string from %s\n", line); 298 exit(errorCode.reset()); 299 } 300 UnicodeString mapping(FALSE, uchars, length); 301 if(*delimiter=='=') { 302 if(rangeLength!=1) { 303 fprintf(stderr, 304 "gennorm2 error: round-trip mapping for more than 1 code point on %s\n", 305 line); 306 exit(U_PARSE_ERROR); 307 } 308 builder.setRoundTripMapping((UChar32)startCP, mapping); 309 } else { 310 for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) { 311 builder.setOneWayMapping(c, mapping); 312 } 313 } 314 continue; 315 } 316 fprintf(stderr, "gennorm2 error: unrecognized data line %s\n", line); 317 exit(U_PARSE_ERROR); 318 } 319} 320 321#endif // !UCONFIG_NO_NORMALIZATION 322 323U_NAMESPACE_END 324 325/* 326 * Hey, Emacs, please set the following: 327 * 328 * Local Variables: 329 * indent-tabs-mode: nil 330 * End: 331 * 332 */ 333