1/* 2********************************************************************** 3* Copyright (C) 2002-2015, International Business Machines 4* Corporation and others. All Rights Reserved. 5********************************************************************** 6* 7* File genbrk.c 8*/ 9 10//-------------------------------------------------------------------- 11// 12// Tool for generating RuleBasedBreakIterator data files (.brk files). 13// .brk files contain the precompiled rules for standard types 14// of iterators - word, line, sentence, etc. 15// 16// Usage: genbrk [options] -r rule-file.txt -o output-file.brk 17// 18// options: -v verbose 19// -? or -h help 20// 21// The input rule file is a plain text file containing break rules 22// in the input format accepted by RuleBasedBreakIterators. The 23// file can be encoded as utf-8, or utf-16 (either endian), or 24// in the default code page (platform dependent.). utf encoded 25// files must include a BOM. 26// 27//-------------------------------------------------------------------- 28 29#include "unicode/utypes.h" 30#include "unicode/ucnv.h" 31#include "unicode/unistr.h" 32#include "unicode/rbbi.h" 33#include "unicode/uclean.h" 34#include "unicode/udata.h" 35#include "unicode/putil.h" 36 37#include "uoptions.h" 38#include "unewdata.h" 39#include "ucmndata.h" 40#include "rbbidata.h" 41#include "cmemory.h" 42 43#include <stdio.h> 44#include <stdlib.h> 45#include <string.h> 46 47U_NAMESPACE_USE 48 49static char *progName; 50static UOption options[]={ 51 UOPTION_HELP_H, /* 0 */ 52 UOPTION_HELP_QUESTION_MARK, /* 1 */ 53 UOPTION_VERBOSE, /* 2 */ 54 { "rules", NULL, NULL, NULL, 'r', UOPT_REQUIRES_ARG, 0 }, /* 3 */ 55 { "out", NULL, NULL, NULL, 'o', UOPT_REQUIRES_ARG, 0 }, /* 4 */ 56 UOPTION_ICUDATADIR, /* 5 */ 57 UOPTION_DESTDIR, /* 6 */ 58 UOPTION_COPYRIGHT, /* 7 */ 59 UOPTION_QUIET, /* 8 */ 60}; 61 62void usageAndDie(int retCode) { 63 printf("Usage: %s [-v] [-options] -r rule-file -o output-file\n", progName); 64 printf("\tRead in break iteration rules text and write out the binary data\n" 65 "options:\n" 66 "\t-h or -? or --help this usage text\n" 67 "\t-V or --version show a version message\n" 68 "\t-c or --copyright include a copyright notice\n" 69 "\t-v or --verbose turn on verbose output\n" 70 "\t-q or --quiet do not display warnings and progress\n" 71 "\t-i or --icudatadir directory for locating any needed intermediate data files,\n" 72 "\t followed by path, defaults to %s\n" 73 "\t-d or --destdir destination directory, followed by the path\n", 74 u_getDataDirectory()); 75 exit (retCode); 76} 77 78 79#if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO 80 81/* dummy UDataInfo cf. udata.h */ 82static UDataInfo dummyDataInfo = { 83 sizeof(UDataInfo), 84 0, 85 86 U_IS_BIG_ENDIAN, 87 U_CHARSET_FAMILY, 88 U_SIZEOF_UCHAR, 89 0, 90 91 { 0, 0, 0, 0 }, /* dummy dataFormat */ 92 { 0, 0, 0, 0 }, /* dummy formatVersion */ 93 { 0, 0, 0, 0 } /* dummy dataVersion */ 94}; 95 96#else 97 98// 99// Set up the ICU data header, defined in ucmndata.h 100// 101DataHeader dh ={ 102 {sizeof(DataHeader), // Struct MappedData 103 0xda, 104 0x27}, 105 106 { // struct UDataInfo 107 sizeof(UDataInfo), // size 108 0, // reserved 109 U_IS_BIG_ENDIAN, 110 U_CHARSET_FAMILY, 111 U_SIZEOF_UCHAR, 112 0, // reserved 113 114 { 0x42, 0x72, 0x6b, 0x20 }, // dataFormat="Brk " 115 { 0xff, 0, 0, 0 }, // formatVersion. Filled in later with values 116 // from the RBBI rule builder. The values declared 117 // here should never appear in any real RBBI data. 118 { 4, 1, 0, 0 } // dataVersion (Unicode version) 119 }}; 120 121#endif 122 123//---------------------------------------------------------------------------- 124// 125// main for genbrk 126// 127//---------------------------------------------------------------------------- 128int main(int argc, char **argv) { 129 UErrorCode status = U_ZERO_ERROR; 130 const char *ruleFileName; 131 const char *outFileName; 132 const char *outDir = NULL; 133 const char *copyright = NULL; 134 135 // 136 // Pick up and check the command line arguments, 137 // using the standard ICU tool utils option handling. 138 // 139 U_MAIN_INIT_ARGS(argc, argv); 140 progName = argv[0]; 141 argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options); 142 if(argc<0) { 143 // Unrecognized option 144 fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]); 145 usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); 146 } 147 148 if(options[0].doesOccur || options[1].doesOccur) { 149 // -? or -h for help. 150 usageAndDie(0); 151 } 152 153 if (!(options[3].doesOccur && options[4].doesOccur)) { 154 fprintf(stderr, "rule file and output file must both be specified.\n"); 155 usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); 156 } 157 ruleFileName = options[3].value; 158 outFileName = options[4].value; 159 160 if (options[5].doesOccur) { 161 u_setDataDirectory(options[5].value); 162 } 163 164 status = U_ZERO_ERROR; 165 166 /* Combine the directory with the file name */ 167 if(options[6].doesOccur) { 168 outDir = options[6].value; 169 } 170 if (options[7].doesOccur) { 171 copyright = U_COPYRIGHT_STRING; 172 } 173 174#if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO 175 176 UNewDataMemory *pData; 177 char msg[1024]; 178 179 /* write message with just the name */ 180 sprintf(msg, "genbrk writes dummy %s because of UCONFIG_NO_BREAK_ITERATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName); 181 fprintf(stderr, "%s\n", msg); 182 183 /* write the dummy data file */ 184 pData = udata_create(outDir, NULL, outFileName, &dummyDataInfo, NULL, &status); 185 udata_writeBlock(pData, msg, strlen(msg)); 186 udata_finish(pData, &status); 187 return (int)status; 188 189#else 190 /* Initialize ICU */ 191 u_init(&status); 192 if (U_FAILURE(status)) { 193 fprintf(stderr, "%s: can not initialize ICU. status = %s\n", 194 argv[0], u_errorName(status)); 195 exit(1); 196 } 197 status = U_ZERO_ERROR; 198 199 // 200 // Read in the rule source file 201 // 202 long result; 203 long ruleFileSize; 204 FILE *file; 205 char *ruleBufferC; 206 207 file = fopen(ruleFileName, "rb"); 208 if( file == 0 ) { 209 fprintf(stderr, "Could not open file \"%s\"\n", ruleFileName); 210 exit(-1); 211 } 212 fseek(file, 0, SEEK_END); 213 ruleFileSize = ftell(file); 214 fseek(file, 0, SEEK_SET); 215 ruleBufferC = new char[ruleFileSize+10]; 216 217 result = (long)fread(ruleBufferC, 1, ruleFileSize, file); 218 if (result != ruleFileSize) { 219 fprintf(stderr, "Error reading file \"%s\"\n", ruleFileName); 220 exit (-1); 221 } 222 ruleBufferC[ruleFileSize]=0; 223 fclose(file); 224 225 // 226 // Look for a Unicode Signature (BOM) on the rule file 227 // 228 int32_t signatureLength; 229 const char * ruleSourceC = ruleBufferC; 230 const char* encoding = ucnv_detectUnicodeSignature( 231 ruleSourceC, ruleFileSize, &signatureLength, &status); 232 if (U_FAILURE(status)) { 233 exit(status); 234 } 235 if(encoding!=NULL ){ 236 ruleSourceC += signatureLength; 237 ruleFileSize -= signatureLength; 238 } 239 240 // 241 // Open a converter to take the rule file to UTF-16 242 // 243 UConverter* conv; 244 conv = ucnv_open(encoding, &status); 245 if (U_FAILURE(status)) { 246 fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status)); 247 exit(status); 248 } 249 250 // 251 // Convert the rules to UChar. 252 // Preflight first to determine required buffer size. 253 // 254 uint32_t destCap = ucnv_toUChars(conv, 255 NULL, // dest, 256 0, // destCapacity, 257 ruleSourceC, 258 ruleFileSize, 259 &status); 260 if (status != U_BUFFER_OVERFLOW_ERROR) { 261 fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 262 exit(status); 263 }; 264 265 status = U_ZERO_ERROR; 266 UChar *ruleSourceU = new UChar[destCap+1]; 267 ucnv_toUChars(conv, 268 ruleSourceU, // dest, 269 destCap+1, 270 ruleSourceC, 271 ruleFileSize, 272 &status); 273 if (U_FAILURE(status)) { 274 fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 275 exit(status); 276 }; 277 ucnv_close(conv); 278 279 280 // 281 // Put the source rules into a UnicodeString 282 // 283 UnicodeString ruleSourceS(FALSE, ruleSourceU, destCap); 284 285 // 286 // Create the break iterator from the rules 287 // This will compile the rules. 288 // 289 UParseError parseError; 290 parseError.line = 0; 291 parseError.offset = 0; 292 RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(ruleSourceS, parseError, status); 293 if (U_FAILURE(status)) { 294 fprintf(stderr, "createRuleBasedBreakIterator: ICU Error \"%s\" at line %d, column %d\n", 295 u_errorName(status), (int)parseError.line, (int)parseError.offset); 296 exit(status); 297 }; 298 299 300 // 301 // Get the compiled rule data from the break iterator. 302 // 303 uint32_t outDataSize; 304 const uint8_t *outData; 305 outData = bi->getBinaryRules(outDataSize); 306 307 // Copy the data format version numbers from the RBBI data header into the UDataMemory header. 308 uprv_memcpy(dh.info.formatVersion, ((RBBIDataHeader *)outData)->fFormatVersion, sizeof(dh.info.formatVersion)); 309 310 // 311 // Create the output file 312 // 313 size_t bytesWritten; 314 UNewDataMemory *pData; 315 pData = udata_create(outDir, NULL, outFileName, &(dh.info), copyright, &status); 316 if(U_FAILURE(status)) { 317 fprintf(stderr, "genbrk: Could not open output file \"%s\", \"%s\"\n", 318 outFileName, u_errorName(status)); 319 exit(status); 320 } 321 322 323 // Write the data itself. 324 udata_writeBlock(pData, outData, outDataSize); 325 // finish up 326 bytesWritten = udata_finish(pData, &status); 327 if(U_FAILURE(status)) { 328 fprintf(stderr, "genbrk: error %d writing the output file\n", status); 329 exit(status); 330 } 331 332 if (bytesWritten != outDataSize) { 333 fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName); 334 exit(-1); 335 } 336 337 delete bi; 338 delete[] ruleSourceU; 339 delete[] ruleBufferC; 340 u_cleanup(); 341 342 343 if(!options[8].doesOccur) { 344 printf("genbrk: tool completed successfully.\n"); 345 } 346 return 0; 347 348#endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 349} 350 351