1b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/* 2b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru********************************************************************** 3c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert* Copyright (C) 2002-2015, International Business Machines 4b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Corporation and others. All Rights Reserved. 5b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru********************************************************************** 6b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* 7b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* File genbrk.c 8b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 9b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 10b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//-------------------------------------------------------------------- 11b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 12b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Tool for generating RuleBasedBreakIterator data files (.brk files). 13b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// .brk files contain the precompiled rules for standard types 14b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// of iterators - word, line, sentence, etc. 15b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 16b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Usage: genbrk [options] -r rule-file.txt -o output-file.brk 17b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 18b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// options: -v verbose 19b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// -? or -h help 20b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 21b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// The input rule file is a plain text file containing break rules 22b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// in the input format accepted by RuleBasedBreakIterators. The 23b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// file can be encoded as utf-8, or utf-16 (either endian), or 24b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// in the default code page (platform dependent.). utf encoded 25b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// files must include a BOM. 26b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 27b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//-------------------------------------------------------------------- 28b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 29b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/utypes.h" 30b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/ucnv.h" 31b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/unistr.h" 32b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/rbbi.h" 33b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/uclean.h" 34b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/udata.h" 35b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/putil.h" 36b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 37b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "uoptions.h" 38b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unewdata.h" 39b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "ucmndata.h" 40b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "rbbidata.h" 41b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "cmemory.h" 42b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 43b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include <stdio.h> 44b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include <stdlib.h> 45b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include <string.h> 46b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 47b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_NAMESPACE_USE 48b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 49b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic char *progName; 50b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic UOption options[]={ 51b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UOPTION_HELP_H, /* 0 */ 52b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UOPTION_HELP_QUESTION_MARK, /* 1 */ 53b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UOPTION_VERBOSE, /* 2 */ 54b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { "rules", NULL, NULL, NULL, 'r', UOPT_REQUIRES_ARG, 0 }, /* 3 */ 55b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { "out", NULL, NULL, NULL, 'o', UOPT_REQUIRES_ARG, 0 }, /* 4 */ 56b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UOPTION_ICUDATADIR, /* 5 */ 57b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UOPTION_DESTDIR, /* 6 */ 58b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UOPTION_COPYRIGHT, /* 7 */ 59c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert UOPTION_QUIET, /* 8 */ 60b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru}; 61b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 62b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid usageAndDie(int retCode) { 63b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru printf("Usage: %s [-v] [-options] -r rule-file -o output-file\n", progName); 64b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru printf("\tRead in break iteration rules text and write out the binary data\n" 65b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "options:\n" 66b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\t-h or -? or --help this usage text\n" 67b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\t-V or --version show a version message\n" 68b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\t-c or --copyright include a copyright notice\n" 69b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\t-v or --verbose turn on verbose output\n" 70c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert "\t-q or --quiet do not display warnings and progress\n" 71b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\t-i or --icudatadir directory for locating any needed intermediate data files,\n" 72b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\t followed by path, defaults to %s\n" 73b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\t-d or --destdir destination directory, followed by the path\n", 74b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru u_getDataDirectory()); 75b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru exit (retCode); 76b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 77b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 78b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 7950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO 80b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 81b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/* dummy UDataInfo cf. udata.h */ 82b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic UDataInfo dummyDataInfo = { 83b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru sizeof(UDataInfo), 84b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 0, 85b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 86b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_IS_BIG_ENDIAN, 87b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_CHARSET_FAMILY, 88b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_SIZEOF_UCHAR, 89b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 0, 90b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 91b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 0, 0, 0, 0 }, /* dummy dataFormat */ 92b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 0, 0, 0, 0 }, /* dummy formatVersion */ 93b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 0, 0, 0, 0 } /* dummy dataVersion */ 94b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru}; 95b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 96b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#else 97b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 98b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 99b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Set up the ICU data header, defined in ucmndata.h 100b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 101b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruDataHeader dh ={ 102b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru {sizeof(DataHeader), // Struct MappedData 103b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 0xda, 104b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 0x27}, 105b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 106b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { // struct UDataInfo 107b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru sizeof(UDataInfo), // size 108b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 0, // reserved 109b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_IS_BIG_ENDIAN, 110b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_CHARSET_FAMILY, 111b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_SIZEOF_UCHAR, 112b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 0, // reserved 113b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 114b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 0x42, 0x72, 0x6b, 0x20 }, // dataFormat="Brk " 115b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 0xff, 0, 0, 0 }, // formatVersion. Filled in later with values 116b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // from the RBBI rule builder. The values declared 117b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // here should never appear in any real RBBI data. 118b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 4, 1, 0, 0 } // dataVersion (Unicode version) 119b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru }}; 120b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 121b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif 122b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 123b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//---------------------------------------------------------------------------- 124b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 125b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// main for genbrk 126b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 127b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//---------------------------------------------------------------------------- 128b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruint main(int argc, char **argv) { 129b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 130b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const char *ruleFileName; 131b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const char *outFileName; 132b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const char *outDir = NULL; 133b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const char *copyright = NULL; 134b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 135b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 136b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Pick up and check the command line arguments, 137b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // using the standard ICU tool utils option handling. 138b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 139b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_MAIN_INIT_ARGS(argc, argv); 140b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru progName = argv[0]; 141b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options); 142b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(argc<0) { 143b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Unrecognized option 144b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]); 145b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); 146b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 147b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 148b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(options[0].doesOccur || options[1].doesOccur) { 149b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // -? or -h for help. 150b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru usageAndDie(0); 151b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 152b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 153b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!(options[3].doesOccur && options[4].doesOccur)) { 154b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fprintf(stderr, "rule file and output file must both be specified.\n"); 155b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); 156b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 157b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ruleFileName = options[3].value; 158b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru outFileName = options[4].value; 159b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 160b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (options[5].doesOccur) { 161b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru u_setDataDirectory(options[5].value); 162b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 163b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 164b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru status = U_ZERO_ERROR; 165b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 166b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru /* Combine the directory with the file name */ 167b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(options[6].doesOccur) { 168b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru outDir = options[6].value; 169b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 170b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (options[7].doesOccur) { 171b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru copyright = U_COPYRIGHT_STRING; 172b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 173b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 17450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO 175b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 176b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UNewDataMemory *pData; 177b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru char msg[1024]; 178b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 179b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru /* write message with just the name */ 18050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho sprintf(msg, "genbrk writes dummy %s because of UCONFIG_NO_BREAK_ITERATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName); 181b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fprintf(stderr, "%s\n", msg); 182b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 183b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru /* write the dummy data file */ 184b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pData = udata_create(outDir, NULL, outFileName, &dummyDataInfo, NULL, &status); 185b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru udata_writeBlock(pData, msg, strlen(msg)); 186b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru udata_finish(pData, &status); 187b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return (int)status; 188b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 189b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#else 19050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho /* Initialize ICU */ 19150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho u_init(&status); 19250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if (U_FAILURE(status)) { 19350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fprintf(stderr, "%s: can not initialize ICU. status = %s\n", 19450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho argv[0], u_errorName(status)); 19550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho exit(1); 19650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 19750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho status = U_ZERO_ERROR; 198b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 199b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 200b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Read in the rule source file 201b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 202b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru long result; 203b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru long ruleFileSize; 204b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru FILE *file; 205b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru char *ruleBufferC; 206b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 207b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru file = fopen(ruleFileName, "rb"); 208b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if( file == 0 ) { 209b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fprintf(stderr, "Could not open file \"%s\"\n", ruleFileName); 210b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru exit(-1); 211b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 212b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fseek(file, 0, SEEK_END); 213b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ruleFileSize = ftell(file); 214b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fseek(file, 0, SEEK_SET); 215b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ruleBufferC = new char[ruleFileSize+10]; 216b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 217b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result = (long)fread(ruleBufferC, 1, ruleFileSize, file); 218b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (result != ruleFileSize) { 219b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fprintf(stderr, "Error reading file \"%s\"\n", ruleFileName); 220b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru exit (-1); 221b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 222b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ruleBufferC[ruleFileSize]=0; 223b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fclose(file); 224b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 225b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 226b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Look for a Unicode Signature (BOM) on the rule file 227b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 228b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t signatureLength; 229b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const char * ruleSourceC = ruleBufferC; 230b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const char* encoding = ucnv_detectUnicodeSignature( 231b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ruleSourceC, ruleFileSize, &signatureLength, &status); 232b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(status)) { 233b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru exit(status); 234b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 235b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(encoding!=NULL ){ 236b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ruleSourceC += signatureLength; 237b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ruleFileSize -= signatureLength; 238b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 239b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 240b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 241b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Open a converter to take the rule file to UTF-16 242b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 243b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UConverter* conv; 244b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru conv = ucnv_open(encoding, &status); 245b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(status)) { 246b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status)); 247b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru exit(status); 248b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 249b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 250b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 251b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Convert the rules to UChar. 252b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Preflight first to determine required buffer size. 253b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 254b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uint32_t destCap = ucnv_toUChars(conv, 255b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru NULL, // dest, 256b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 0, // destCapacity, 257b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ruleSourceC, 258b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ruleFileSize, 259b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru &status); 260b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (status != U_BUFFER_OVERFLOW_ERROR) { 261b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 262b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru exit(status); 263b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru }; 264b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 265b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru status = U_ZERO_ERROR; 266b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar *ruleSourceU = new UChar[destCap+1]; 267b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucnv_toUChars(conv, 268b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ruleSourceU, // dest, 269b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru destCap+1, 270b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ruleSourceC, 271b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ruleFileSize, 272b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru &status); 273b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(status)) { 274b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 275b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru exit(status); 276b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru }; 277b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucnv_close(conv); 278b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 279b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 280b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 281b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Put the source rules into a UnicodeString 282b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 283b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString ruleSourceS(FALSE, ruleSourceU, destCap); 284b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 285b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 286b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Create the break iterator from the rules 287b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // This will compile the rules. 288b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 289b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UParseError parseError; 290b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru parseError.line = 0; 291b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru parseError.offset = 0; 292b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(ruleSourceS, parseError, status); 293b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(status)) { 294b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fprintf(stderr, "createRuleBasedBreakIterator: ICU Error \"%s\" at line %d, column %d\n", 295b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru u_errorName(status), (int)parseError.line, (int)parseError.offset); 296b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru exit(status); 297b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru }; 298b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 299b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 300b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 301b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Get the compiled rule data from the break iterator. 302b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 303b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uint32_t outDataSize; 304b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const uint8_t *outData; 305b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru outData = bi->getBinaryRules(outDataSize); 306b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 307b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Copy the data format version numbers from the RBBI data header into the UDataMemory header. 308b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uprv_memcpy(dh.info.formatVersion, ((RBBIDataHeader *)outData)->fFormatVersion, sizeof(dh.info.formatVersion)); 309b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 310b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 311b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Create the output file 312b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 313b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru size_t bytesWritten; 314b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UNewDataMemory *pData; 315b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pData = udata_create(outDir, NULL, outFileName, &(dh.info), copyright, &status); 316b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(U_FAILURE(status)) { 317b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fprintf(stderr, "genbrk: Could not open output file \"%s\", \"%s\"\n", 318b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru outFileName, u_errorName(status)); 319b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru exit(status); 320b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 321b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 322b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 323b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Write the data itself. 324b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru udata_writeBlock(pData, outData, outDataSize); 325b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // finish up 326b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru bytesWritten = udata_finish(pData, &status); 327b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(U_FAILURE(status)) { 328b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fprintf(stderr, "genbrk: error %d writing the output file\n", status); 329b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru exit(status); 330b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 331b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 332b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (bytesWritten != outDataSize) { 333b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName); 334b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru exit(-1); 335b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 336b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 337b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete bi; 338b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete[] ruleSourceU; 339b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete[] ruleBufferC; 340b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru u_cleanup(); 341b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 342b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 343c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert if(!options[8].doesOccur) { 344c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert printf("genbrk: tool completed successfully.\n"); 345c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert } 346b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return 0; 347b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 348b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 349b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 350b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 351