1b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/* 2b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru********************************************************************** 350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho* Copyright (C) 2002-2009, International Business Machines 4b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Corporation and others. All Rights Reserved. 5b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru********************************************************************** 6b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* 7b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* File genbrk.c 8b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 9b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 10b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//-------------------------------------------------------------------- 11b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 12b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Tool for generating RuleBasedBreakIterator data files (.brk files). 13b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// .brk files contain the precompiled rules for standard types 14b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// of iterators - word, line, sentence, etc. 15b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 16b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Usage: genbrk [options] -r rule-file.txt -o output-file.brk 17b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 18b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// options: -v verbose 19b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// -? or -h help 20b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 21b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// The input rule file is a plain text file containing break rules 22b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// in the input format accepted by RuleBasedBreakIterators. The 23b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// file can be encoded as utf-8, or utf-16 (either endian), or 24b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// in the default code page (platform dependent.). utf encoded 25b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// files must include a BOM. 26b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 27b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//-------------------------------------------------------------------- 28b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 29b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/utypes.h" 30b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/ucnv.h" 31b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/unistr.h" 32b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/rbbi.h" 33b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/uclean.h" 34b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/udata.h" 35b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/putil.h" 36b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 37b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "uoptions.h" 38b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unewdata.h" 39b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "ucmndata.h" 40b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "rbbidata.h" 41b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "cmemory.h" 42b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 43b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include <stdio.h> 44b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include <stdlib.h> 45b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include <string.h> 46b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 47b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_NAMESPACE_USE 48b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 49b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic char *progName; 50b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic UOption options[]={ 51b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UOPTION_HELP_H, /* 0 */ 52b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UOPTION_HELP_QUESTION_MARK, /* 1 */ 53b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UOPTION_VERBOSE, /* 2 */ 54b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { "rules", NULL, NULL, NULL, 'r', UOPT_REQUIRES_ARG, 0 }, /* 3 */ 55b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { "out", NULL, NULL, NULL, 'o', UOPT_REQUIRES_ARG, 0 }, /* 4 */ 56b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UOPTION_ICUDATADIR, /* 5 */ 57b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UOPTION_DESTDIR, /* 6 */ 58b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UOPTION_COPYRIGHT, /* 7 */ 59b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru}; 60b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 61b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid usageAndDie(int retCode) { 62b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru printf("Usage: %s [-v] [-options] -r rule-file -o output-file\n", progName); 63b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru printf("\tRead in break iteration rules text and write out the binary data\n" 64b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "options:\n" 65b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\t-h or -? or --help this usage text\n" 66b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\t-V or --version show a version message\n" 67b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\t-c or --copyright include a copyright notice\n" 68b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\t-v or --verbose turn on verbose output\n" 69b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\t-i or --icudatadir directory for locating any needed intermediate data files,\n" 70b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\t followed by path, defaults to %s\n" 71b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\t-d or --destdir destination directory, followed by the path\n", 72b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru u_getDataDirectory()); 73b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru exit (retCode); 74b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 75b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 76b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 7750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO 78b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 79b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/* dummy UDataInfo cf. udata.h */ 80b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic UDataInfo dummyDataInfo = { 81b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru sizeof(UDataInfo), 82b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 0, 83b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 84b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_IS_BIG_ENDIAN, 85b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_CHARSET_FAMILY, 86b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_SIZEOF_UCHAR, 87b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 0, 88b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 89b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 0, 0, 0, 0 }, /* dummy dataFormat */ 90b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 0, 0, 0, 0 }, /* dummy formatVersion */ 91b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 0, 0, 0, 0 } /* dummy dataVersion */ 92b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru}; 93b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 94b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#else 95b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 96b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 97b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Set up the ICU data header, defined in ucmndata.h 98b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 99b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruDataHeader dh ={ 100b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru {sizeof(DataHeader), // Struct MappedData 101b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 0xda, 102b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 0x27}, 103b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 104b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { // struct UDataInfo 105b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru sizeof(UDataInfo), // size 106b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 0, // reserved 107b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_IS_BIG_ENDIAN, 108b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_CHARSET_FAMILY, 109b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_SIZEOF_UCHAR, 110b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 0, // reserved 111b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 112b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 0x42, 0x72, 0x6b, 0x20 }, // dataFormat="Brk " 113b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 0xff, 0, 0, 0 }, // formatVersion. Filled in later with values 114b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // from the RBBI rule builder. The values declared 115b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // here should never appear in any real RBBI data. 116b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 4, 1, 0, 0 } // dataVersion (Unicode version) 117b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru }}; 118b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 119b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif 120b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 121b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//---------------------------------------------------------------------------- 122b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 123b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// main for genbrk 124b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 125b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//---------------------------------------------------------------------------- 126b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruint main(int argc, char **argv) { 127b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 128b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const char *ruleFileName; 129b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const char *outFileName; 130b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const char *outDir = NULL; 131b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const char *copyright = NULL; 132b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 133b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 134b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Pick up and check the command line arguments, 135b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // using the standard ICU tool utils option handling. 136b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 137b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_MAIN_INIT_ARGS(argc, argv); 138b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru progName = argv[0]; 139b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options); 140b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(argc<0) { 141b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Unrecognized option 142b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]); 143b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); 144b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 145b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 146b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(options[0].doesOccur || options[1].doesOccur) { 147b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // -? or -h for help. 148b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru usageAndDie(0); 149b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 150b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 151b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!(options[3].doesOccur && options[4].doesOccur)) { 152b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fprintf(stderr, "rule file and output file must both be specified.\n"); 153b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); 154b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 155b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ruleFileName = options[3].value; 156b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru outFileName = options[4].value; 157b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 158b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (options[5].doesOccur) { 159b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru u_setDataDirectory(options[5].value); 160b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 161b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 162b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru status = U_ZERO_ERROR; 163b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 164b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru /* Combine the directory with the file name */ 165b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(options[6].doesOccur) { 166b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru outDir = options[6].value; 167b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 168b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (options[7].doesOccur) { 169b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru copyright = U_COPYRIGHT_STRING; 170b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 171b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 17250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO 173b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 174b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UNewDataMemory *pData; 175b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru char msg[1024]; 176b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 177b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru /* write message with just the name */ 17850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho sprintf(msg, "genbrk writes dummy %s because of UCONFIG_NO_BREAK_ITERATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName); 179b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fprintf(stderr, "%s\n", msg); 180b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 181b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru /* write the dummy data file */ 182b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pData = udata_create(outDir, NULL, outFileName, &dummyDataInfo, NULL, &status); 183b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru udata_writeBlock(pData, msg, strlen(msg)); 184b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru udata_finish(pData, &status); 185b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return (int)status; 186b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 187b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#else 18850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho /* Initialize ICU */ 18950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho u_init(&status); 19050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if (U_FAILURE(status)) { 19150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fprintf(stderr, "%s: can not initialize ICU. status = %s\n", 19250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho argv[0], u_errorName(status)); 19350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho exit(1); 19450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 19550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho status = U_ZERO_ERROR; 196b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 197b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 198b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Read in the rule source file 199b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 200b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru long result; 201b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru long ruleFileSize; 202b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru FILE *file; 203b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru char *ruleBufferC; 204b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 205b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru file = fopen(ruleFileName, "rb"); 206b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if( file == 0 ) { 207b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fprintf(stderr, "Could not open file \"%s\"\n", ruleFileName); 208b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru exit(-1); 209b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 210b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fseek(file, 0, SEEK_END); 211b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ruleFileSize = ftell(file); 212b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fseek(file, 0, SEEK_SET); 213b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ruleBufferC = new char[ruleFileSize+10]; 214b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 215b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru result = (long)fread(ruleBufferC, 1, ruleFileSize, file); 216b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (result != ruleFileSize) { 217b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fprintf(stderr, "Error reading file \"%s\"\n", ruleFileName); 218b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru exit (-1); 219b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 220b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ruleBufferC[ruleFileSize]=0; 221b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fclose(file); 222b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 223b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 224b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Look for a Unicode Signature (BOM) on the rule file 225b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 226b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t signatureLength; 227b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const char * ruleSourceC = ruleBufferC; 228b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const char* encoding = ucnv_detectUnicodeSignature( 229b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ruleSourceC, ruleFileSize, &signatureLength, &status); 230b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(status)) { 231b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru exit(status); 232b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 233b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(encoding!=NULL ){ 234b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ruleSourceC += signatureLength; 235b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ruleFileSize -= signatureLength; 236b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 237b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 238b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 239b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Open a converter to take the rule file to UTF-16 240b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 241b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UConverter* conv; 242b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru conv = ucnv_open(encoding, &status); 243b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(status)) { 244b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status)); 245b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru exit(status); 246b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 247b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 248b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 249b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Convert the rules to UChar. 250b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Preflight first to determine required buffer size. 251b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 252b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uint32_t destCap = ucnv_toUChars(conv, 253b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru NULL, // dest, 254b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 0, // destCapacity, 255b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ruleSourceC, 256b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ruleFileSize, 257b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru &status); 258b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (status != U_BUFFER_OVERFLOW_ERROR) { 259b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 260b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru exit(status); 261b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru }; 262b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 263b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru status = U_ZERO_ERROR; 264b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar *ruleSourceU = new UChar[destCap+1]; 265b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucnv_toUChars(conv, 266b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ruleSourceU, // dest, 267b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru destCap+1, 268b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ruleSourceC, 269b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ruleFileSize, 270b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru &status); 271b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(status)) { 272b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 273b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru exit(status); 274b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru }; 275b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucnv_close(conv); 276b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 277b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 278b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 279b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Put the source rules into a UnicodeString 280b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 281b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString ruleSourceS(FALSE, ruleSourceU, destCap); 282b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 283b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 284b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Create the break iterator from the rules 285b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // This will compile the rules. 286b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 287b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UParseError parseError; 288b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru parseError.line = 0; 289b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru parseError.offset = 0; 290b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(ruleSourceS, parseError, status); 291b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(status)) { 292b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fprintf(stderr, "createRuleBasedBreakIterator: ICU Error \"%s\" at line %d, column %d\n", 293b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru u_errorName(status), (int)parseError.line, (int)parseError.offset); 294b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru exit(status); 295b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru }; 296b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 297b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 298b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 299b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Get the compiled rule data from the break iterator. 300b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 301b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uint32_t outDataSize; 302b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const uint8_t *outData; 303b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru outData = bi->getBinaryRules(outDataSize); 304b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 305b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Copy the data format version numbers from the RBBI data header into the UDataMemory header. 306b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uprv_memcpy(dh.info.formatVersion, ((RBBIDataHeader *)outData)->fFormatVersion, sizeof(dh.info.formatVersion)); 307b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 308b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 309b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Create the output file 310b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 311b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru size_t bytesWritten; 312b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UNewDataMemory *pData; 313b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pData = udata_create(outDir, NULL, outFileName, &(dh.info), copyright, &status); 314b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(U_FAILURE(status)) { 315b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fprintf(stderr, "genbrk: Could not open output file \"%s\", \"%s\"\n", 316b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru outFileName, u_errorName(status)); 317b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru exit(status); 318b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 319b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 320b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 321b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Write the data itself. 322b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru udata_writeBlock(pData, outData, outDataSize); 323b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // finish up 324b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru bytesWritten = udata_finish(pData, &status); 325b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(U_FAILURE(status)) { 326b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fprintf(stderr, "genbrk: error %d writing the output file\n", status); 327b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru exit(status); 328b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 329b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 330b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (bytesWritten != outDataSize) { 331b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName); 332b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru exit(-1); 333b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 334b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 335b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete bi; 336b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete[] ruleSourceU; 337b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete[] ruleBufferC; 338b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru u_cleanup(); 339b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 340b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 341b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru printf("genbrk: tool completed successfully.\n"); 342b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return 0; 343b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 344b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 345b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 346b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 347