1/*
2**********************************************************************
3*   Copyright (C) 2002-2015, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5**********************************************************************
6*
7* File genbrk.c
8*/
9
10//--------------------------------------------------------------------
11//
12//   Tool for generating RuleBasedBreakIterator data files (.brk files).
13//   .brk files contain the precompiled rules for standard types
14//   of iterators - word, line, sentence, etc.
15//
16//   Usage:  genbrk [options] -r rule-file.txt  -o output-file.brk
17//
18//       options:   -v         verbose
19//                  -? or -h   help
20//
//   The input rule file is a plain text file containing break rules
//    in the input format accepted by RuleBasedBreakIterators.  The
//    file can be encoded as UTF-8, or UTF-16 (either endian), or
//    in the default code page (platform dependent).  UTF-encoded
//    files must include a BOM.
26//
27//--------------------------------------------------------------------
28
29#include "unicode/utypes.h"
30#include "unicode/ucnv.h"
31#include "unicode/unistr.h"
32#include "unicode/rbbi.h"
33#include "unicode/uclean.h"
34#include "unicode/udata.h"
35#include "unicode/putil.h"
36
37#include "uoptions.h"
38#include "unewdata.h"
39#include "ucmndata.h"
40#include "rbbidata.h"
41#include "cmemory.h"
42
43#include <stdio.h>
44#include <stdlib.h>
45#include <string.h>
46
47U_NAMESPACE_USE
48
49static char *progName;
50static UOption options[]={
51    UOPTION_HELP_H,             /* 0 */
52    UOPTION_HELP_QUESTION_MARK, /* 1 */
53    UOPTION_VERBOSE,            /* 2 */
54    { "rules", NULL, NULL, NULL, 'r', UOPT_REQUIRES_ARG, 0 },   /* 3 */
55    { "out",   NULL, NULL, NULL, 'o', UOPT_REQUIRES_ARG, 0 },   /* 4 */
56    UOPTION_ICUDATADIR,         /* 5 */
57    UOPTION_DESTDIR,            /* 6 */
58    UOPTION_COPYRIGHT,          /* 7 */
59    UOPTION_QUIET,              /* 8 */
60};
61
62void usageAndDie(int retCode) {
63        printf("Usage: %s [-v] [-options] -r rule-file -o output-file\n", progName);
64        printf("\tRead in break iteration rules text and write out the binary data\n"
65            "options:\n"
66            "\t-h or -? or --help  this usage text\n"
67            "\t-V or --version     show a version message\n"
68            "\t-c or --copyright   include a copyright notice\n"
69            "\t-v or --verbose     turn on verbose output\n"
70            "\t-q or --quiet       do not display warnings and progress\n"
71            "\t-i or --icudatadir  directory for locating any needed intermediate data files,\n"
72            "\t                    followed by path, defaults to %s\n"
73            "\t-d or --destdir     destination directory, followed by the path\n",
74            u_getDataDirectory());
75        exit (retCode);
76}
77
78
79#if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO
80
81/* dummy UDataInfo cf. udata.h */
// Placeholder header info for the dummy output file that main() writes when
// break iteration or file I/O is configured out.  The format and version
// fields are all zero.
static UDataInfo dummyDataInfo = {
    sizeof(UDataInfo),              /* size */
    0,                              /* reserved */

    U_IS_BIG_ENDIAN,                /* byte order of the build platform */
    U_CHARSET_FAMILY,               /* charset family of the build platform */
    U_SIZEOF_UCHAR,                 /* size of a UChar on the build platform */
    0,                              /* reserved */

    { 0, 0, 0, 0 },                 /* dummy dataFormat */
    { 0, 0, 0, 0 },                 /* dummy formatVersion */
    { 0, 0, 0, 0 }                  /* dummy dataVersion */
};
95
96#else
97
98//
99//  Set up the ICU data header, defined in ucmndata.h
100//
// Header template for the generated .brk file.  In this file main() only
// touches dh.info: it overwrites info.formatVersion with the version reported
// by the compiled rules and passes &dh.info to udata_create().
DataHeader dh ={
    {sizeof(DataHeader),           // struct MappedData: header size,
        0xda,                      //     followed by the two
        0x27},                     //     magic bytes

    {                               // struct UDataInfo
        sizeof(UDataInfo),          //     size
        0,                          //     reserved
        U_IS_BIG_ENDIAN,            //     byte order of the build platform
        U_CHARSET_FAMILY,           //     charset family of the build platform
        U_SIZEOF_UCHAR,             //     size of a UChar on the build platform
        0,                          //     reserved

    { 0x42, 0x72, 0x6b, 0x20 },     //     dataFormat="Brk "
    { 0xff, 0, 0, 0 },              //     formatVersion.  Placeholder only; main()
                                    //      replaces it with the values taken from the
                                    //      compiled rule data.  The values declared
                                    //      here should never appear in any real RBBI data.
        { 4, 1, 0, 0 }              //     dataVersion (Unicode version)
    }};
120
121#endif
122
123//----------------------------------------------------------------------------
124//
125//  main      for genbrk
126//
127//----------------------------------------------------------------------------
128int  main(int argc, char **argv) {
129    UErrorCode  status = U_ZERO_ERROR;
130    const char *ruleFileName;
131    const char *outFileName;
132    const char *outDir = NULL;
133    const char *copyright = NULL;
134
135    //
136    // Pick up and check the command line arguments,
137    //    using the standard ICU tool utils option handling.
138    //
139    U_MAIN_INIT_ARGS(argc, argv);
140    progName = argv[0];
141    argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
142    if(argc<0) {
143        // Unrecognized option
144        fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]);
145        usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
146    }
147
148    if(options[0].doesOccur || options[1].doesOccur) {
149        //  -? or -h for help.
150        usageAndDie(0);
151    }
152
153    if (!(options[3].doesOccur && options[4].doesOccur)) {
154        fprintf(stderr, "rule file and output file must both be specified.\n");
155        usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
156    }
157    ruleFileName = options[3].value;
158    outFileName  = options[4].value;
159
160    if (options[5].doesOccur) {
161        u_setDataDirectory(options[5].value);
162    }
163
164    status = U_ZERO_ERROR;
165
166    /* Combine the directory with the file name */
167    if(options[6].doesOccur) {
168        outDir = options[6].value;
169    }
170    if (options[7].doesOccur) {
171        copyright = U_COPYRIGHT_STRING;
172    }
173
174#if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO
175
176    UNewDataMemory *pData;
177    char msg[1024];
178
179    /* write message with just the name */
180    sprintf(msg, "genbrk writes dummy %s because of UCONFIG_NO_BREAK_ITERATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName);
181    fprintf(stderr, "%s\n", msg);
182
183    /* write the dummy data file */
184    pData = udata_create(outDir, NULL, outFileName, &dummyDataInfo, NULL, &status);
185    udata_writeBlock(pData, msg, strlen(msg));
186    udata_finish(pData, &status);
187    return (int)status;
188
189#else
190    /* Initialize ICU */
191    u_init(&status);
192    if (U_FAILURE(status)) {
193        fprintf(stderr, "%s: can not initialize ICU.  status = %s\n",
194            argv[0], u_errorName(status));
195        exit(1);
196    }
197    status = U_ZERO_ERROR;
198
199    //
200    //  Read in the rule source file
201    //
202    long        result;
203    long        ruleFileSize;
204    FILE        *file;
205    char        *ruleBufferC;
206
207    file = fopen(ruleFileName, "rb");
208    if( file == 0 ) {
209        fprintf(stderr, "Could not open file \"%s\"\n", ruleFileName);
210        exit(-1);
211    }
212    fseek(file, 0, SEEK_END);
213    ruleFileSize = ftell(file);
214    fseek(file, 0, SEEK_SET);
215    ruleBufferC = new char[ruleFileSize+10];
216
217    result = (long)fread(ruleBufferC, 1, ruleFileSize, file);
218    if (result != ruleFileSize)  {
219        fprintf(stderr, "Error reading file \"%s\"\n", ruleFileName);
220        exit (-1);
221    }
222    ruleBufferC[ruleFileSize]=0;
223    fclose(file);
224
225    //
226    // Look for a Unicode Signature (BOM) on the rule file
227    //
228    int32_t        signatureLength;
229    const char *   ruleSourceC = ruleBufferC;
230    const char*    encoding = ucnv_detectUnicodeSignature(
231                           ruleSourceC, ruleFileSize, &signatureLength, &status);
232    if (U_FAILURE(status)) {
233        exit(status);
234    }
235    if(encoding!=NULL ){
236        ruleSourceC  += signatureLength;
237        ruleFileSize -= signatureLength;
238    }
239
240    //
241    // Open a converter to take the rule file to UTF-16
242    //
243    UConverter* conv;
244    conv = ucnv_open(encoding, &status);
245    if (U_FAILURE(status)) {
246        fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status));
247        exit(status);
248    }
249
250    //
251    // Convert the rules to UChar.
252    //  Preflight first to determine required buffer size.
253    //
254    uint32_t destCap = ucnv_toUChars(conv,
255                       NULL,           //  dest,
256                       0,              //  destCapacity,
257                       ruleSourceC,
258                       ruleFileSize,
259                       &status);
260    if (status != U_BUFFER_OVERFLOW_ERROR) {
261        fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
262        exit(status);
263    };
264
265    status = U_ZERO_ERROR;
266    UChar *ruleSourceU = new UChar[destCap+1];
267    ucnv_toUChars(conv,
268                  ruleSourceU,     //  dest,
269                  destCap+1,
270                  ruleSourceC,
271                  ruleFileSize,
272                  &status);
273    if (U_FAILURE(status)) {
274        fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
275        exit(status);
276    };
277    ucnv_close(conv);
278
279
280    //
281    //  Put the source rules into a UnicodeString
282    //
283    UnicodeString ruleSourceS(FALSE, ruleSourceU, destCap);
284
285    //
286    //  Create the break iterator from the rules
287    //     This will compile the rules.
288    //
289    UParseError parseError;
290    parseError.line = 0;
291    parseError.offset = 0;
292    RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(ruleSourceS, parseError, status);
293    if (U_FAILURE(status)) {
294        fprintf(stderr, "createRuleBasedBreakIterator: ICU Error \"%s\"  at line %d, column %d\n",
295                u_errorName(status), (int)parseError.line, (int)parseError.offset);
296        exit(status);
297    };
298
299
300    //
301    //  Get the compiled rule data from the break iterator.
302    //
303    uint32_t        outDataSize;
304    const uint8_t  *outData;
305    outData = bi->getBinaryRules(outDataSize);
306
307    // Copy the data format version numbers from the RBBI data header into the UDataMemory header.
308    uprv_memcpy(dh.info.formatVersion, ((RBBIDataHeader *)outData)->fFormatVersion, sizeof(dh.info.formatVersion));
309
310    //
311    //  Create the output file
312    //
313    size_t bytesWritten;
314    UNewDataMemory *pData;
315    pData = udata_create(outDir, NULL, outFileName, &(dh.info), copyright, &status);
316    if(U_FAILURE(status)) {
317        fprintf(stderr, "genbrk: Could not open output file \"%s\", \"%s\"\n",
318                         outFileName, u_errorName(status));
319        exit(status);
320    }
321
322
323    //  Write the data itself.
324    udata_writeBlock(pData, outData, outDataSize);
325    // finish up
326    bytesWritten = udata_finish(pData, &status);
327    if(U_FAILURE(status)) {
328        fprintf(stderr, "genbrk: error %d writing the output file\n", status);
329        exit(status);
330    }
331
332    if (bytesWritten != outDataSize) {
333        fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName);
334        exit(-1);
335    }
336
337    delete bi;
338    delete[] ruleSourceU;
339    delete[] ruleBufferC;
340    u_cleanup();
341
342
343    if(!options[8].doesOccur) {
344        printf("genbrk: tool completed successfully.\n");
345    }
346    return 0;
347
348#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
349}
350
351