1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5*
6*   Copyright (C) 2009-2014, International Business Machines
7*   Corporation and others.  All Rights Reserved.
8*
9*******************************************************************************
10*   file name:  gennorm2.cpp
11*   encoding:   UTF-8
12*   tab size:   8 (not used)
13*   indentation:4
14*
15*   created on: 2009nov25
16*   created by: Markus W. Scherer
17*
18*   This program reads text files that define Unicode normalization,
19*   parses them, and builds a binary data file.
20*/
21
22#include "unicode/utypes.h"
23#include "n2builder.h"
24
25#include <fstream>
26#include <stdio.h>
27#include <stdlib.h>
28#include <string>
29#include <string.h>
30#include "unicode/errorcode.h"
31#include "unicode/localpointer.h"
32#include "unicode/putil.h"
33#include "unicode/uchar.h"
34#include "unicode/unistr.h"
35#include "charstr.h"
36#include "normalizer2impl.h"
37#include "toolutil.h"
38#include "uoptions.h"
39#include "uparse.h"
40
41#if UCONFIG_NO_NORMALIZATION
42#include "unewdata.h"
43#endif
44
45U_NAMESPACE_BEGIN
46
47UBool beVerbose=FALSE, haveCopyright=TRUE;
48
49#if !UCONFIG_NO_NORMALIZATION
50void parseFile(std::ifstream &f, Normalizer2DataBuilder &builder);
51#endif
52
53/* -------------------------------------------------------------------------- */
54
// Indexes into options[]. The enumerators are listed in the same order as
// the entries of that array, so options[NAME] selects the matching option.
enum {
    HELP_H=0,               // -h
    HELP_QUESTION_MARK,     // -?
    VERBOSE,                // -v / --verbose
    COPYRIGHT,              // -c / --copyright
    SOURCEDIR,              // -s / --sourcedir
    OUTPUT_FILENAME,        // -o / --output
    UNICODE_VERSION,        // -u / --unicode
    WRITE_C_SOURCE,         // --csource
    WRITE_COMBINED_DATA,    // --combined
    OPT_FAST                // --fast
};
67
68static UOption options[]={
69    UOPTION_HELP_H,
70    UOPTION_HELP_QUESTION_MARK,
71    UOPTION_VERBOSE,
72    UOPTION_COPYRIGHT,
73    UOPTION_SOURCEDIR,
74    UOPTION_DEF("output", 'o', UOPT_REQUIRES_ARG),
75    UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
76    UOPTION_DEF("csource", '\1', UOPT_NO_ARG),
77    UOPTION_DEF("combined", '\1', UOPT_NO_ARG),
78    UOPTION_DEF("fast", '\1', UOPT_NO_ARG)
79};
80
81extern "C" int
82main(int argc, char* argv[]) {
83    U_MAIN_INIT_ARGS(argc, argv);
84
85    /* preset then read command line options */
86    options[SOURCEDIR].value="";
87    argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[HELP_H]), options);
88
89    /* error handling, printing usage message */
90    if(argc<0) {
91        fprintf(stderr,
92            "error in command line argument \"%s\"\n",
93            argv[-argc]);
94    }
95    if(!options[OUTPUT_FILENAME].doesOccur) {
96        argc=-1;
97    }
98    if( argc<2 ||
99        options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur
100    ) {
101        fprintf(stderr,
102            "Usage: %s [-options] infiles+ -o outputfilename\n"
103            "\n"
104            "Reads the infiles with normalization data and\n"
105            "creates a binary file, or a C source file (--csource), with the data,\n"
106            "or writes a data file with the combined data (--combined).\n"
107            "See http://userguide.icu-project.org/transforms/normalization#TOC-Data-File-Syntax\n"
108            "\n"
109            "Alternate usage: %s [-options] a.txt b.txt minus p.txt q.txt -o outputfilename\n"
110            "\n"
111            "Computes the difference of (a, b) minus (p, q) and writes the diff data\n"
112            "in input-file syntax to the outputfilename.\n"
113            "It is then possible to build (p, q, diff) to get the same data as (a, b).\n"
114            "(Useful for computing minimal incremental mapping data files.)\n"
115            "\n",
116            argv[0], argv[0]);
117        fprintf(stderr,
118            "Options:\n"
119            "\t-h or -? or --help  this usage text\n"
120            "\t-v or --verbose     verbose output\n"
121            "\t-c or --copyright   include a copyright notice\n"
122            "\t-u or --unicode     Unicode version, followed by the version like 5.2.0\n");
123        fprintf(stderr,
124            "\t-s or --sourcedir   source directory, followed by the path\n"
125            "\t-o or --output      output filename\n"
126            "\t      --csource     writes a C source file with initializers\n"
127            "\t      --combined    writes a .txt file (input-file syntax) with the\n"
128            "\t                    combined data from all of the input files\n");
129        fprintf(stderr,
130            "\t      --fast        optimize the data for fast normalization,\n"
131            "\t                    which might increase its size  (Writes fully decomposed\n"
132            "\t                    regular mappings instead of delta mappings.\n"
133            "\t                    You should measure the runtime speed to make sure that\n"
134            "\t                    this is a good trade-off.)\n");
135        return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
136    }
137
138    beVerbose=options[VERBOSE].doesOccur;
139    haveCopyright=options[COPYRIGHT].doesOccur;
140
141    IcuToolErrorCode errorCode("gennorm2/main()");
142
143#if UCONFIG_NO_NORMALIZATION
144
145    fprintf(stderr,
146        "gennorm2 writes a dummy binary data file "
147        "because UCONFIG_NO_NORMALIZATION is set, \n"
148        "see icu/source/common/unicode/uconfig.h\n");
149    udata_createDummy(NULL, NULL, options[OUTPUT_FILENAME].value, errorCode);
150    // Should not return an error since this is the expected behaviour if UCONFIG_NO_NORMALIZATION is on.
151    // return U_UNSUPPORTED_ERROR;
152    return 0;
153
154#else
155
156    LocalPointer<Normalizer2DataBuilder> b1(new Normalizer2DataBuilder(errorCode), errorCode);
157    LocalPointer<Normalizer2DataBuilder> b2;
158    LocalPointer<Normalizer2DataBuilder> diff;
159    Normalizer2DataBuilder *builder = b1.getAlias();
160    errorCode.assertSuccess();
161
162    if(options[UNICODE_VERSION].doesOccur) {
163        builder->setUnicodeVersion(options[UNICODE_VERSION].value);
164    }
165
166    if(options[OPT_FAST].doesOccur) {
167        builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST);
168    }
169
170    // prepare the filename beginning with the source dir
171    CharString filename(options[SOURCEDIR].value, errorCode);
172    int32_t pathLength=filename.length();
173    if( pathLength>0 &&
174        filename[pathLength-1]!=U_FILE_SEP_CHAR &&
175        filename[pathLength-1]!=U_FILE_ALT_SEP_CHAR
176    ) {
177        filename.append(U_FILE_SEP_CHAR, errorCode);
178        pathLength=filename.length();
179    }
180
181    bool doMinus = false;
182    for(int i=1; i<argc; ++i) {
183        printf("gennorm2: processing %s\n", argv[i]);
184        if(strcmp(argv[i], "minus") == 0) {
185            if(doMinus) {
186                fprintf(stderr, "gennorm2 error: only one 'minus' can be specified\n");
187                exit(U_ILLEGAL_ARGUMENT_ERROR);
188            }
189            // Data from previous input files has been collected in b1.
190            // Collect data from further input files in b2.
191            b2.adoptInsteadAndCheckErrorCode(new Normalizer2DataBuilder(errorCode), errorCode);
192            diff.adoptInsteadAndCheckErrorCode(new Normalizer2DataBuilder(errorCode), errorCode);
193            errorCode.assertSuccess();
194            builder = b2.getAlias();
195            if(options[UNICODE_VERSION].doesOccur) {
196                builder->setUnicodeVersion(options[UNICODE_VERSION].value);
197            }
198            if(options[OPT_FAST].doesOccur) {
199                builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST);
200            }
201            doMinus = true;
202            continue;
203        }
204        filename.append(argv[i], errorCode);
205        std::ifstream f(filename.data());
206        if(f.fail()) {
207            fprintf(stderr, "gennorm2 error: unable to open %s\n", filename.data());
208            exit(U_FILE_ACCESS_ERROR);
209        }
210        builder->setOverrideHandling(Normalizer2DataBuilder::OVERRIDE_PREVIOUS);
211        parseFile(f, *builder);
212        filename.truncate(pathLength);
213    }
214
215    if(doMinus) {
216        Normalizer2DataBuilder::computeDiff(*b1, *b2, *diff);
217        diff->writeDataFile(options[OUTPUT_FILENAME].value, /* writeRemoved= */ true);
218    } else if(options[WRITE_COMBINED_DATA].doesOccur) {
219        builder->writeDataFile(options[OUTPUT_FILENAME].value, /* writeRemoved= */ false);
220    } else if(options[WRITE_C_SOURCE].doesOccur) {
221        builder->writeCSourceFile(options[OUTPUT_FILENAME].value);
222    } else {
223        builder->writeBinaryFile(options[OUTPUT_FILENAME].value);
224    }
225
226    return errorCode.get();
227
228#endif
229}
230
231#if !UCONFIG_NO_NORMALIZATION
232
233void parseFile(std::ifstream &f, Normalizer2DataBuilder &builder) {
234    IcuToolErrorCode errorCode("gennorm2/parseFile()");
235    std::string lineString;
236    uint32_t startCP, endCP;
237    while(std::getline(f, lineString)) {
238        if (lineString.empty()) {
239            continue;  // skip empty lines.
240        }
241#if (U_CPLUSPLUS_VERSION >= 11)
242        char *line = &lineString.front();
243#else
244        char *line = &lineString.at(0);
245#endif
246        char *comment=(char *)strchr(line, '#');
247        if(comment!=NULL) {
248            *comment=0;
249        }
250        u_rtrim(line);
251        if(line[0]==0) {
252            continue;  // skip empty and comment-only lines
253        }
254        if(line[0]=='*') {
255            const char *s=u_skipWhitespace(line+1);
256            if(0==strncmp(s, "Unicode", 7)) {
257                s=u_skipWhitespace(s+7);
258                builder.setUnicodeVersion(s);
259            }
260            continue;  // reserved syntax
261        }
262        const char *delimiter;
263        int32_t rangeLength=
264            u_parseCodePointRangeAnyTerminator(line, &startCP, &endCP, &delimiter, errorCode);
265        if(errorCode.isFailure()) {
266            fprintf(stderr, "gennorm2 error: parsing code point range from %s\n", line);
267            exit(errorCode.reset());
268        }
269        delimiter=u_skipWhitespace(delimiter);
270        if(*delimiter==':') {
271            const char *s=u_skipWhitespace(delimiter+1);
272            char *end;
273            unsigned long value=strtoul(s, &end, 10);
274            if(end<=s || *u_skipWhitespace(end)!=0 || value>=0xff) {
275                fprintf(stderr, "gennorm2 error: parsing ccc from %s\n", line);
276                exit(U_PARSE_ERROR);
277            }
278            for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
279                builder.setCC(c, (uint8_t)value);
280            }
281            continue;
282        }
283        if(*delimiter=='-') {
284            if(*u_skipWhitespace(delimiter+1)!=0) {
285                fprintf(stderr, "gennorm2 error: parsing remove-mapping %s\n", line);
286                exit(U_PARSE_ERROR);
287            }
288            for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
289                builder.removeMapping(c);
290            }
291            continue;
292        }
293        if(*delimiter=='=' || *delimiter=='>') {
294            UChar uchars[Normalizer2Impl::MAPPING_LENGTH_MASK];
295            int32_t length=u_parseString(delimiter+1, uchars, UPRV_LENGTHOF(uchars), NULL, errorCode);
296            if(errorCode.isFailure()) {
297                fprintf(stderr, "gennorm2 error: parsing mapping string from %s\n", line);
298                exit(errorCode.reset());
299            }
300            UnicodeString mapping(FALSE, uchars, length);
301            if(*delimiter=='=') {
302                if(rangeLength!=1) {
303                    fprintf(stderr,
304                            "gennorm2 error: round-trip mapping for more than 1 code point on %s\n",
305                            line);
306                    exit(U_PARSE_ERROR);
307                }
308                builder.setRoundTripMapping((UChar32)startCP, mapping);
309            } else {
310                for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
311                    builder.setOneWayMapping(c, mapping);
312                }
313            }
314            continue;
315        }
316        fprintf(stderr, "gennorm2 error: unrecognized data line %s\n", line);
317        exit(U_PARSE_ERROR);
318    }
319}
320
321#endif // !UCONFIG_NO_NORMALIZATION
322
323U_NAMESPACE_END
324
325/*
326 * Hey, Emacs, please set the following:
327 *
328 * Local Variables:
329 * indent-tabs-mode: nil
330 * End:
331 *
332 */
333