1/*
2*******************************************************************************
3*
4*   Copyright (C) 2009-2014, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*   file name:  gennorm2.cpp
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2009nov25
14*   created by: Markus W. Scherer
15*
16*   This program reads text files that define Unicode normalization,
17*   parses them, and builds a binary data file.
18*/
19
20#include "unicode/utypes.h"
21#include "n2builder.h"
22
23#include <stdio.h>
24#include <stdlib.h>
25#include <string.h>
26#include "unicode/errorcode.h"
27#include "unicode/localpointer.h"
28#include "unicode/putil.h"
29#include "unicode/uchar.h"
30#include "unicode/unistr.h"
31#include "charstr.h"
32#include "normalizer2impl.h"
33#include "toolutil.h"
34#include "uoptions.h"
35#include "uparse.h"
36
37#if UCONFIG_NO_NORMALIZATION
38#include "unewdata.h"
39#endif
40
41U_NAMESPACE_BEGIN
42
// Tool-wide flags; main() overwrites both from the --verbose and --copyright
// command-line options before any input file is processed.
UBool beVerbose=FALSE, haveCopyright=TRUE;

// Declares the LocalStdioFilePointer wrapper type for FILE *.
// NOTE(review): presumably the wrapper calls fclose() when it goes out of
// scope -- confirm against the macro definition in unicode/localpointer.h.
U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose);

#if !UCONFIG_NO_NORMALIZATION
// Forward declaration; the definition follows main().
// Only available when normalization support is compiled in.
void parseFile(FILE *f, Normalizer2DataBuilder &builder);
#endif
50
51/* -------------------------------------------------------------------------- */
52
// Indexes into the options[] array below.
// The enum constants must be kept in the same order as the array entries,
// because main() looks options up as options[<enum constant>].
enum {
    HELP_H,
    HELP_QUESTION_MARK,
    VERBOSE,
    COPYRIGHT,
    SOURCEDIR,
    OUTPUT_FILENAME,
    UNICODE_VERSION,
    WRITE_C_SOURCE,
    OPT_FAST
};

// Command-line option table that main() passes to u_parseArgs().
// One entry per enum constant above, in the same order.
static UOption options[]={
    UOPTION_HELP_H,
    UOPTION_HELP_QUESTION_MARK,
    UOPTION_VERBOSE,
    UOPTION_COPYRIGHT,
    UOPTION_SOURCEDIR,
    UOPTION_DEF("output", 'o', UOPT_REQUIRES_ARG),
    UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
    // NOTE(review): '\1' appears to be a placeholder meaning "no short option";
    // the usage text in main() lists only the long forms of the next two options.
    UOPTION_DEF("csource", '\1', UOPT_NO_ARG),
    UOPTION_DEF("fast", '\1', UOPT_NO_ARG)
};
76
77extern "C" int
78main(int argc, char* argv[]) {
79    U_MAIN_INIT_ARGS(argc, argv);
80
81    /* preset then read command line options */
82    options[SOURCEDIR].value="";
83    argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[HELP_H]), options);
84
85    /* error handling, printing usage message */
86    if(argc<0) {
87        fprintf(stderr,
88            "error in command line argument \"%s\"\n",
89            argv[-argc]);
90    }
91    if(!options[OUTPUT_FILENAME].doesOccur) {
92        argc=-1;
93    }
94    if( argc<2 ||
95        options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur
96    ) {
97        /*
98         * Broken into chunks because the C89 standard says the minimum
99         * required supported string length is 509 bytes.
100         */
101        fprintf(stderr,
102            "Usage: %s [-options] infiles+ -o outputfilename\n"
103            "\n"
104            "Reads the infiles with normalization data and\n"
105            "creates a binary or C source file (outputfilename) with the data.\n"
106            "\n",
107            argv[0]);
108        fprintf(stderr,
109            "Options:\n"
110            "\t-h or -? or --help  this usage text\n"
111            "\t-v or --verbose     verbose output\n"
112            "\t-c or --copyright   include a copyright notice\n"
113            "\t-u or --unicode     Unicode version, followed by the version like 5.2.0\n");
114        fprintf(stderr,
115            "\t-s or --sourcedir   source directory, followed by the path\n"
116            "\t-o or --output      output filename\n"
117            "\t      --csource     writes a C source file with initializers\n");
118        fprintf(stderr,
119            "\t      --fast        optimize the data for fast normalization,\n"
120            "\t                    which might increase its size  (Writes fully decomposed\n"
121            "\t                    regular mappings instead of delta mappings.\n"
122            "\t                    You should measure the runtime speed to make sure that\n"
123            "\t                    this is a good trade-off.)\n");
124        return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
125    }
126
127    beVerbose=options[VERBOSE].doesOccur;
128    haveCopyright=options[COPYRIGHT].doesOccur;
129
130    IcuToolErrorCode errorCode("gennorm2/main()");
131
132#if UCONFIG_NO_NORMALIZATION
133
134    fprintf(stderr,
135        "gennorm2 writes a dummy binary data file "
136        "because UCONFIG_NO_NORMALIZATION is set, \n"
137        "see icu/source/common/unicode/uconfig.h\n");
138    udata_createDummy(NULL, NULL, options[OUTPUT_FILENAME].value, errorCode);
139    // Should not return an error since this is the expected behaviour if UCONFIG_NO_NORMALIZATION is on.
140    // return U_UNSUPPORTED_ERROR;
141    return 0;
142
143#else
144
145    LocalPointer<Normalizer2DataBuilder> builder(new Normalizer2DataBuilder(errorCode), errorCode);
146    errorCode.assertSuccess();
147
148    if(options[UNICODE_VERSION].doesOccur) {
149        builder->setUnicodeVersion(options[UNICODE_VERSION].value);
150    }
151
152    if(options[OPT_FAST].doesOccur) {
153        builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST);
154    }
155
156    // prepare the filename beginning with the source dir
157    CharString filename(options[SOURCEDIR].value, errorCode);
158    int32_t pathLength=filename.length();
159    if( pathLength>0 &&
160        filename[pathLength-1]!=U_FILE_SEP_CHAR &&
161        filename[pathLength-1]!=U_FILE_ALT_SEP_CHAR
162    ) {
163        filename.append(U_FILE_SEP_CHAR, errorCode);
164        pathLength=filename.length();
165    }
166
167    for(int i=1; i<argc; ++i) {
168        printf("gennorm2: processing %s\n", argv[i]);
169        filename.append(argv[i], errorCode);
170        LocalStdioFilePointer f(fopen(filename.data(), "r"));
171        if(f==NULL) {
172            fprintf(stderr, "gennorm2 error: unable to open %s\n", filename.data());
173            exit(U_FILE_ACCESS_ERROR);
174        }
175        builder->setOverrideHandling(Normalizer2DataBuilder::OVERRIDE_PREVIOUS);
176        parseFile(f.getAlias(), *builder);
177        filename.truncate(pathLength);
178    }
179
180    if(options[WRITE_C_SOURCE].doesOccur) {
181        builder->writeCSourceFile(options[OUTPUT_FILENAME].value);
182    } else {
183        builder->writeBinaryFile(options[OUTPUT_FILENAME].value);
184    }
185
186    return errorCode.get();
187
188#endif
189}
190
191#if !UCONFIG_NO_NORMALIZATION
192
193void parseFile(FILE *f, Normalizer2DataBuilder &builder) {
194    IcuToolErrorCode errorCode("gennorm2/parseFile()");
195    char line[300];
196    uint32_t startCP, endCP;
197    while(NULL!=fgets(line, (int)sizeof(line), f)) {
198        char *comment=(char *)strchr(line, '#');
199        if(comment!=NULL) {
200            *comment=0;
201        }
202        u_rtrim(line);
203        if(line[0]==0) {
204            continue;  // skip empty and comment-only lines
205        }
206        if(line[0]=='*') {
207            const char *s=u_skipWhitespace(line+1);
208            if(0==strncmp(s, "Unicode", 7)) {
209                s=u_skipWhitespace(s+7);
210                builder.setUnicodeVersion(s);
211            }
212            continue;  // reserved syntax
213        }
214        const char *delimiter;
215        int32_t rangeLength=
216            u_parseCodePointRangeAnyTerminator(line, &startCP, &endCP, &delimiter, errorCode);
217        if(errorCode.isFailure()) {
218            fprintf(stderr, "gennorm2 error: parsing code point range from %s\n", line);
219            exit(errorCode.reset());
220        }
221        delimiter=u_skipWhitespace(delimiter);
222        if(*delimiter==':') {
223            const char *s=u_skipWhitespace(delimiter+1);
224            char *end;
225            unsigned long value=strtoul(s, &end, 10);
226            if(end<=s || *u_skipWhitespace(end)!=0 || value>=0xff) {
227                fprintf(stderr, "gennorm2 error: parsing ccc from %s\n", line);
228                exit(U_PARSE_ERROR);
229            }
230            for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
231                builder.setCC(c, (uint8_t)value);
232            }
233            continue;
234        }
235        if(*delimiter=='-') {
236            if(*u_skipWhitespace(delimiter+1)!=0) {
237                fprintf(stderr, "gennorm2 error: parsing remove-mapping %s\n", line);
238                exit(U_PARSE_ERROR);
239            }
240            for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
241                builder.removeMapping(c);
242            }
243            continue;
244        }
245        if(*delimiter=='=' || *delimiter=='>') {
246            UChar uchars[Normalizer2Impl::MAPPING_LENGTH_MASK];
247            int32_t length=u_parseString(delimiter+1, uchars, UPRV_LENGTHOF(uchars), NULL, errorCode);
248            if(errorCode.isFailure()) {
249                fprintf(stderr, "gennorm2 error: parsing mapping string from %s\n", line);
250                exit(errorCode.reset());
251            }
252            UnicodeString mapping(FALSE, uchars, length);
253            if(*delimiter=='=') {
254                if(rangeLength!=1) {
255                    fprintf(stderr,
256                            "gennorm2 error: round-trip mapping for more than 1 code point on %s\n",
257                            line);
258                    exit(U_PARSE_ERROR);
259                }
260                builder.setRoundTripMapping((UChar32)startCP, mapping);
261            } else {
262                for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
263                    builder.setOneWayMapping(c, mapping);
264                }
265            }
266            continue;
267        }
268        fprintf(stderr, "gennorm2 error: unrecognized data line %s\n", line);
269        exit(U_PARSE_ERROR);
270    }
271}
272
273#endif // !UCONFIG_NO_NORMALIZATION
274
275U_NAMESPACE_END
276
277/*
278 * Hey, Emacs, please set the following:
279 *
280 * Local Variables:
281 * indent-tabs-mode: nil
282 * End:
283 *
284 */
285