1/*
2*******************************************************************************
3*
4*   Copyright (C) 2009-2010, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*   file name:  gennorm2.cpp
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2009nov25
14*   created by: Markus W. Scherer
15*
16*   This program reads text files that define Unicode normalization,
17*   parses them, and builds a binary data file.
18*/
19
20#include "unicode/utypes.h"
21#include "n2builder.h"
22
23#include <stdio.h>
24#include <stdlib.h>
25#include <string.h>
26#include "unicode/errorcode.h"
27#include "unicode/localpointer.h"
28#include "unicode/putil.h"
29#include "unicode/uchar.h"
30#include "unicode/unistr.h"
31#include "charstr.h"
32#include "normalizer2impl.h"
33#include "toolutil.h"
34#include "uoptions.h"
35#include "uparse.h"
36
37#if UCONFIG_NO_NORMALIZATION
38#include "unewdata.h"
39#endif
40
/**
 * Number of elements of a true array (not a pointer), as an int32_t.
 * The whole expansion is parenthesized so that it remains a single operand
 * when it appears next to sizeof or postfix operators.
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
42
43U_NAMESPACE_BEGIN
44
45UBool beVerbose=FALSE, haveCopyright=TRUE;
46
47U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose);
48
49#if !UCONFIG_NO_NORMALIZATION
50void parseFile(FILE *f, Normalizer2DataBuilder &builder);
51#endif
52
53/* -------------------------------------------------------------------------- */
54
/*
 * Indexes into the options[] table.
 * The enumerators must stay in the same order as the entries of options[],
 * because main() looks options up as options[ENUMERATOR].
 */
enum {
    HELP_H=0,            // -h / --help
    HELP_QUESTION_MARK,  // -?
    VERBOSE,             // -v / --verbose
    COPYRIGHT,           // -c / --copyright
    SOURCEDIR,           // -s / --sourcedir <path>
    OUTPUT_FILENAME,     // -o / --output <filename>
    UNICODE_VERSION,     // -u / --unicode <version>
    OPT_FAST             // --fast
};
65
66static UOption options[]={
67    UOPTION_HELP_H,
68    UOPTION_HELP_QUESTION_MARK,
69    UOPTION_VERBOSE,
70    UOPTION_COPYRIGHT,
71    UOPTION_SOURCEDIR,
72    UOPTION_DEF("output", 'o', UOPT_REQUIRES_ARG),
73    UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
74    UOPTION_DEF("fast", '\1', UOPT_NO_ARG)
75};
76
77extern "C" int
78main(int argc, char* argv[]) {
79    U_MAIN_INIT_ARGS(argc, argv);
80
81    /* preset then read command line options */
82    options[SOURCEDIR].value="";
83    options[UNICODE_VERSION].value=U_UNICODE_VERSION;
84    argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[HELP_H]), options);
85
86    /* error handling, printing usage message */
87    if(argc<0) {
88        fprintf(stderr,
89            "error in command line argument \"%s\"\n",
90            argv[-argc]);
91    }
92    if(!options[OUTPUT_FILENAME].doesOccur) {
93        argc=-1;
94    }
95    if( argc<2 ||
96        options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur
97    ) {
98        /*
99         * Broken into chunks because the C89 standard says the minimum
100         * required supported string length is 509 bytes.
101         */
102        fprintf(stderr,
103            "Usage: %s [-options] infiles+ -o outputfilename\n"
104            "\n"
105            "Reads the infiles with normalization data and\n"
106            "creates a binary file (outputfilename) with the data.\n"
107            "\n",
108            argv[0]);
109        fprintf(stderr,
110            "Options:\n"
111            "\t-h or -? or --help  this usage text\n"
112            "\t-v or --verbose     verbose output\n"
113            "\t-c or --copyright   include a copyright notice\n"
114            "\t-u or --unicode     Unicode version, followed by the version like 5.2.0\n");
115        fprintf(stderr,
116            "\t-s or --sourcedir   source directory, followed by the path\n"
117            "\t-o or --output      output filename\n");
118        fprintf(stderr,
119            "\t      --fast        optimize the .nrm file for fast normalization,\n"
120            "\t                    which might increase its size  (Writes fully decomposed\n"
121            "\t                    regular mappings instead of delta mappings.\n"
122            "\t                    You should measure the runtime speed to make sure that\n"
123            "\t                    this is a good trade-off.)\n");
124        return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
125    }
126
127    beVerbose=options[VERBOSE].doesOccur;
128    haveCopyright=options[COPYRIGHT].doesOccur;
129
130    IcuToolErrorCode errorCode("gennorm2/main()");
131
132#if UCONFIG_NO_NORMALIZATION
133
134    fprintf(stderr,
135        "gennorm2 writes a dummy binary data file "
136        "because UCONFIG_NO_NORMALIZATION is set, \n"
137        "see icu/source/common/unicode/uconfig.h\n");
138    udata_createDummy(NULL, NULL, options[OUTPUT_FILENAME].value, errorCode);
139    // Should not return an error since this is the expected behaviour if UCONFIG_NO_NORMALIZATION is on.
140    // return U_UNSUPPORTED_ERROR;
141    return 0;
142
143#else
144
145    LocalPointer<Normalizer2DataBuilder> builder(new Normalizer2DataBuilder(errorCode));
146    errorCode.assertSuccess();
147
148    builder->setUnicodeVersion(options[UNICODE_VERSION].value);
149
150    if(options[OPT_FAST].doesOccur) {
151        builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST);
152    }
153
154    // prepare the filename beginning with the source dir
155    CharString filename(options[SOURCEDIR].value, errorCode);
156    int32_t pathLength=filename.length();
157    if( pathLength>0 &&
158        filename[pathLength-1]!=U_FILE_SEP_CHAR &&
159        filename[pathLength-1]!=U_FILE_ALT_SEP_CHAR
160    ) {
161        filename.append(U_FILE_SEP_CHAR, errorCode);
162        pathLength=filename.length();
163    }
164
165    for(int i=1; i<argc; ++i) {
166        printf("gennorm2: processing %s\n", argv[i]);
167        filename.append(argv[i], errorCode);
168        LocalStdioFilePointer f(fopen(filename.data(), "r"));
169        if(f==NULL) {
170            fprintf(stderr, "gennorm2 error: unable to open %s\n", filename.data());
171            exit(U_FILE_ACCESS_ERROR);
172        }
173        builder->setOverrideHandling(Normalizer2DataBuilder::OVERRIDE_PREVIOUS);
174        parseFile(f.getAlias(), *builder);
175        filename.truncate(pathLength);
176    }
177
178    builder->writeBinaryFile(options[OUTPUT_FILENAME].value);
179
180    return errorCode.get();
181
182#endif
183}
184
185#if !UCONFIG_NO_NORMALIZATION
186
187void parseFile(FILE *f, Normalizer2DataBuilder &builder) {
188    IcuToolErrorCode errorCode("gennorm2/parseFile()");
189    char line[300];
190    uint32_t startCP, endCP;
191    while(NULL!=fgets(line, (int)sizeof(line), f)) {
192        char *comment=(char *)strchr(line, '#');
193        if(comment!=NULL) {
194            *comment=0;
195        }
196        u_rtrim(line);
197        if(line[0]==0) {
198            continue;  // skip empty and comment-only lines
199        }
200        if(line[0]=='*') {
201            continue;  // reserved syntax
202        }
203        const char *delimiter;
204        int32_t rangeLength=
205            u_parseCodePointRangeAnyTerminator(line, &startCP, &endCP, &delimiter, errorCode);
206        if(errorCode.isFailure()) {
207            fprintf(stderr, "gennorm2 error: parsing code point range from %s\n", line);
208            exit(errorCode.reset());
209        }
210        delimiter=u_skipWhitespace(delimiter);
211        if(*delimiter==':') {
212            const char *s=u_skipWhitespace(delimiter+1);
213            char *end;
214            unsigned long value=strtoul(s, &end, 10);
215            if(end<=s || *u_skipWhitespace(end)!=0 || value>=0xff) {
216                fprintf(stderr, "gennorm2 error: parsing ccc from %s\n", line);
217                exit(U_PARSE_ERROR);
218            }
219            for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
220                builder.setCC(c, (uint8_t)value);
221            }
222            continue;
223        }
224        if(*delimiter=='-') {
225            if(*u_skipWhitespace(delimiter+1)!=0) {
226                fprintf(stderr, "gennorm2 error: parsing remove-mapping %s\n", line);
227                exit(U_PARSE_ERROR);
228            }
229            for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
230                builder.removeMapping(c);
231            }
232            continue;
233        }
234        if(*delimiter=='=' || *delimiter=='>') {
235            UChar uchars[Normalizer2Impl::MAPPING_LENGTH_MASK];
236            int32_t length=u_parseString(delimiter+1, uchars, LENGTHOF(uchars), NULL, errorCode);
237            if(errorCode.isFailure()) {
238                fprintf(stderr, "gennorm2 error: parsing mapping string from %s\n", line);
239                exit(errorCode.reset());
240            }
241            UnicodeString mapping(FALSE, uchars, length);
242            if(*delimiter=='=') {
243                if(rangeLength!=1) {
244                    fprintf(stderr,
245                            "gennorm2 error: round-trip mapping for more than 1 code point on %s\n",
246                            line);
247                    exit(U_PARSE_ERROR);
248                }
249                builder.setRoundTripMapping((UChar32)startCP, mapping);
250            } else {
251                for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
252                    builder.setOneWayMapping(c, mapping);
253                }
254            }
255            continue;
256        }
257        fprintf(stderr, "gennorm2 error: unrecognized data line %s\n", line);
258        exit(U_PARSE_ERROR);
259    }
260}
261
262#endif // !UCONFIG_NO_NORMALIZATION
263
264U_NAMESPACE_END
265
266/*
267 * Hey, Emacs, please set the following:
268 *
269 * Local Variables:
270 * indent-tabs-mode: nil
271 * End:
272 *
273 */
274