1/*
2**********************************************************************
3*   Copyright (C) 2009-2015, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5**********************************************************************
6*
7* File gencfu.c
8*/
9
10//--------------------------------------------------------------------
11//
//   Tool for generating Unicode Confusable data files (.cfu files).
//   .cfu files contain the compiled form of the confusable data
//   derived from the Unicode Consortium data described in
//   Unicode UAX 39.
16//
17//   Usage:  gencfu [options] -r confusables-file.txt -w whole-script-confusables.txt  -o output-file.cfu
18//
19//       options:   -v         verbose
20//                  -? or -h   help
21//
//   The input rule files are plain text files containing confusable character
//    definitions in the input format defined by Unicode UAX39 for the files
//    confusables.txt and confusablesWholeScript.txt.  This source (.txt) format
//    is also accepted directly by ICU spoof detectors.  The
//    files must be encoded in utf-8 format, with or without a BOM.
27//
28//--------------------------------------------------------------------
29
30#include "unicode/utypes.h"
31#include "unicode/unistr.h"
32#include "unicode/uclean.h"
33#include "unicode/udata.h"
34#include "unicode/putil.h"
35
36#include "uoptions.h"
37#include "unewdata.h"
38#include "ucmndata.h"
39#include "uspoof_impl.h"
40#include "cmemory.h"
41
42#include <stdio.h>
43#include <stdlib.h>
44#include <string.h>
45
46U_NAMESPACE_USE
47
48static char *progName;
49static UOption options[]={
50    UOPTION_HELP_H,             /* 0 */
51    UOPTION_HELP_QUESTION_MARK, /* 1 */
52    UOPTION_VERBOSE,            /* 2 */
53    { "rules", NULL, NULL, NULL, 'r', UOPT_REQUIRES_ARG, 0 },   /* 3 */
54    { "wsrules", NULL, NULL, NULL, 'w', UOPT_REQUIRES_ARG, 0},  /* 4 */
55    { "out",   NULL, NULL, NULL, 'o', UOPT_REQUIRES_ARG, 0 },   /* 5 */
56    UOPTION_ICUDATADIR,         /* 6 */
57    UOPTION_DESTDIR,            /* 7 */
58    UOPTION_COPYRIGHT,          /* 8 */
59    UOPTION_QUIET,              /* 9 */
60};
61
62void usageAndDie(int retCode) {
63        printf("Usage: %s [-v] [-options] -r confusablesRules.txt -w wholeScriptConfusables.txt -o output-file\n", progName);
64        printf("\tRead in Unicode confusable character definitions and write out the binary data\n"
65            "options:\n"
66            "\t-h or -? or --help  this usage text\n"
67            "\t-V or --version     show a version message\n"
68            "\t-c or --copyright   include a copyright notice\n"
69            "\t-v or --verbose     turn on verbose output\n"
70            "\t-q or --quiet       do not display warnings and progress\n"
71            "\t-i or --icudatadir  directory for locating any needed intermediate data files,\n"
72            "\t                    followed by path, defaults to %s\n"
73            "\t-d or --destdir     destination directory, followed by the path\n",
74            u_getDataDirectory());
75        exit (retCode);
76}
77
78
79#if UCONFIG_NO_REGULAR_EXPRESSIONS || UCONFIG_NO_NORMALIZATION || UCONFIG_NO_FILE_IO
80
81/* dummy UDataInfo cf. udata.h */
82static UDataInfo dummyDataInfo = {
83    sizeof(UDataInfo),
84    0,
85
86    U_IS_BIG_ENDIAN,
87    U_CHARSET_FAMILY,
88    U_SIZEOF_UCHAR,
89    0,
90
91    { 0, 0, 0, 0 },                 /* dummy dataFormat */
92    { 0, 0, 0, 0 },                 /* dummy formatVersion */
93    { 0, 0, 0, 0 }                  /* dummy dataVersion */
94};
95
96#else
97
98//
99//  Set up the ICU data header, defined in ucmndata.h
100//
101DataHeader dh ={
102    {sizeof(DataHeader),           // Struct MappedData
103        0xda,
104        0x27},
105
106    {                               // struct UDataInfo
107        sizeof(UDataInfo),          //     size
108        0,                          //     reserved
109        U_IS_BIG_ENDIAN,
110        U_CHARSET_FAMILY,
111        U_SIZEOF_UCHAR,
112        0,                          //     reserved
113
114    { 0x43, 0x66, 0x75, 0x20 },     //     dataFormat="Cfu "
115    { 0xff, 0, 0, 0 },              //     formatVersion.  Filled in later with values
116                                    //      from the  builder.  The  values declared
117                                    //      here should never appear in any real data.
118        { 5, 1, 0, 0 }              //   dataVersion (Unicode version)
119    }};
120
121#endif
122
123// Forward declaration for function for reading source files.
124static const char *readFile(const char *fileName, int32_t *len);
125
126//----------------------------------------------------------------------------
127//
128//  main      for gencfu
129//
130//----------------------------------------------------------------------------
131int  main(int argc, char **argv) {
132    UErrorCode  status = U_ZERO_ERROR;
133    const char *confFileName;
134    const char *confWSFileName;
135    const char *outFileName;
136    const char *outDir = NULL;
137    const char *copyright = NULL;
138
139    //
140    // Pick up and check the command line arguments,
141    //    using the standard ICU tool utils option handling.
142    //
143    U_MAIN_INIT_ARGS(argc, argv);
144    progName = argv[0];
145    argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
146    if(argc<0) {
147        // Unrecognized option
148        fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]);
149        usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
150    }
151
152    if(options[0].doesOccur || options[1].doesOccur) {
153        //  -? or -h for help.
154        usageAndDie(0);
155    }
156
157    if (!(options[3].doesOccur && options[4].doesOccur && options[5].doesOccur)) {
158        fprintf(stderr, "confusables file, whole script confusables file and output file must all be specified.\n");
159        usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
160    }
161    confFileName   = options[3].value;
162    confWSFileName = options[4].value;
163    outFileName    = options[5].value;
164
165    if (options[6].doesOccur) {
166        u_setDataDirectory(options[6].value);
167    }
168
169    status = U_ZERO_ERROR;
170
171    /* Combine the directory with the file name */
172    if(options[7].doesOccur) {
173        outDir = options[7].value;
174    }
175    if (options[8].doesOccur) {
176        copyright = U_COPYRIGHT_STRING;
177    }
178
179    UBool quiet = FALSE;
180    if (options[9].doesOccur) {
181      quiet = TRUE;
182    }
183
184#if UCONFIG_NO_REGULAR_EXPRESSIONS || UCONFIG_NO_NORMALIZATION || UCONFIG_NO_FILE_IO
185    // spoof detection data file parsing is dependent on regular expressions.
186    // TODO: have the tool return an error status.  Requires fixing the ICU data build
187    //       so that it doesn't abort entirely on that error.
188
189    UNewDataMemory *pData;
190    char msg[1024];
191
192    /* write message with just the name */
193    sprintf(msg, "gencfu writes dummy %s because of UCONFIG_NO_REGULAR_EXPRESSIONS and/or UCONFIG_NO_NORMALIZATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName);
194    fprintf(stderr, "%s\n", msg);
195
196    /* write the dummy data file */
197    pData = udata_create(outDir, NULL, outFileName, &dummyDataInfo, NULL, &status);
198    udata_writeBlock(pData, msg, strlen(msg));
199    udata_finish(pData, &status);
200    return (int)status;
201
202#else
203    /* Initialize ICU */
204    u_init(&status);
205    if (U_FAILURE(status)) {
206        fprintf(stderr, "%s: can not initialize ICU.  status = %s\n",
207            argv[0], u_errorName(status));
208        exit(1);
209    }
210    status = U_ZERO_ERROR;
211
212    //  Read in the confusables source file
213
214    int32_t      confusablesLen = 0;
215    const char  *confusables = readFile(confFileName, &confusablesLen);
216    if (confusables == NULL) {
217        printf("gencfu: error reading file  \"%s\"\n", confFileName);
218        exit(-1);
219    }
220
221    int32_t     wsConfusablesLen = 0;
222    const char *wsConfsables =  readFile(confWSFileName, &wsConfusablesLen);
223    if (wsConfsables == NULL) {
224        printf("gencfu: error reading file  \"%s\"\n", confFileName);
225        exit(-1);
226    }
227
228    //
229    //  Create the Spoof Detector from the source confusables files.
230    //     This will compile the data.
231    //
232    UParseError parseError;
233    parseError.line = 0;
234    parseError.offset = 0;
235    int32_t errType;
236    USpoofChecker *sc = uspoof_openFromSource(confusables, confusablesLen,
237                                              wsConfsables, wsConfusablesLen,
238                                              &errType, &parseError, &status);
239    if (U_FAILURE(status)) {
240        const char *errFile =
241            (errType == USPOOF_WHOLE_SCRIPT_CONFUSABLE)? confWSFileName : confFileName;
242        fprintf(stderr, "gencfu: uspoof_openFromSource error \"%s\"  at file %s, line %d, column %d\n",
243                u_errorName(status), errFile, (int)parseError.line, (int)parseError.offset);
244        exit(status);
245    };
246
247
248    //
249    //  Get the compiled rule data from the USpoofChecker.
250    //
251    uint32_t        outDataSize;
252    uint8_t        *outData;
253    outDataSize = uspoof_serialize(sc, NULL, 0, &status);
254    if (status != U_BUFFER_OVERFLOW_ERROR) {
255        fprintf(stderr, "gencfu: uspoof_serialize() returned %s\n", u_errorName(status));
256        exit(status);
257    }
258    status = U_ZERO_ERROR;
259    outData = new uint8_t[outDataSize];
260    uspoof_serialize(sc, outData, outDataSize, &status);
261
262    // Copy the data format version numbers from the spoof data header into the UDataMemory header.
263
264    uprv_memcpy(dh.info.formatVersion,
265                reinterpret_cast<SpoofDataHeader *>(outData)->fFormatVersion,
266                sizeof(dh.info.formatVersion));
267
268    //
269    //  Create the output file
270    //
271    size_t bytesWritten;
272    UNewDataMemory *pData;
273    pData = udata_create(outDir, NULL, outFileName, &(dh.info), copyright, &status);
274    if(U_FAILURE(status)) {
275        fprintf(stderr, "gencfu: Could not open output file \"%s\", \"%s\"\n",
276                         outFileName, u_errorName(status));
277        exit(status);
278    }
279
280
281    //  Write the data itself.
282    udata_writeBlock(pData, outData, outDataSize);
283    // finish up
284    bytesWritten = udata_finish(pData, &status);
285    if(U_FAILURE(status)) {
286        fprintf(stderr, "gencfu: Error %d writing the output file\n", status);
287        exit(status);
288    }
289
290    if (bytesWritten != outDataSize) {
291        fprintf(stderr, "gencfu: Error writing to output file \"%s\"\n", outFileName);
292        exit(-1);
293    }
294
295    uspoof_close(sc);
296    delete [] outData;
297    delete [] confusables;
298    delete [] wsConfsables;
299    u_cleanup();
300    if (!quiet) {
301        printf("gencfu: tool completed successfully.\n");
302    }
303    return 0;
304#endif   // UCONFIG_NO_REGULAR_EXPRESSIONS
305}
306
307
308 //
309 //  Read in a confusables source file
310 //
311 static const char *readFile(const char *fileName, int32_t *len) {
312    char       *result;
313    long        fileSize;
314    FILE        *file;
315
316    file = fopen(fileName, "rb");
317    if( file == 0 ) {
318        return NULL;
319    }
320    fseek(file, 0, SEEK_END);
321    fileSize = ftell(file);
322    fseek(file, 0, SEEK_SET);
323    result = new char[fileSize+10];
324    if (result==NULL) {
325        fclose(file);
326        return NULL;
327    }
328
329    long t = fread(result, 1, fileSize, file);
330    if (t != fileSize)  {
331        delete [] result;
332        fclose(file);
333        return NULL;
334    }
335    result[fileSize]=0;
336    *len = static_cast<int32_t>(fileSize);
337    fclose(file);
338    return result;
339 }
340