1/*
2**********************************************************************
3*   Copyright (C) 2002-2012, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5**********************************************************************
6*
7* File gendict.cpp
8*/
9
10#include "unicode/utypes.h"
11#include "unicode/uchar.h"
12#include "unicode/ucnv.h"
13#include "unicode/uniset.h"
14#include "unicode/unistr.h"
15#include "unicode/uclean.h"
16#include "unicode/udata.h"
17#include "unicode/putil.h"
18#include "unicode/ucharstriebuilder.h"
19#include "unicode/bytestriebuilder.h"
20#include "unicode/ucharstrie.h"
21#include "unicode/bytestrie.h"
22#include "unicode/ucnv.h"
23#include "unicode/utf16.h"
24
25#include "charstr.h"
26#include "dictionarydata.h"
27#include "uoptions.h"
28#include "unewdata.h"
29#include "cmemory.h"
30#include "uassert.h"
31#include "ucbuf.h"
32#include "toolutil.h"
33#include "cstring.h"
34
35#include <stdio.h>
36#include <stdlib.h>
37#include <string.h>
38
39#include "putilimp.h"
40UDate startTime = -1.0;
41
42static int elapsedTime() {
43  return (int)uprv_floor((uprv_getRawUTCtime()-startTime)/1000.0);
44}
45
46#if U_PLATFORM_IMPLEMENTS_POSIX && !U_PLATFORM_HAS_WIN32_API
47#include <signal.h>
48#include <unistd.h>
49
// Tool name and output file name printed by the watchdog's progress notice.
// These are placeholder defaults; install_watchdog() overwrites both.
const char *wToolname="gendict";
const char *wOutname="(some file)";

const int firstSeconds = 5; /* seconds before the first notice (used by install_watchdog) */
const int nextSeconds = 15; /* seconds between subsequent notices (used by alarm_fn) */
55
56static void alarm_fn(int /*n*/) {
57  printf("%s: still writing\t%s (%ds)\t...\n",    wToolname, wOutname, elapsedTime());
58
59  signal(SIGALRM, &alarm_fn);
60  alarm(nextSeconds); // reset the alarm
61}
62
63static void install_watchdog(const char *toolName, const char *outFileName) {
64  wToolname=toolName;
65  wOutname=outFileName;
66
67  if(startTime<0) { // uninitialized
68    startTime = uprv_getRawUTCtime();
69  }
70  signal(SIGALRM, &alarm_fn);
71
72  alarm(firstSeconds); // set the alarm
73}
74
75#else
76static void install_watchdog(const char*, const char*) {
77  // not implemented
78}
79#endif
80
81
82
83
84U_NAMESPACE_USE
85
// argv[0], set at the top of main() and used in usage and progress messages.
static char *progName;
// Command line options understood by this tool. The position of each entry
// must match the corresponding enumerator in "enum arguments", which is what
// the rest of the file uses to index this array.
static UOption options[]={
    UOPTION_HELP_H,             /* 0 */
    UOPTION_HELP_QUESTION_MARK, /* 1 */
    UOPTION_VERBOSE,            /* 2 */
    UOPTION_ICUDATADIR,         /* 3 */
    UOPTION_COPYRIGHT,          /* 4 */
    { "uchars", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0}, /* 5 */
    { "bytes", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0}, /* 6 */
    { "transform", NULL, NULL, NULL, '\1', UOPT_REQUIRES_ARG, 0}, /* 7 */
};
97
// Indexes into options[]; each value is the slot of the matching entry there,
// so the two lists must be kept in the same order.
enum arguments {
    ARG_HELP       = 0,  // -h / --help
    ARG_QMARK      = 1,  // -?
    ARG_VERBOSE    = 2,  // -v / --verbose
    ARG_ICUDATADIR = 3,  // -i / --icudatadir
    ARG_COPYRIGHT  = 4,  // -c / --copyright
    ARG_UCHARS     = 5,  // --uchars
    ARG_BYTES      = 6,  // --bytes
    ARG_TRANSFORM  = 7   // --transform
};
108
109// prints out the standard usage method describing command line arguments,
110// then bails out with the desired exit code
111static void usageAndDie(UErrorCode retCode) {
112    fprintf((U_SUCCESS(retCode) ? stdout : stderr), "Usage: %s -trietype [-options] input-dictionary-file output-file\n", progName);
113    fprintf((U_SUCCESS(retCode) ? stdout : stderr),
114           "\tRead in a word list and write out a string trie dictionary\n"
115           "options:\n"
116           "\t-h or -? or --help  this usage text\n"
117           "\t-V or --version     show a version message\n"
118           "\t-c or --copyright   include a copyright notice\n"
119           "\t-v or --verbose     turn on verbose output\n"
120           "\t-i or --icudatadir  directory for locating any needed intermediate data files,\n" // TODO: figure out if we need this option
121           "\t                    followed by path, defaults to %s\n"
122           "\t--uchars            output a UCharsTrie (mutually exclusive with -b!)\n"
123           "\t--bytes             output a BytesTrie (mutually exclusive with -u!)\n"
124           "\t--transform         the kind of transform to use (eg --transform offset-40A3,\n"
125           "\t                    which specifies an offset transform with constant 0x40A3)\n",
126            u_getDataDirectory());
127    exit(retCode);
128}
129
130
/*
 * UDataInfo cf. udata.h
 * Header describing the generated data file; main() passes it to
 * udata_create(). The per-field notes below follow the UDataInfo layout
 * declared in udata.h.
 */
static UDataInfo dataInfo = {
    sizeof(UDataInfo),              /* size of this structure */
    0,                              /* reserved */

    U_IS_BIG_ENDIAN,                /* byte order of the build platform */
    U_CHARSET_FAMILY,               /* charset family of the build platform */
    U_SIZEOF_UCHAR,                 /* size of a UChar */
    0,                              /* reserved */

    { 0x44, 0x69, 0x63, 0x74 },     /* "Dict" */
    { 1, 0, 0, 0 },                 /* format version */
    { 0, 0, 0, 0 }                  /* data version */
};
145
146#if !UCONFIG_NO_BREAK_ITERATION
147
148// A wrapper for both BytesTrieBuilder and UCharsTrieBuilder.
149// may want to put this somewhere in ICU, as it could be useful outside
150// of this tool?
151class DataDict {
152private:
153    BytesTrieBuilder *bt;
154    UCharsTrieBuilder *ut;
155    UChar32 transformConstant;
156    int32_t transformType;
157public:
158    // constructs a new data dictionary. if there is an error,
159    // it will be returned in status
160    // isBytesTrie != 0 will produce a BytesTrieBuilder,
161    // isBytesTrie == 0 will produce a UCharsTrieBuilder
162    DataDict(UBool isBytesTrie, UErrorCode &status) : bt(NULL), ut(NULL),
163        transformConstant(0), transformType(DictionaryData::TRANSFORM_NONE) {
164        if (isBytesTrie) {
165            bt = new BytesTrieBuilder(status);
166        } else {
167            ut = new UCharsTrieBuilder(status);
168        }
169    }
170
171    ~DataDict() {
172        delete bt;
173        delete ut;
174    }
175
176private:
177    char transform(UChar32 c, UErrorCode &status) {
178        if (transformType == DictionaryData::TRANSFORM_TYPE_OFFSET) {
179            if (c == 0x200D) { return (char)0xFF; }
180            else if (c == 0x200C) { return (char)0xFE; }
181            int32_t delta = c - transformConstant;
182            if (delta < 0 || 0xFD < delta) {
183                fprintf(stderr, "Codepoint U+%04lx out of range for --transform offset-%04lx!\n",
184                        (long)c, (long)transformConstant);
185                exit(U_ILLEGAL_ARGUMENT_ERROR); // TODO: should return and print the line number
186            }
187            return (char)delta;
188        } else { // no such transform type
189            status = U_INTERNAL_PROGRAM_ERROR;
190            return (char)c; // it should be noted this transform type will not generally work
191        }
192    }
193
194    void transform(const UnicodeString &word, CharString &buf, UErrorCode &errorCode) {
195        UChar32 c = 0;
196        int32_t len = word.length();
197        for (int32_t i = 0; i < len; i += U16_LENGTH(c)) {
198            c = word.char32At(i);
199            buf.append(transform(c, errorCode), errorCode);
200        }
201    }
202
203public:
204    // sets the desired transformation data.
205    // should be populated from a command line argument
206    // so far the only acceptable format is offset-<hex constant>
207    // eventually others (mask-<hex constant>?) may be enabled
208    // more complex functions may be more difficult
209    void setTransform(const char *t) {
210        if (strncmp(t, "offset-", 7) == 0) {
211            char *end;
212            unsigned long base = uprv_strtoul(t + 7, &end, 16);
213            if (end == (t + 7) || *end != 0 || base > 0x10FF80) {
214                fprintf(stderr, "Syntax for offset value in --transform offset-%s invalid!\n", t + 7);
215                usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
216            }
217            transformType = DictionaryData::TRANSFORM_TYPE_OFFSET;
218            transformConstant = (UChar32)base;
219        }
220        else {
221            fprintf(stderr, "Invalid transform specified: %s\n", t);
222            usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
223        }
224    }
225
226    // add a word to the trie
227    void addWord(const UnicodeString &word, int32_t value, UErrorCode &status) {
228        if (bt) {
229            CharString buf;
230            transform(word, buf, status);
231            bt->add(buf.toStringPiece(), value, status);
232        }
233        if (ut) { ut->add(word, value, status); }
234    }
235
236    // if we are a bytestrie, give back the StringPiece representing the serialized version of us
237    StringPiece serializeBytes(UErrorCode &status) {
238        return bt->buildStringPiece(USTRINGTRIE_BUILD_SMALL, status);
239    }
240
241    // if we are a ucharstrie, produce the UnicodeString representing the serialized version of us
242    void serializeUChars(UnicodeString &s, UErrorCode &status) {
243        ut->buildUnicodeString(USTRINGTRIE_BUILD_SMALL, s, status);
244    }
245
246    int32_t getTransform() {
247        return (int32_t)(transformType | transformConstant);
248    }
249};
250#endif
251
252static const UChar LINEFEED_CHARACTER = 0x000A;
253static const UChar CARRIAGE_RETURN_CHARACTER = 0x000D;
254
255static UBool readLine(UCHARBUF *f, UnicodeString &fileLine, IcuToolErrorCode &errorCode) {
256    int32_t lineLength;
257    const UChar *line = ucbuf_readline(f, &lineLength, errorCode);
258    if(line == NULL || errorCode.isFailure()) { return FALSE; }
259    // Strip trailing CR/LF, comments, and spaces.
260    const UChar *comment = u_memchr(line, 0x23, lineLength);  // '#'
261    if(comment != NULL) {
262        lineLength = (int32_t)(comment - line);
263    } else {
264        while(lineLength > 0 && (line[lineLength - 1] == CARRIAGE_RETURN_CHARACTER || line[lineLength - 1] == LINEFEED_CHARACTER)) { --lineLength; }
265    }
266    while(lineLength > 0 && u_isspace(line[lineLength - 1])) { --lineLength; }
267    fileLine.setTo(FALSE, line, lineLength);
268    return TRUE;
269}
270
271//----------------------------------------------------------------------------
272//
273//  main      for gendict
274//
275//----------------------------------------------------------------------------
276int  main(int argc, char **argv) {
277    //
278    // Pick up and check the command line arguments,
279    //    using the standard ICU tool utils option handling.
280    //
281    U_MAIN_INIT_ARGS(argc, argv);
282    progName = argv[0];
283    argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
284    if(argc<0) {
285        // Unrecognized option
286        fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]);
287        usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
288    }
289
290    if(options[ARG_HELP].doesOccur || options[ARG_QMARK].doesOccur) {
291        //  -? or -h for help.
292        usageAndDie(U_ZERO_ERROR);
293    }
294
295    UBool verbose = options[ARG_VERBOSE].doesOccur;
296
297    if (argc < 3) {
298        fprintf(stderr, "input and output file must both be specified.\n");
299        usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
300    }
301    const char *outFileName  = argv[2];
302    const char *wordFileName = argv[1];
303
304    // set up the watchdog
305    install_watchdog(progName, outFileName);
306
307    if (options[ARG_ICUDATADIR].doesOccur) {
308        u_setDataDirectory(options[ARG_ICUDATADIR].value);
309    }
310
311    const char *copyright = NULL;
312    if (options[ARG_COPYRIGHT].doesOccur) {
313        copyright = U_COPYRIGHT_STRING;
314    }
315
316    if (options[ARG_UCHARS].doesOccur == options[ARG_BYTES].doesOccur) {
317        fprintf(stderr, "you must specify exactly one type of trie to output!\n");
318        usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
319    }
320    UBool isBytesTrie = options[ARG_BYTES].doesOccur;
321    if (isBytesTrie != options[ARG_TRANSFORM].doesOccur) {
322        fprintf(stderr, "you must provide a transformation for a bytes trie, and must not provide one for a uchars trie!\n");
323        usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
324    }
325
326    IcuToolErrorCode status("gendict/main()");
327
328#if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO
329    const char* outDir=NULL;
330
331    UNewDataMemory *pData;
332    char msg[1024];
333    UErrorCode tempstatus = U_ZERO_ERROR;
334
335    /* write message with just the name */ // potential for a buffer overflow here...
336    sprintf(msg, "gendict writes dummy %s because of UCONFIG_NO_BREAK_ITERATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName);
337    fprintf(stderr, "%s\n", msg);
338
339    /* write the dummy data file */
340    pData = udata_create(outDir, NULL, outFileName, &dataInfo, NULL, &tempstatus);
341    udata_writeBlock(pData, msg, strlen(msg));
342    udata_finish(pData, &tempstatus);
343    return (int)tempstatus;
344
345#else
346    //  Read in the dictionary source file
347    if (verbose) { printf("Opening file %s...\n", wordFileName); }
348    const char *codepage = "UTF-8";
349    UCHARBUF *f = ucbuf_open(wordFileName, &codepage, TRUE, FALSE, status);
350    if (status.isFailure()) {
351        fprintf(stderr, "error opening input file: ICU Error \"%s\"\n", status.errorName());
352        exit(status.reset());
353    }
354    if (verbose) { printf("Initializing dictionary builder of type %s...\n", (isBytesTrie ? "BytesTrie" : "UCharsTrie")); }
355    DataDict dict(isBytesTrie, status);
356    if (status.isFailure()) {
357        fprintf(stderr, "new DataDict: ICU Error \"%s\"\n", status.errorName());
358        exit(status.reset());
359    }
360    if (options[ARG_TRANSFORM].doesOccur) {
361        dict.setTransform(options[ARG_TRANSFORM].value);
362    }
363
364    UnicodeString fileLine;
365    if (verbose) { puts("Adding words to dictionary..."); }
366    UBool hasValues = FALSE;
367    UBool hasValuelessContents = FALSE;
368    int lineCount = 0;
369    UBool isOk = TRUE;
370    while (readLine(f, fileLine, status)) {
371        lineCount++;
372        if (fileLine.isEmpty()) continue;
373
374        // Parse word [spaces value].
375        int32_t keyLen;
376        for (keyLen = 0; keyLen < fileLine.length() && !u_isspace(fileLine[keyLen]); ++keyLen) {}
377        if (keyLen == 0) {
378            fprintf(stderr, "Error: no word on line %i!\n", lineCount);
379            isOk = FALSE;
380            continue;
381        }
382        int32_t valueStart;
383        for (valueStart = keyLen;
384            valueStart < fileLine.length() && u_isspace(fileLine[valueStart]);
385            ++valueStart) {}
386
387        if (keyLen < valueStart) {
388            int32_t valueLength = fileLine.length() - valueStart;
389            if (valueLength > 15) {
390                fprintf(stderr, "Error: value too long on line %i!\n", lineCount);
391                isOk = FALSE;
392                continue;
393            }
394            char s[16];
395            fileLine.extract(valueStart, valueLength, s, 16, US_INV);
396            char *end;
397            unsigned long value = uprv_strtoul(s, &end, 0);
398            if (end == s || *end != 0 || (int32_t)uprv_strlen(s) != valueLength || value > 0xffffffff) {
399                fprintf(stderr, "Error: value syntax error or value too large on line %i!\n", lineCount);
400                isOk = FALSE;
401                continue;
402            }
403            dict.addWord(fileLine.tempSubString(0, keyLen), (int32_t)value, status);
404            hasValues = TRUE;
405        } else {
406            dict.addWord(fileLine.tempSubString(0, keyLen), 0, status);
407            hasValuelessContents = FALSE;
408        }
409
410        if (status.isFailure()) {
411            fprintf(stderr, "ICU Error \"%s\": Failed to add word to trie at input line %d in input file\n",
412                status.errorName(), lineCount);
413            exit(status.reset());
414        }
415    }
416
417    if (!isOk && status.isSuccess()) {
418        status.set(U_ILLEGAL_ARGUMENT_ERROR);
419    }
420    if (hasValues && hasValuelessContents) {
421        fprintf(stderr, "warning: file contained both valued and unvalued strings!\n");
422    }
423
424    if (verbose) { puts("Serializing data..."); }
425    int32_t outDataSize;
426    const void *outData;
427    UnicodeString usp;
428    if (isBytesTrie) {
429        StringPiece sp = dict.serializeBytes(status);
430        outDataSize = sp.size();
431        outData = sp.data();
432    } else {
433        dict.serializeUChars(usp, status);
434        outDataSize = usp.length() * U_SIZEOF_UCHAR;
435        outData = usp.getBuffer();
436    }
437    if (status.isFailure()) {
438        fprintf(stderr, "gendict: got failure of type %s while serializing\n", status.errorName());
439        exit(status.reset());
440    }
441    if (verbose) { puts("Opening output file..."); }
442    UNewDataMemory *pData = udata_create(NULL, NULL, outFileName, &dataInfo, copyright, status);
443    if (status.isFailure()) {
444        fprintf(stderr, "gendict: could not open output file \"%s\", \"%s\"\n", outFileName, status.errorName());
445        exit(status.reset());
446    }
447
448    if (verbose) { puts("Writing to output file..."); }
449    int32_t indexes[DictionaryData::IX_COUNT] = {
450        DictionaryData::IX_COUNT * sizeof(int32_t), 0, 0, 0, 0, 0, 0, 0
451    };
452    int32_t size = outDataSize + indexes[DictionaryData::IX_STRING_TRIE_OFFSET];
453    indexes[DictionaryData::IX_RESERVED1_OFFSET] = size;
454    indexes[DictionaryData::IX_RESERVED2_OFFSET] = size;
455    indexes[DictionaryData::IX_TOTAL_SIZE] = size;
456
457    indexes[DictionaryData::IX_TRIE_TYPE] = isBytesTrie ? DictionaryData::TRIE_TYPE_BYTES : DictionaryData::TRIE_TYPE_UCHARS;
458    if (hasValues) {
459        indexes[DictionaryData::IX_TRIE_TYPE] |= DictionaryData::TRIE_HAS_VALUES;
460    }
461
462    indexes[DictionaryData::IX_TRANSFORM] = dict.getTransform();
463    udata_writeBlock(pData, indexes, sizeof(indexes));
464    udata_writeBlock(pData, outData, outDataSize);
465    size_t bytesWritten = udata_finish(pData, status);
466    if (status.isFailure()) {
467        fprintf(stderr, "gendict: error \"%s\" writing the output file\n", status.errorName());
468        exit(status.reset());
469    }
470
471    if (bytesWritten != (size_t)size) {
472        fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName);
473        exit(U_INTERNAL_PROGRAM_ERROR);
474    }
475
476    printf("%s: done writing\t%s (%ds).\n", progName, outFileName, elapsedTime());
477
478#ifdef TEST_GENDICT
479    if (isBytesTrie) {
480        BytesTrie::Iterator it(outData, outDataSize, status);
481        while (it.hasNext()) {
482            it.next(status);
483            const StringPiece s = it.getString();
484            int32_t val = it.getValue();
485            printf("%s -> %i\n", s.data(), val);
486        }
487    } else {
488        UCharsTrie::Iterator it((const UChar *)outData, outDataSize, status);
489        while (it.hasNext()) {
490            it.next(status);
491            const UnicodeString s = it.getString();
492            int32_t val = it.getValue();
493            char tmp[1024];
494            s.extract(0, s.length(), tmp, 1024);
495            printf("%s -> %i\n", tmp, val);
496        }
497    }
498#endif
499
500    return 0;
501#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
502}
503