1/* 2********************************************************************** 3* Copyright (C) 2002-2013, International Business Machines 4* Corporation and others. All Rights Reserved. 5********************************************************************** 6* 7* File gendict.cpp 8*/ 9 10#include "unicode/utypes.h" 11#include "unicode/uchar.h" 12#include "unicode/ucnv.h" 13#include "unicode/uniset.h" 14#include "unicode/unistr.h" 15#include "unicode/uclean.h" 16#include "unicode/udata.h" 17#include "unicode/putil.h" 18#include "unicode/ucharstriebuilder.h" 19#include "unicode/bytestriebuilder.h" 20#include "unicode/ucharstrie.h" 21#include "unicode/bytestrie.h" 22#include "unicode/ucnv.h" 23#include "unicode/utf16.h" 24 25#include "charstr.h" 26#include "dictionarydata.h" 27#include "uoptions.h" 28#include "unewdata.h" 29#include "cmemory.h" 30#include "uassert.h" 31#include "ucbuf.h" 32#include "toolutil.h" 33#include "cstring.h" 34 35#include <stdio.h> 36#include <stdlib.h> 37#include <string.h> 38 39#include "putilimp.h" 40UDate startTime; 41 42static int elapsedTime() { 43 return (int)uprv_floor((uprv_getRawUTCtime()-startTime)/1000.0); 44} 45 46#if U_PLATFORM_IMPLEMENTS_POSIX && !U_PLATFORM_HAS_WIN32_API 47 48#include <signal.h> 49#include <unistd.h> 50 51const char *wToolname="gendict"; 52const char *wOutname="(some file)"; 53 54const int firstSeconds = 5; /* seconds between notices*/ 55const int nextSeconds = 15; /* seconds between notices*/ 56 57static void alarm_fn(int /*n*/) { 58 printf("%s: still writing\t%s (%ds)\t...\n", wToolname, wOutname, elapsedTime()); 59 60 signal(SIGALRM, &alarm_fn); 61 alarm(nextSeconds); // reset the alarm 62} 63 64static void install_watchdog(const char *toolName, const char *outFileName) { 65 wToolname=toolName; 66 wOutname=outFileName; 67 68 signal(SIGALRM, &alarm_fn); 69 70 alarm(firstSeconds); // set the alarm 71} 72 73#else 74static void install_watchdog(const char*, const char*) { 75 // not implemented 76} 77#endif 78 79 80 81 82U_NAMESPACE_USE 83 84static char *progName; 85static UOption options[]={ 86 UOPTION_HELP_H, /* 0 */ 87 UOPTION_HELP_QUESTION_MARK, /* 1 */ 88 UOPTION_VERBOSE, /* 2 */ 89 UOPTION_ICUDATADIR, /* 4 */ 90 UOPTION_COPYRIGHT, /* 5 */ 91 { "uchars", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0}, /* 6 */ 92 { "bytes", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0}, /* 7 */ 93 { "transform", NULL, NULL, NULL, '\1', UOPT_REQUIRES_ARG, 0}, /* 8 */ 94}; 95 96enum arguments { 97 ARG_HELP = 0, 98 ARG_QMARK, 99 ARG_VERBOSE, 100 ARG_ICUDATADIR, 101 ARG_COPYRIGHT, 102 ARG_UCHARS, 103 ARG_BYTES, 104 ARG_TRANSFORM 105}; 106 107// prints out the standard usage method describing command line arguments, 108// then bails out with the desired exit code 109static void usageAndDie(UErrorCode retCode) { 110 fprintf((U_SUCCESS(retCode) ? stdout : stderr), "Usage: %s -trietype [-options] input-dictionary-file output-file\n", progName); 111 fprintf((U_SUCCESS(retCode) ? stdout : stderr), 112 "\tRead in a word list and write out a string trie dictionary\n" 113 "options:\n" 114 "\t-h or -? or --help this usage text\n" 115 "\t-V or --version show a version message\n" 116 "\t-c or --copyright include a copyright notice\n" 117 "\t-v or --verbose turn on verbose output\n" 118 "\t-i or --icudatadir directory for locating any needed intermediate data files,\n" // TODO: figure out if we need this option 119 "\t followed by path, defaults to %s\n" 120 "\t--uchars output a UCharsTrie (mutually exclusive with -b!)\n" 121 "\t--bytes output a BytesTrie (mutually exclusive with -u!)\n" 122 "\t--transform the kind of transform to use (eg --transform offset-40A3,\n" 123 "\t which specifies an offset transform with constant 0x40A3)\n", 124 u_getDataDirectory()); 125 exit(retCode); 126} 127 128 129/* UDataInfo cf. udata.h */ 130static UDataInfo dataInfo = { 131 sizeof(UDataInfo), 132 0, 133 134 U_IS_BIG_ENDIAN, 135 U_CHARSET_FAMILY, 136 U_SIZEOF_UCHAR, 137 0, 138 139 { 0x44, 0x69, 0x63, 0x74 }, /* "Dict" */ 140 { 1, 0, 0, 0 }, /* format version */ 141 { 0, 0, 0, 0 } /* data version */ 142}; 143 144#if !UCONFIG_NO_BREAK_ITERATION 145 146// A wrapper for both BytesTrieBuilder and UCharsTrieBuilder. 147// may want to put this somewhere in ICU, as it could be useful outside 148// of this tool? 149class DataDict { 150private: 151 BytesTrieBuilder *bt; 152 UCharsTrieBuilder *ut; 153 UChar32 transformConstant; 154 int32_t transformType; 155public: 156 // constructs a new data dictionary. if there is an error, 157 // it will be returned in status 158 // isBytesTrie != 0 will produce a BytesTrieBuilder, 159 // isBytesTrie == 0 will produce a UCharsTrieBuilder 160 DataDict(UBool isBytesTrie, UErrorCode &status) : bt(NULL), ut(NULL), 161 transformConstant(0), transformType(DictionaryData::TRANSFORM_NONE) { 162 if (isBytesTrie) { 163 bt = new BytesTrieBuilder(status); 164 } else { 165 ut = new UCharsTrieBuilder(status); 166 } 167 } 168 169 ~DataDict() { 170 delete bt; 171 delete ut; 172 } 173 174private: 175 char transform(UChar32 c, UErrorCode &status) { 176 if (transformType == DictionaryData::TRANSFORM_TYPE_OFFSET) { 177 if (c == 0x200D) { return (char)0xFF; } 178 else if (c == 0x200C) { return (char)0xFE; } 179 int32_t delta = c - transformConstant; 180 if (delta < 0 || 0xFD < delta) { 181 fprintf(stderr, "Codepoint U+%04lx out of range for --transform offset-%04lx!\n", 182 (long)c, (long)transformConstant); 183 exit(U_ILLEGAL_ARGUMENT_ERROR); // TODO: should return and print the line number 184 } 185 return (char)delta; 186 } else { // no such transform type 187 status = U_INTERNAL_PROGRAM_ERROR; 188 return (char)c; // it should be noted this transform type will not generally work 189 } 190 } 191 192 void transform(const UnicodeString &word, CharString &buf, UErrorCode &errorCode) { 193 UChar32 c = 0; 194 int32_t len = word.length(); 195 for (int32_t i = 0; i < len; i += U16_LENGTH(c)) { 196 c = word.char32At(i); 197 buf.append(transform(c, errorCode), errorCode); 198 } 199 } 200 201public: 202 // sets the desired transformation data. 203 // should be populated from a command line argument 204 // so far the only acceptable format is offset-<hex constant> 205 // eventually others (mask-<hex constant>?) may be enabled 206 // more complex functions may be more difficult 207 void setTransform(const char *t) { 208 if (strncmp(t, "offset-", 7) == 0) { 209 char *end; 210 unsigned long base = uprv_strtoul(t + 7, &end, 16); 211 if (end == (t + 7) || *end != 0 || base > 0x10FF80) { 212 fprintf(stderr, "Syntax for offset value in --transform offset-%s invalid!\n", t + 7); 213 usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); 214 } 215 transformType = DictionaryData::TRANSFORM_TYPE_OFFSET; 216 transformConstant = (UChar32)base; 217 } 218 else { 219 fprintf(stderr, "Invalid transform specified: %s\n", t); 220 usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); 221 } 222 } 223 224 // add a word to the trie 225 void addWord(const UnicodeString &word, int32_t value, UErrorCode &status) { 226 if (bt) { 227 CharString buf; 228 transform(word, buf, status); 229 bt->add(buf.toStringPiece(), value, status); 230 } 231 if (ut) { ut->add(word, value, status); } 232 } 233 234 // if we are a bytestrie, give back the StringPiece representing the serialized version of us 235 StringPiece serializeBytes(UErrorCode &status) { 236 return bt->buildStringPiece(USTRINGTRIE_BUILD_SMALL, status); 237 } 238 239 // if we are a ucharstrie, produce the UnicodeString representing the serialized version of us 240 void serializeUChars(UnicodeString &s, UErrorCode &status) { 241 ut->buildUnicodeString(USTRINGTRIE_BUILD_SMALL, s, status); 242 } 243 244 int32_t getTransform() { 245 return (int32_t)(transformType | transformConstant); 246 } 247}; 248#endif 249 250static const UChar LINEFEED_CHARACTER = 0x000A; 251static const UChar CARRIAGE_RETURN_CHARACTER = 0x000D; 252 253static UBool readLine(UCHARBUF *f, UnicodeString &fileLine, IcuToolErrorCode &errorCode) { 254 int32_t lineLength; 255 const UChar *line = ucbuf_readline(f, &lineLength, errorCode); 256 if(line == NULL || errorCode.isFailure()) { return FALSE; } 257 // Strip trailing CR/LF, comments, and spaces. 258 const UChar *comment = u_memchr(line, 0x23, lineLength); // '#' 259 if(comment != NULL) { 260 lineLength = (int32_t)(comment - line); 261 } else { 262 while(lineLength > 0 && (line[lineLength - 1] == CARRIAGE_RETURN_CHARACTER || line[lineLength - 1] == LINEFEED_CHARACTER)) { --lineLength; } 263 } 264 while(lineLength > 0 && u_isspace(line[lineLength - 1])) { --lineLength; } 265 fileLine.setTo(FALSE, line, lineLength); 266 return TRUE; 267} 268 269//---------------------------------------------------------------------------- 270// 271// main for gendict 272// 273//---------------------------------------------------------------------------- 274int main(int argc, char **argv) { 275 // 276 // Pick up and check the command line arguments, 277 // using the standard ICU tool utils option handling. 278 // 279 U_MAIN_INIT_ARGS(argc, argv); 280 progName = argv[0]; 281 argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options); 282 if(argc<0) { 283 // Unrecognized option 284 fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]); 285 usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); 286 } 287 288 if(options[ARG_HELP].doesOccur || options[ARG_QMARK].doesOccur) { 289 // -? or -h for help. 290 usageAndDie(U_ZERO_ERROR); 291 } 292 293 UBool verbose = options[ARG_VERBOSE].doesOccur; 294 295 if (argc < 3) { 296 fprintf(stderr, "input and output file must both be specified.\n"); 297 usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); 298 } 299 const char *outFileName = argv[2]; 300 const char *wordFileName = argv[1]; 301 302 startTime = uprv_getRawUTCtime(); // initialize start timer 303 // set up the watchdog 304 install_watchdog(progName, outFileName); 305 306 if (options[ARG_ICUDATADIR].doesOccur) { 307 u_setDataDirectory(options[ARG_ICUDATADIR].value); 308 } 309 310 const char *copyright = NULL; 311 if (options[ARG_COPYRIGHT].doesOccur) { 312 copyright = U_COPYRIGHT_STRING; 313 } 314 315 if (options[ARG_UCHARS].doesOccur == options[ARG_BYTES].doesOccur) { 316 fprintf(stderr, "you must specify exactly one type of trie to output!\n"); 317 usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); 318 } 319 UBool isBytesTrie = options[ARG_BYTES].doesOccur; 320 if (isBytesTrie != options[ARG_TRANSFORM].doesOccur) { 321 fprintf(stderr, "you must provide a transformation for a bytes trie, and must not provide one for a uchars trie!\n"); 322 usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); 323 } 324 325 IcuToolErrorCode status("gendict/main()"); 326 327#if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO 328 const char* outDir=NULL; 329 330 UNewDataMemory *pData; 331 char msg[1024]; 332 UErrorCode tempstatus = U_ZERO_ERROR; 333 334 /* write message with just the name */ // potential for a buffer overflow here... 335 sprintf(msg, "gendict writes dummy %s because of UCONFIG_NO_BREAK_ITERATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName); 336 fprintf(stderr, "%s\n", msg); 337 338 /* write the dummy data file */ 339 pData = udata_create(outDir, NULL, outFileName, &dataInfo, NULL, &tempstatus); 340 udata_writeBlock(pData, msg, strlen(msg)); 341 udata_finish(pData, &tempstatus); 342 return (int)tempstatus; 343 344#else 345 // Read in the dictionary source file 346 if (verbose) { printf("Opening file %s...\n", wordFileName); } 347 const char *codepage = "UTF-8"; 348 UCHARBUF *f = ucbuf_open(wordFileName, &codepage, TRUE, FALSE, status); 349 if (status.isFailure()) { 350 fprintf(stderr, "error opening input file: ICU Error \"%s\"\n", status.errorName()); 351 exit(status.reset()); 352 } 353 if (verbose) { printf("Initializing dictionary builder of type %s...\n", (isBytesTrie ? "BytesTrie" : "UCharsTrie")); } 354 DataDict dict(isBytesTrie, status); 355 if (status.isFailure()) { 356 fprintf(stderr, "new DataDict: ICU Error \"%s\"\n", status.errorName()); 357 exit(status.reset()); 358 } 359 if (options[ARG_TRANSFORM].doesOccur) { 360 dict.setTransform(options[ARG_TRANSFORM].value); 361 } 362 363 UnicodeString fileLine; 364 if (verbose) { puts("Adding words to dictionary..."); } 365 UBool hasValues = FALSE; 366 UBool hasValuelessContents = FALSE; 367 int lineCount = 0; 368 int wordCount = 0; 369 int minlen = 255; 370 int maxlen = 0; 371 UBool isOk = TRUE; 372 while (readLine(f, fileLine, status)) { 373 lineCount++; 374 if (fileLine.isEmpty()) continue; 375 376 // Parse word [spaces value]. 377 int32_t keyLen; 378 for (keyLen = 0; keyLen < fileLine.length() && !u_isspace(fileLine[keyLen]); ++keyLen) {} 379 if (keyLen == 0) { 380 fprintf(stderr, "Error: no word on line %i!\n", lineCount); 381 isOk = FALSE; 382 continue; 383 } 384 int32_t valueStart; 385 for (valueStart = keyLen; 386 valueStart < fileLine.length() && u_isspace(fileLine[valueStart]); 387 ++valueStart) {} 388 389 if (keyLen < valueStart) { 390 int32_t valueLength = fileLine.length() - valueStart; 391 if (valueLength > 15) { 392 fprintf(stderr, "Error: value too long on line %i!\n", lineCount); 393 isOk = FALSE; 394 continue; 395 } 396 char s[16]; 397 fileLine.extract(valueStart, valueLength, s, 16, US_INV); 398 char *end; 399 unsigned long value = uprv_strtoul(s, &end, 0); 400 if (end == s || *end != 0 || (int32_t)uprv_strlen(s) != valueLength || value > 0xffffffff) { 401 fprintf(stderr, "Error: value syntax error or value too large on line %i!\n", lineCount); 402 isOk = FALSE; 403 continue; 404 } 405 dict.addWord(fileLine.tempSubString(0, keyLen), (int32_t)value, status); 406 hasValues = TRUE; 407 wordCount++; 408 if (keyLen < minlen) minlen = keyLen; 409 if (keyLen > maxlen) maxlen = keyLen; 410 } else { 411 dict.addWord(fileLine.tempSubString(0, keyLen), 0, status); 412 hasValuelessContents = TRUE; 413 wordCount++; 414 if (keyLen < minlen) minlen = keyLen; 415 if (keyLen > maxlen) maxlen = keyLen; 416 } 417 418 if (status.isFailure()) { 419 fprintf(stderr, "ICU Error \"%s\": Failed to add word to trie at input line %d in input file\n", 420 status.errorName(), lineCount); 421 exit(status.reset()); 422 } 423 } 424 if (verbose) { printf("Processed %d lines, added %d words, minlen %d, maxlen %d\n", lineCount, wordCount, minlen, maxlen); } 425 426 if (!isOk && status.isSuccess()) { 427 status.set(U_ILLEGAL_ARGUMENT_ERROR); 428 } 429 if (hasValues && hasValuelessContents) { 430 fprintf(stderr, "warning: file contained both valued and unvalued strings!\n"); 431 } 432 433 if (verbose) { printf("Serializing data...isBytesTrie? %d\n", isBytesTrie); } 434 int32_t outDataSize; 435 const void *outData; 436 UnicodeString usp; 437 if (isBytesTrie) { 438 StringPiece sp = dict.serializeBytes(status); 439 outDataSize = sp.size(); 440 outData = sp.data(); 441 } else { 442 dict.serializeUChars(usp, status); 443 outDataSize = usp.length() * U_SIZEOF_UCHAR; 444 outData = usp.getBuffer(); 445 } 446 if (status.isFailure()) { 447 fprintf(stderr, "gendict: got failure of type %s while serializing, if U_ILLEGAL_ARGUMENT_ERROR possibly due to duplicate dictionary entries\n", status.errorName()); 448 exit(status.reset()); 449 } 450 if (verbose) { puts("Opening output file..."); } 451 UNewDataMemory *pData = udata_create(NULL, NULL, outFileName, &dataInfo, copyright, status); 452 if (status.isFailure()) { 453 fprintf(stderr, "gendict: could not open output file \"%s\", \"%s\"\n", outFileName, status.errorName()); 454 exit(status.reset()); 455 } 456 457 if (verbose) { puts("Writing to output file..."); } 458 int32_t indexes[DictionaryData::IX_COUNT] = { 459 DictionaryData::IX_COUNT * sizeof(int32_t), 0, 0, 0, 0, 0, 0, 0 460 }; 461 int32_t size = outDataSize + indexes[DictionaryData::IX_STRING_TRIE_OFFSET]; 462 indexes[DictionaryData::IX_RESERVED1_OFFSET] = size; 463 indexes[DictionaryData::IX_RESERVED2_OFFSET] = size; 464 indexes[DictionaryData::IX_TOTAL_SIZE] = size; 465 466 indexes[DictionaryData::IX_TRIE_TYPE] = isBytesTrie ? DictionaryData::TRIE_TYPE_BYTES : DictionaryData::TRIE_TYPE_UCHARS; 467 if (hasValues) { 468 indexes[DictionaryData::IX_TRIE_TYPE] |= DictionaryData::TRIE_HAS_VALUES; 469 } 470 471 indexes[DictionaryData::IX_TRANSFORM] = dict.getTransform(); 472 udata_writeBlock(pData, indexes, sizeof(indexes)); 473 udata_writeBlock(pData, outData, outDataSize); 474 size_t bytesWritten = udata_finish(pData, status); 475 if (status.isFailure()) { 476 fprintf(stderr, "gendict: error \"%s\" writing the output file\n", status.errorName()); 477 exit(status.reset()); 478 } 479 480 if (bytesWritten != (size_t)size) { 481 fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName); 482 exit(U_INTERNAL_PROGRAM_ERROR); 483 } 484 485 printf("%s: done writing\t%s (%ds).\n", progName, outFileName, elapsedTime()); 486 487#ifdef TEST_GENDICT 488 if (isBytesTrie) { 489 BytesTrie::Iterator it(outData, outDataSize, status); 490 while (it.hasNext()) { 491 it.next(status); 492 const StringPiece s = it.getString(); 493 int32_t val = it.getValue(); 494 printf("%s -> %i\n", s.data(), val); 495 } 496 } else { 497 UCharsTrie::Iterator it((const UChar *)outData, outDataSize, status); 498 while (it.hasNext()) { 499 it.next(status); 500 const UnicodeString s = it.getString(); 501 int32_t val = it.getValue(); 502 char tmp[1024]; 503 s.extract(0, s.length(), tmp, 1024); 504 printf("%s -> %i\n", tmp, val); 505 } 506 } 507#endif 508 509 return 0; 510#endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 511} 512