18cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd/* 28cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd********************************************************************** 38cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd* Copyright (C) 2002-2013, International Business Machines 48cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd* Corporation and others. All Rights Reserved. 58cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd********************************************************************** 68cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd* 78cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd* File gendict.cpp 88cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd*/ 98cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 108cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd#include "unicode/utypes.h" 118cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd#include "unicode/uchar.h" 128cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd#include "unicode/ucnv.h" 138cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd#include "unicode/uniset.h" 148cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd#include "unicode/unistr.h" 158cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd#include "unicode/uclean.h" 168cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd#include "unicode/udata.h" 178cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd#include "unicode/putil.h" 188cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd#include "unicode/ucharstriebuilder.h" 198cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd#include "unicode/bytestriebuilder.h" 208cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd#include "unicode/ucharstrie.h" 218cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd#include "unicode/bytestrie.h" 228cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd#include "unicode/ucnv.h" 238cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd#include "unicode/utf16.h" 248cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 258cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd#include "charstr.h" 268cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd#include "dictionarydata.h" 278cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd#include "uoptions.h" 288cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd#include "unewdata.h" 298cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd#include "cmemory.h" 308cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd#include "uassert.h" 318cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd#include "ucbuf.h" 328cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd#include "toolutil.h" 338cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd#include "cstring.h" 348cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 358cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd#include <stdio.h> 368cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd#include <stdlib.h> 378cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd#include <string.h> 388cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 398cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd#include "putilimp.h" 408cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike DoddUDate startTime; 418cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 428cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Doddstatic int elapsedTime() { 438cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd return (int)uprv_floor((uprv_getRawUTCtime()-startTime)/1000.0); 448cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd} 458cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 468cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd#if U_PLATFORM_IMPLEMENTS_POSIX && !U_PLATFORM_HAS_WIN32_API 478cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 488cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd#include <signal.h> 498cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd#include <unistd.h> 508cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 518cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Doddconst char *wToolname="gendict"; 528cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Doddconst char *wOutname="(some file)"; 538cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 548cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Doddconst int firstSeconds = 5; /* seconds between notices*/ 558cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Doddconst int nextSeconds = 15; /* seconds between notices*/ 568cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 578cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Doddstatic void alarm_fn(int /*n*/) { 588cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd printf("%s: still writing\t%s (%ds)\t...\n", wToolname, wOutname, elapsedTime()); 598cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 608cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd signal(SIGALRM, &alarm_fn); 618cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd alarm(nextSeconds); // reset the alarm 628cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd} 638cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 648cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Doddstatic void install_watchdog(const char *toolName, const char *outFileName) { 658cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd wToolname=toolName; 668cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd wOutname=outFileName; 678cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 688cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd signal(SIGALRM, &alarm_fn); 698cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 708cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd alarm(firstSeconds); // set the alarm 718cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd} 728cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 738cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd#else 748cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Doddstatic void install_watchdog(const char*, const char*) { 758cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd // not implemented 768cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd} 778cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd#endif 788cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 798cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 808cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 818cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 828cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike DoddU_NAMESPACE_USE 838cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 848cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Doddstatic char *progName; 858cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Doddstatic UOption options[]={ 868cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd UOPTION_HELP_H, /* 0 */ 878cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd UOPTION_HELP_QUESTION_MARK, /* 1 */ 888cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd UOPTION_VERBOSE, /* 2 */ 898cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd UOPTION_ICUDATADIR, /* 4 */ 908cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd UOPTION_COPYRIGHT, /* 5 */ 918cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd { "uchars", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0}, /* 6 */ 928cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd { "bytes", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0}, /* 7 */ 938cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd { "transform", NULL, NULL, NULL, '\1', UOPT_REQUIRES_ARG, 0}, /* 8 */ 948cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd}; 958cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 968cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Doddenum arguments { 978cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd ARG_HELP = 0, 988cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd ARG_QMARK, 998cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd ARG_VERBOSE, 1008cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd ARG_ICUDATADIR, 1018cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd ARG_COPYRIGHT, 1028cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd ARG_UCHARS, 1038cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd ARG_BYTES, 1048cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd ARG_TRANSFORM 1058cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd}; 1068cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 1078cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd// prints out the standard usage method describing command line arguments, 1088cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd// then bails out with the desired exit code 1098cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Doddstatic void usageAndDie(UErrorCode retCode) { 1108cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd fprintf((U_SUCCESS(retCode) ? stdout : stderr), "Usage: %s -trietype [-options] input-dictionary-file output-file\n", progName); 1118cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd fprintf((U_SUCCESS(retCode) ? stdout : stderr), 1128cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd "\tRead in a word list and write out a string trie dictionary\n" 1138cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd "options:\n" 1148cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd "\t-h or -? or --help this usage text\n" 1158cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd "\t-V or --version show a version message\n" 1168cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd "\t-c or --copyright include a copyright notice\n" 1178cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd "\t-v or --verbose turn on verbose output\n" 1188cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd "\t-i or --icudatadir directory for locating any needed intermediate data files,\n" // TODO: figure out if we need this option 1198cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd "\t followed by path, defaults to %s\n" 1208cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd "\t--uchars output a UCharsTrie (mutually exclusive with -b!)\n" 1218cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd "\t--bytes output a BytesTrie (mutually exclusive with -u!)\n" 1228cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd "\t--transform the kind of transform to use (eg --transform offset-40A3,\n" 1238cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd "\t which specifies an offset transform with constant 0x40A3)\n", 1248cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd u_getDataDirectory()); 1258cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd exit(retCode); 1268cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd} 1278cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 1288cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 1298cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd/* UDataInfo cf. udata.h */ 1308cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Doddstatic UDataInfo dataInfo = { 1318cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd sizeof(UDataInfo), 1328cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 0, 1338cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 1348cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd U_IS_BIG_ENDIAN, 1358cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd U_CHARSET_FAMILY, 1368cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd U_SIZEOF_UCHAR, 1378cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 0, 1388cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 1398cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd { 0x44, 0x69, 0x63, 0x74 }, /* "Dict" */ 1408cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd { 1, 0, 0, 0 }, /* format version */ 1418cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd { 0, 0, 0, 0 } /* data version */ 1428cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd}; 1438cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 1448cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd#if !UCONFIG_NO_BREAK_ITERATION 1458cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 1468cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd// A wrapper for both BytesTrieBuilder and UCharsTrieBuilder. 1478cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd// may want to put this somewhere in ICU, as it could be useful outside 1488cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd// of this tool? 1498cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Doddclass DataDict { 1508cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Doddprivate: 1518cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd BytesTrieBuilder *bt; 1528cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd UCharsTrieBuilder *ut; 1538cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd UChar32 transformConstant; 1548cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd int32_t transformType; 1558cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Doddpublic: 1568cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd // constructs a new data dictionary. if there is an error, 1578cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd // it will be returned in status 1588cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd // isBytesTrie != 0 will produce a BytesTrieBuilder, 1598cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd // isBytesTrie == 0 will produce a UCharsTrieBuilder 1608cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd DataDict(UBool isBytesTrie, UErrorCode &status) : bt(NULL), ut(NULL), 1618cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd transformConstant(0), transformType(DictionaryData::TRANSFORM_NONE) { 1628cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd if (isBytesTrie) { 1638cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd bt = new BytesTrieBuilder(status); 1648cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd } else { 1658cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd ut = new UCharsTrieBuilder(status); 1668cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd } 1678cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd } 1688cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 1698cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd ~DataDict() { 1708cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd delete bt; 1718cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd delete ut; 1728cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd } 1738cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 1748cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Doddprivate: 1758cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd char transform(UChar32 c, UErrorCode &status) { 1768cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd if (transformType == DictionaryData::TRANSFORM_TYPE_OFFSET) { 1778cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd if (c == 0x200D) { return (char)0xFF; } 1788cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd else if (c == 0x200C) { return (char)0xFE; } 1798cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd int32_t delta = c - transformConstant; 1808cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd if (delta < 0 || 0xFD < delta) { 1818cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd fprintf(stderr, "Codepoint U+%04lx out of range for --transform offset-%04lx!\n", 1828cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd (long)c, (long)transformConstant); 1838cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd exit(U_ILLEGAL_ARGUMENT_ERROR); // TODO: should return and print the line number 1848cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd } 1858cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd return (char)delta; 1868cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd } else { // no such transform type 1878cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd status = U_INTERNAL_PROGRAM_ERROR; 1888cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd return (char)c; // it should be noted this transform type will not generally work 1898cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd } 1908cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd } 1918cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 1928cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd void transform(const UnicodeString &word, CharString &buf, UErrorCode &errorCode) { 1938cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd UChar32 c = 0; 1948cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd int32_t len = word.length(); 1958cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd for (int32_t i = 0; i < len; i += U16_LENGTH(c)) { 1968cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd c = word.char32At(i); 1978cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd buf.append(transform(c, errorCode), errorCode); 1988cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd } 1998cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd } 2008cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 2018cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Doddpublic: 2028cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd // sets the desired transformation data. 2038cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd // should be populated from a command line argument 2048cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd // so far the only acceptable format is offset-<hex constant> 2058cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd // eventually others (mask-<hex constant>?) may be enabled 2068cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd // more complex functions may be more difficult 2078cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd void setTransform(const char *t) { 2088cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd if (strncmp(t, "offset-", 7) == 0) { 2098cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd char *end; 2108cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd unsigned long base = uprv_strtoul(t + 7, &end, 16); 2118cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd if (end == (t + 7) || *end != 0 || base > 0x10FF80) { 2128cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd fprintf(stderr, "Syntax for offset value in --transform offset-%s invalid!\n", t + 7); 2138cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); 2148cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd } 2158cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd transformType = DictionaryData::TRANSFORM_TYPE_OFFSET; 2168cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd transformConstant = (UChar32)base; 2178cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd } 2188cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd else { 2198cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd fprintf(stderr, "Invalid transform specified: %s\n", t); 2208cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); 2218cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd } 2228cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd } 2238cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 2248cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd // add a word to the trie 2258cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd void addWord(const UnicodeString &word, int32_t value, UErrorCode &status) { 2268cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd if (bt) { 2278cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd CharString buf; 2288cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd transform(word, buf, status); 2298cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd bt->add(buf.toStringPiece(), value, status); 2308cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd } 2318cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd if (ut) { ut->add(word, value, status); } 2328cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd } 2338cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 2348cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd // if we are a bytestrie, give back the StringPiece representing the serialized version of us 2358cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd StringPiece serializeBytes(UErrorCode &status) { 2368cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd return bt->buildStringPiece(USTRINGTRIE_BUILD_SMALL, status); 2378cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd } 2388cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 2398cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd // if we are a ucharstrie, produce the UnicodeString representing the serialized version of us 2408cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd void serializeUChars(UnicodeString &s, UErrorCode &status) { 2418cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd ut->buildUnicodeString(USTRINGTRIE_BUILD_SMALL, s, status); 2428cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd } 2438cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 2448cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd int32_t getTransform() { 2458cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd return (int32_t)(transformType | transformConstant); 2468cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd } 2478cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd}; 2488cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd#endif 2498cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 2508cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Doddstatic const UChar LINEFEED_CHARACTER = 0x000A; 2518cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Doddstatic const UChar CARRIAGE_RETURN_CHARACTER = 0x000D; 2528cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 2538cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Doddstatic UBool readLine(UCHARBUF *f, UnicodeString &fileLine, IcuToolErrorCode &errorCode) { 2548cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd int32_t lineLength; 2558cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd const UChar *line = ucbuf_readline(f, &lineLength, errorCode); 2568cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd if(line == NULL || errorCode.isFailure()) { return FALSE; } 2578cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd // Strip trailing CR/LF, comments, and spaces. 2588cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd const UChar *comment = u_memchr(line, 0x23, lineLength); // '#' 2598cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd if(comment != NULL) { 2608cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd lineLength = (int32_t)(comment - line); 2618cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd } else { 2628cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd while(lineLength > 0 && (line[lineLength - 1] == CARRIAGE_RETURN_CHARACTER || line[lineLength - 1] == LINEFEED_CHARACTER)) { --lineLength; } 2638cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd } 2648cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd while(lineLength > 0 && u_isspace(line[lineLength - 1])) { --lineLength; } 2658cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd fileLine.setTo(FALSE, line, lineLength); 2668cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd return TRUE; 2678cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd} 2688cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 2698cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd//---------------------------------------------------------------------------- 2708cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd// 2718cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd// main for gendict 2728cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd// 2738cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd//---------------------------------------------------------------------------- 2748cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Doddint main(int argc, char **argv) { 2758cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd // 2768cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd // Pick up and check the command line arguments, 2778cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd // using the standard ICU tool utils option handling. 2788cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd // 2798cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd U_MAIN_INIT_ARGS(argc, argv); 2808cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd progName = argv[0]; 2818cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options); 2828cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd if(argc<0) { 2838cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd // Unrecognized option 2848cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]); 2858cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); 2868cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd } 2878cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 2888cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd if(options[ARG_HELP].doesOccur || options[ARG_QMARK].doesOccur) { 2898cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd // -? or -h for help. 2908cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd usageAndDie(U_ZERO_ERROR); 2918cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd } 2928cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 2938cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd UBool verbose = options[ARG_VERBOSE].doesOccur; 2948cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 2958cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd if (argc < 3) { 2968cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd fprintf(stderr, "input and output file must both be specified.\n"); 2978cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); 2988cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd } 2998cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd const char *outFileName = argv[2]; 3008cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd const char *wordFileName = argv[1]; 3018cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 3028cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd startTime = uprv_getRawUTCtime(); // initialize start timer 3038cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd // set up the watchdog 3048cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd install_watchdog(progName, outFileName); 3058cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 3068cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd if (options[ARG_ICUDATADIR].doesOccur) { 3078cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd u_setDataDirectory(options[ARG_ICUDATADIR].value); 3088cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd } 3098cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 3108cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd const char *copyright = NULL; 3118cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd if (options[ARG_COPYRIGHT].doesOccur) { 3128cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd copyright = U_COPYRIGHT_STRING; 3138cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd } 3148cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 3158cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd if (options[ARG_UCHARS].doesOccur == options[ARG_BYTES].doesOccur) { 3168cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd fprintf(stderr, "you must specify exactly one type of trie to output!\n"); 3178cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); 3188cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd } 3198cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd UBool isBytesTrie = options[ARG_BYTES].doesOccur; 3208cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd if (isBytesTrie != options[ARG_TRANSFORM].doesOccur) { 3218cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd fprintf(stderr, "you must provide a transformation for a bytes trie, and must not provide one for a uchars trie!\n"); 3228cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); 3238cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd } 3248cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 3258cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd IcuToolErrorCode status("gendict/main()"); 3268cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 3278cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd#if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO 3288cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd const char* outDir=NULL; 3298cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 3308cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd UNewDataMemory *pData; 3318cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd char msg[1024]; 3328cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd UErrorCode tempstatus = U_ZERO_ERROR; 3338cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 3348cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd /* write message with just the name */ // potential for a buffer overflow here... 3358cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd sprintf(msg, "gendict writes dummy %s because of UCONFIG_NO_BREAK_ITERATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName); 3368cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd fprintf(stderr, "%s\n", msg); 3378cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 3388cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd /* write the dummy data file */ 3398cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd pData = udata_create(outDir, NULL, outFileName, &dataInfo, NULL, &tempstatus); 3408cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd udata_writeBlock(pData, msg, strlen(msg)); 3418cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd udata_finish(pData, &tempstatus); 3428cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd return (int)tempstatus; 3438cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 3448cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd#else 3458cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd // Read in the dictionary source file 3468cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd if (verbose) { printf("Opening file %s...\n", wordFileName); } 3478cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd const char *codepage = "UTF-8"; 3488cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd UCHARBUF *f = ucbuf_open(wordFileName, &codepage, TRUE, FALSE, status); 3498cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd if (status.isFailure()) { 3508cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd fprintf(stderr, "error opening input file: ICU Error \"%s\"\n", status.errorName()); 3518cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd exit(status.reset()); 3528cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd } 3538cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd if (verbose) { printf("Initializing dictionary builder of type %s...\n", (isBytesTrie ? "BytesTrie" : "UCharsTrie")); } 3548cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd DataDict dict(isBytesTrie, status); 3558cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd if (status.isFailure()) { 3568cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd fprintf(stderr, "new DataDict: ICU Error \"%s\"\n", status.errorName()); 3578cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd exit(status.reset()); 3588cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd } 3598cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd if (options[ARG_TRANSFORM].doesOccur) { 3608cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd dict.setTransform(options[ARG_TRANSFORM].value); 3618cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd } 3628cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 3638cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd UnicodeString fileLine; 3648cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd if (verbose) { puts("Adding words to dictionary..."); } 3658cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd UBool hasValues = FALSE; 3668cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd UBool hasValuelessContents = FALSE; 3678cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd int lineCount = 0; 3688cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd int wordCount = 0; 3698cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd int minlen = 255; 3708cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd int maxlen = 0; 3718cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd UBool isOk = TRUE; 3728cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd while (readLine(f, fileLine, status)) { 3738cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd lineCount++; 3748cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd if (fileLine.isEmpty()) continue; 3758cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 3768cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd // Parse word [spaces value]. 3778cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd int32_t keyLen; 3788cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd for (keyLen = 0; keyLen < fileLine.length() && !u_isspace(fileLine[keyLen]); ++keyLen) {} 3798cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd if (keyLen == 0) { 3808cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd fprintf(stderr, "Error: no word on line %i!\n", lineCount); 3818cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd isOk = FALSE; 3828cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd continue; 3838cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd } 3848cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd int32_t valueStart; 3858cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd for (valueStart = keyLen; 3868cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd valueStart < fileLine.length() && u_isspace(fileLine[valueStart]); 3878cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd ++valueStart) {} 3888cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 3898cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd if (keyLen < valueStart) { 3908cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd int32_t valueLength = fileLine.length() - valueStart; 3918cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd if (valueLength > 15) { 3928cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd fprintf(stderr, "Error: value too long on line %i!\n", lineCount); 3938cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd isOk = FALSE; 3948cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd continue; 3958cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd } 3968cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd char s[16]; 3978cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd fileLine.extract(valueStart, valueLength, s, 16, US_INV); 3988cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd char *end; 3998cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd unsigned long value = uprv_strtoul(s, &end, 0); 4008cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd if (end == s || *end != 0 || (int32_t)uprv_strlen(s) != valueLength || value > 0xffffffff) { 4018cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd fprintf(stderr, "Error: value syntax error or value too large on line %i!\n", lineCount); 4028cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd isOk = FALSE; 4038cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd continue; 4048cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd } 4058cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd dict.addWord(fileLine.tempSubString(0, keyLen), (int32_t)value, status); 4068cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd hasValues = TRUE; 4078cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd wordCount++; 4088cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd if (keyLen < minlen) minlen = keyLen; 4098cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd if (keyLen > maxlen) maxlen = keyLen; 4108cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd } else { 4118cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd dict.addWord(fileLine.tempSubString(0, keyLen), 0, status); 4128cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd hasValuelessContents = TRUE; 4138cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd wordCount++; 4148cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd if (keyLen < minlen) minlen = keyLen; 4158cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd if (keyLen > maxlen) maxlen = keyLen; 4168cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd } 4178cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 4188cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd if (status.isFailure()) { 4198cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd fprintf(stderr, "ICU Error \"%s\": Failed to add word to trie at input line %d in input file\n", 4208cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd status.errorName(), lineCount); 4218cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd exit(status.reset()); 4228cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd } 4238cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd } 4248cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd if (verbose) { printf("Processed %d lines, added %d words, minlen %d, maxlen %d\n", lineCount, wordCount, minlen, maxlen); } 4258cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 4268cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd if (!isOk && status.isSuccess()) { 4278cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd status.set(U_ILLEGAL_ARGUMENT_ERROR); 4288cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd } 4298cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd if (hasValues && hasValuelessContents) { 4308cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd fprintf(stderr, "warning: file contained both valued and unvalued strings!\n"); 4318cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd } 4328cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 4338cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd if (verbose) { printf("Serializing data...isBytesTrie? %d\n", isBytesTrie); } 4348cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd int32_t outDataSize; 4358cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd const void *outData; 4368cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd UnicodeString usp; 4378cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd if (isBytesTrie) { 4388cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd StringPiece sp = dict.serializeBytes(status); 4398cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd outDataSize = sp.size(); 4408cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd outData = sp.data(); 4418cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd } else { 4428cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd dict.serializeUChars(usp, status); 4438cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd outDataSize = usp.length() * U_SIZEOF_UCHAR; 4448cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd outData = usp.getBuffer(); 4458cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd } 4468cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd if (status.isFailure()) { 4478cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd fprintf(stderr, "gendict: got failure of type %s while serializing, if U_ILLEGAL_ARGUMENT_ERROR possibly due to duplicate dictionary entries\n", status.errorName()); 4488cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd exit(status.reset()); 4498cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd } 4508cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd if (verbose) { puts("Opening output file..."); } 4518cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd UNewDataMemory *pData = udata_create(NULL, NULL, outFileName, &dataInfo, copyright, status); 4528cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd if (status.isFailure()) { 4538cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd fprintf(stderr, "gendict: could not open output file \"%s\", \"%s\"\n", outFileName, status.errorName()); 4548cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd exit(status.reset()); 4558cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd } 4568cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 4578cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd if (verbose) { puts("Writing to output file..."); } 4588cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd int32_t indexes[DictionaryData::IX_COUNT] = { 4598cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd DictionaryData::IX_COUNT * sizeof(int32_t), 0, 0, 0, 0, 0, 0, 0 4608cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd }; 4618cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd int32_t size = outDataSize + indexes[DictionaryData::IX_STRING_TRIE_OFFSET]; 4628cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd indexes[DictionaryData::IX_RESERVED1_OFFSET] = size; 4638cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd indexes[DictionaryData::IX_RESERVED2_OFFSET] = size; 4648cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd indexes[DictionaryData::IX_TOTAL_SIZE] = size; 4658cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 4668cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd indexes[DictionaryData::IX_TRIE_TYPE] = isBytesTrie ? DictionaryData::TRIE_TYPE_BYTES : DictionaryData::TRIE_TYPE_UCHARS; 4678cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd if (hasValues) { 4688cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd indexes[DictionaryData::IX_TRIE_TYPE] |= DictionaryData::TRIE_HAS_VALUES; 4698cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd } 4708cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 4718cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd indexes[DictionaryData::IX_TRANSFORM] = dict.getTransform(); 4728cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd udata_writeBlock(pData, indexes, sizeof(indexes)); 4738cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd udata_writeBlock(pData, outData, outDataSize); 4748cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd size_t bytesWritten = udata_finish(pData, status); 4758cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd if (status.isFailure()) { 4768cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd fprintf(stderr, "gendict: error \"%s\" writing the output file\n", status.errorName()); 4778cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd exit(status.reset()); 4788cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd } 4798cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 4808cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd if (bytesWritten != (size_t)size) { 4818cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName); 4828cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd exit(U_INTERNAL_PROGRAM_ERROR); 4838cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd } 4848cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 4858cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd printf("%s: done writing\t%s (%ds).\n", progName, outFileName, elapsedTime()); 4868cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 4878cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd#ifdef TEST_GENDICT 4888cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd if (isBytesTrie) { 4898cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd BytesTrie::Iterator it(outData, outDataSize, status); 4908cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd while (it.hasNext()) { 4918cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd it.next(status); 4928cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd const StringPiece s = it.getString(); 4938cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd int32_t val = it.getValue(); 4948cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd printf("%s -> %i\n", s.data(), val); 4958cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd } 4968cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd } else { 4978cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd UCharsTrie::Iterator it((const UChar *)outData, outDataSize, status); 4988cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd while (it.hasNext()) { 4998cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd it.next(status); 5008cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd const UnicodeString s = it.getString(); 5018cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd int32_t val = it.getValue(); 5028cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd char tmp[1024]; 5038cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd s.extract(0, s.length(), tmp, 1024); 5048cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd printf("%s -> %i\n", tmp, val); 5058cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd } 5068cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd } 5078cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd#endif 5088cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd 5098cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd return 0; 5108cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd#endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 5118cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd} 5128cfa702f803c5ef6a2b062a489a1b2cf66b45b5eMike Dodd