1/***************************************************************************** 2* 3* Copyright (C) 1999-2014, International Business Machines 4* Corporation and others. All Rights Reserved. 5* 6******************************************************************************/ 7 8/* 9 * uconv(1): an iconv(1)-like converter using ICU. 10 * 11 * Original code by Jonas Utterström <jonas.utterstrom@vittran.norrnod.se> 12 * contributed in 1999. 13 * 14 * Conversion to the C conversion API and many improvements by 15 * Yves Arrouye <yves@realnames.com>, current maintainer. 16 * 17 * Markus Scherer maintainer from 2003. 18 * See source code repository history for changes. 19 */ 20 21#include <unicode/utypes.h> 22#include <unicode/putil.h> 23#include <unicode/ucnv.h> 24#include <unicode/uenum.h> 25#include <unicode/unistr.h> 26#include <unicode/translit.h> 27#include <unicode/uset.h> 28#include <unicode/uclean.h> 29#include <unicode/utf16.h> 30 31#include <stdio.h> 32#include <errno.h> 33#include <string.h> 34#include <stdlib.h> 35 36#include "cmemory.h" 37#include "cstring.h" 38#include "ustrfmt.h" 39 40#include "unicode/uwmsg.h" 41 42U_NAMESPACE_USE 43 44#if U_PLATFORM_USES_ONLY_WIN32_API && !defined(__STRICT_ANSI__) 45#include <io.h> 46#include <fcntl.h> 47#if U_PLATFORM_USES_ONLY_WIN32_API 48#define USE_FILENO_BINARY_MODE 1 49/* Windows likes to rename Unix-like functions */ 50#ifndef fileno 51#define fileno _fileno 52#endif 53#ifndef setmode 54#define setmode _setmode 55#endif 56#ifndef O_BINARY 57#define O_BINARY _O_BINARY 58#endif 59#endif 60#endif 61 62#ifdef UCONVMSG_LINK 63/* below from the README */ 64#include "unicode/utypes.h" 65#include "unicode/udata.h" 66U_CFUNC char uconvmsg_dat[]; 67#endif 68 69#define DEFAULT_BUFSZ 4096 70#define UCONVMSG "uconvmsg" 71 72static UResourceBundle *gBundle = 0; /* Bundle containing messages. */ 73 74/* 75 * Initialize the message bundle so that message strings can be fetched 76 * by u_wmsg(). 77 * 78 */ 79 80static void initMsg(const char *pname) { 81 static int ps = 0; 82 83 if (!ps) { 84 char dataPath[2048]; /* XXX Sloppy: should be PATH_MAX. */ 85 UErrorCode err = U_ZERO_ERROR; 86 87 ps = 1; 88 89 /* Set up our static data - if any */ 90#if defined(UCONVMSG_LINK) && U_PLATFORM != U_PF_OS390 /* On z/OS, this is failing. */ 91 udata_setAppData(UCONVMSG, (const void*) uconvmsg_dat, &err); 92 if (U_FAILURE(err)) { 93 fprintf(stderr, "%s: warning, problem installing our static resource bundle data uconvmsg: %s - trying anyways.\n", 94 pname, u_errorName(err)); 95 err = U_ZERO_ERROR; /* It may still fail */ 96 } 97#endif 98 99 /* Get messages. */ 100 gBundle = u_wmsg_setPath(UCONVMSG, &err); 101 if (U_FAILURE(err)) { 102 fprintf(stderr, 103 "%s: warning: couldn't open bundle %s: %s\n", 104 pname, UCONVMSG, u_errorName(err)); 105#ifdef UCONVMSG_LINK 106 fprintf(stderr, 107 "%s: setAppData was called, internal data %s failed to load\n", 108 pname, UCONVMSG); 109#endif 110 111 err = U_ZERO_ERROR; 112 /* that was try #1, try again with a path */ 113 uprv_strcpy(dataPath, u_getDataDirectory()); 114 uprv_strcat(dataPath, U_FILE_SEP_STRING); 115 uprv_strcat(dataPath, UCONVMSG); 116 117 gBundle = u_wmsg_setPath(dataPath, &err); 118 if (U_FAILURE(err)) { 119 fprintf(stderr, 120 "%s: warning: still couldn't open bundle %s: %s\n", 121 pname, dataPath, u_errorName(err)); 122 fprintf(stderr, "%s: warning: messages will not be displayed\n", pname); 123 } 124 } 125 } 126} 127 128/* Mapping of callback names to the callbacks passed to the converter 129 API. */ 130 131static struct callback_ent { 132 const char *name; 133 UConverterFromUCallback fromu; 134 const void *fromuctxt; 135 UConverterToUCallback tou; 136 const void *touctxt; 137} transcode_callbacks[] = { 138 { "substitute", 139 UCNV_FROM_U_CALLBACK_SUBSTITUTE, 0, 140 UCNV_TO_U_CALLBACK_SUBSTITUTE, 0 }, 141 { "skip", 142 UCNV_FROM_U_CALLBACK_SKIP, 0, 143 UCNV_TO_U_CALLBACK_SKIP, 0 }, 144 { "stop", 145 UCNV_FROM_U_CALLBACK_STOP, 0, 146 UCNV_TO_U_CALLBACK_STOP, 0 }, 147 { "escape", 148 UCNV_FROM_U_CALLBACK_ESCAPE, 0, 149 UCNV_TO_U_CALLBACK_ESCAPE, 0}, 150 { "escape-icu", 151 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_ICU, 152 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_ICU }, 153 { "escape-java", 154 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_JAVA, 155 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_JAVA }, 156 { "escape-c", 157 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_C, 158 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_C }, 159 { "escape-xml", 160 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX, 161 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX }, 162 { "escape-xml-hex", 163 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX, 164 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX }, 165 { "escape-xml-dec", 166 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC, 167 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC }, 168 { "escape-unicode", UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_UNICODE, 169 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_UNICODE } 170}; 171 172/* Return a pointer to a callback record given its name. */ 173 174static const struct callback_ent *findCallback(const char *name) { 175 int i, count = 176 sizeof(transcode_callbacks) / sizeof(*transcode_callbacks); 177 178 /* We'll do a linear search, there aren't many of them and bsearch() 179 may not be that portable. */ 180 181 for (i = 0; i < count; ++i) { 182 if (!uprv_stricmp(name, transcode_callbacks[i].name)) { 183 return &transcode_callbacks[i]; 184 } 185 } 186 187 return 0; 188} 189 190/* Print converter information. If lookfor is set, only that converter will 191 be printed, otherwise all converters will be printed. If canon is non 192 zero, tags and aliases for each converter are printed too, in the format 193 expected for convrters.txt(5). */ 194 195static int printConverters(const char *pname, const char *lookfor, 196 UBool canon) 197{ 198 UErrorCode err = U_ZERO_ERROR; 199 int32_t num; 200 uint16_t num_stds; 201 const char **stds; 202 203 /* If there is a specified name, just handle that now. */ 204 205 if (lookfor) { 206 if (!canon) { 207 printf("%s\n", lookfor); 208 return 0; 209 } else { 210 /* Because we are printing a canonical name, we need the 211 true converter name. We've done that already except for 212 the default name (because we want to print the exact 213 name one would get when calling ucnv_getDefaultName() 214 in non-canon mode). But since we do not know at this 215 point if we have the default name or something else, we 216 need to normalize again to the canonical converter 217 name. */ 218 219 const char *truename = ucnv_getAlias(lookfor, 0, &err); 220 if (U_SUCCESS(err)) { 221 lookfor = truename; 222 } else { 223 err = U_ZERO_ERROR; 224 } 225 } 226 } 227 228 /* Print converter names. We come here for one of two reasons: we 229 are printing all the names (lookfor was null), or we have a 230 single converter to print but in canon mode, hence we need to 231 get to it in order to print everything. */ 232 233 num = ucnv_countAvailable(); 234 if (num <= 0) { 235 initMsg(pname); 236 u_wmsg(stderr, "cantGetNames"); 237 return -1; 238 } 239 if (lookfor) { 240 num = 1; /* We know where we want to be. */ 241 } 242 243 num_stds = ucnv_countStandards(); 244 stds = (const char **) uprv_malloc(num_stds * sizeof(*stds)); 245 if (!stds) { 246 u_wmsg(stderr, "cantGetTag", u_wmsg_errorName(U_MEMORY_ALLOCATION_ERROR)); 247 return -1; 248 } else { 249 uint16_t s; 250 251 if (canon) { 252 printf("{ "); 253 } 254 for (s = 0; s < num_stds; ++s) { 255 stds[s] = ucnv_getStandard(s, &err); 256 if (canon) { 257 printf("%s ", stds[s]); 258 } 259 if (U_FAILURE(err)) { 260 u_wmsg(stderr, "cantGetTag", u_wmsg_errorName(err)); 261 goto error_cleanup; 262 } 263 } 264 if (canon) { 265 puts("}"); 266 } 267 } 268 269 for (int32_t i = 0; i < num; i++) { 270 const char *name; 271 uint16_t num_aliases; 272 273 /* Set the name either to what we are looking for, or 274 to the current converter name. */ 275 276 if (lookfor) { 277 name = lookfor; 278 } else { 279 name = ucnv_getAvailableName(i); 280 } 281 282 /* Get all the aliases associated to the name. */ 283 284 err = U_ZERO_ERROR; 285 num_aliases = ucnv_countAliases(name, &err); 286 if (U_FAILURE(err)) { 287 printf("%s", name); 288 289 UnicodeString str(name, ""); 290 putchar('\t'); 291 u_wmsg(stderr, "cantGetAliases", str.getTerminatedBuffer(), 292 u_wmsg_errorName(err)); 293 goto error_cleanup; 294 } else { 295 uint16_t a, s, t; 296 297 /* Write all the aliases and their tags. */ 298 299 for (a = 0; a < num_aliases; ++a) { 300 const char *alias = ucnv_getAlias(name, a, &err); 301 302 if (U_FAILURE(err)) { 303 UnicodeString str(name, ""); 304 putchar('\t'); 305 u_wmsg(stderr, "cantGetAliases", str.getTerminatedBuffer(), 306 u_wmsg_errorName(err)); 307 goto error_cleanup; 308 } 309 310 /* Print the current alias so that it looks right. */ 311 printf("%s%s%s", (canon ? (a == 0? "" : "\t" ) : "") , 312 alias, 313 (canon ? "" : " ")); 314 315 /* Look (slowly, linear searching) for a tag. */ 316 317 if (canon) { 318 /* -1 to skip the last standard */ 319 for (s = t = 0; s < num_stds-1; ++s) { 320 UEnumeration *nameEnum = ucnv_openStandardNames(name, stds[s], &err); 321 if (U_SUCCESS(err)) { 322 /* List the standard tags */ 323 const char *standardName; 324 UBool isFirst = TRUE; 325 UErrorCode enumError = U_ZERO_ERROR; 326 while ((standardName = uenum_next(nameEnum, NULL, &enumError))) { 327 /* See if this alias is supported by this standard. */ 328 if (!strcmp(standardName, alias)) { 329 if (!t) { 330 printf(" {"); 331 t = 1; 332 } 333 /* Print a * after the default standard name */ 334 printf(" %s%s", stds[s], (isFirst ? "*" : "")); 335 } 336 isFirst = FALSE; 337 } 338 } 339 } 340 if (t) { 341 printf(" }"); 342 } 343 } 344 /* Terminate this entry. */ 345 if (canon) { 346 puts(""); 347 } 348 349 /* Move on. */ 350 } 351 /* Terminate this entry. */ 352 if (!canon) { 353 puts(""); 354 } 355 } 356 } 357 358 /* Free temporary data. */ 359 360 uprv_free(stds); 361 362 /* Success. */ 363 364 return 0; 365error_cleanup: 366 uprv_free(stds); 367 return -1; 368} 369 370/* Print all available transliterators. If canon is non zero, print 371 one transliterator per line. */ 372 373static int printTransliterators(UBool canon) 374{ 375#if UCONFIG_NO_TRANSLITERATION 376 printf("no transliterators available because of UCONFIG_NO_TRANSLITERATION, see uconfig.h\n"); 377 return 1; 378#else 379 UErrorCode status = U_ZERO_ERROR; 380 UEnumeration *ids = utrans_openIDs(&status); 381 int32_t i, numtrans = uenum_count(ids, &status); 382 383 char sepchar = canon ? '\n' : ' '; 384 385 for (i = 0; U_SUCCESS(status)&& (i < numtrans); ++i) { 386 int32_t len; 387 const char *nextTrans = uenum_next(ids, &len, &status); 388 389 printf("%s", nextTrans); 390 if (i < numtrans - 1) { 391 putchar(sepchar); 392 } 393 } 394 395 uenum_close(ids); 396 397 /* Add a terminating newline if needed. */ 398 399 if (sepchar != '\n') { 400 putchar('\n'); 401 } 402 403 /* Success. */ 404 405 return 0; 406#endif 407} 408 409enum { 410 uSP = 0x20, // space 411 uCR = 0xd, // carriage return 412 uLF = 0xa, // line feed 413 uNL = 0x85, // newline 414 uLS = 0x2028, // line separator 415 uPS = 0x2029, // paragraph separator 416 uSig = 0xfeff // signature/BOM character 417}; 418 419static inline int32_t 420getChunkLimit(const UnicodeString &prev, const UnicodeString &s) { 421 // find one of 422 // CR, LF, CRLF, NL, LS, PS 423 // for paragraph ends (see UAX #13/Unicode 4) 424 // and include it in the chunk 425 // all of these characters are on the BMP 426 // do not include FF or VT in case they are part of a paragraph 427 // (important for bidi contexts) 428 static const UChar paraEnds[] = { 429 0xd, 0xa, 0x85, 0x2028, 0x2029 430 }; 431 enum { 432 iCR, iLF, iNL, iLS, iPS, iCount 433 }; 434 435 // first, see if there is a CRLF split between prev and s 436 if (prev.endsWith(paraEnds + iCR, 1)) { 437 if (s.startsWith(paraEnds + iLF, 1)) { 438 return 1; // split CRLF, include the LF 439 } else if (!s.isEmpty()) { 440 return 0; // complete the last chunk 441 } else { 442 return -1; // wait for actual further contents to arrive 443 } 444 } 445 446 const UChar *u = s.getBuffer(), *limit = u + s.length(); 447 UChar c; 448 449 while (u < limit) { 450 c = *u++; 451 if ( 452 ((c < uSP) && (c == uCR || c == uLF)) || 453 (c == uNL) || 454 ((c & uLS) == uLS) 455 ) { 456 if (c == uCR) { 457 // check for CRLF 458 if (u == limit) { 459 return -1; // LF may be in the next chunk 460 } else if (*u == uLF) { 461 ++u; // include the LF in this chunk 462 } 463 } 464 return (int32_t)(u - s.getBuffer()); 465 } 466 } 467 468 return -1; // continue collecting the chunk 469} 470 471enum { 472 CNV_NO_FEFF, // cannot convert the U+FEFF Unicode signature character (BOM) 473 CNV_WITH_FEFF, // can convert the U+FEFF signature character 474 CNV_ADDS_FEFF // automatically adds/detects the U+FEFF signature character 475}; 476 477static inline UChar 478nibbleToHex(uint8_t n) { 479 n &= 0xf; 480 return 481 n <= 9 ? 482 (UChar)(0x30 + n) : 483 (UChar)((0x61 - 10) + n); 484} 485 486// check the converter's Unicode signature properties; 487// the fromUnicode side of the converter must be in its initial state 488// and will be reset again if it was used 489static int32_t 490cnvSigType(UConverter *cnv) { 491 UErrorCode err; 492 int32_t result; 493 494 // test if the output charset can convert U+FEFF 495 USet *set = uset_open(1, 0); 496 err = U_ZERO_ERROR; 497 ucnv_getUnicodeSet(cnv, set, UCNV_ROUNDTRIP_SET, &err); 498 if (U_SUCCESS(err) && uset_contains(set, uSig)) { 499 result = CNV_WITH_FEFF; 500 } else { 501 result = CNV_NO_FEFF; // an error occurred or U+FEFF cannot be converted 502 } 503 uset_close(set); 504 505 if (result == CNV_WITH_FEFF) { 506 // test if the output charset emits a signature anyway 507 const UChar a[1] = { 0x61 }; // "a" 508 const UChar *in; 509 510 char buffer[20]; 511 char *out; 512 513 in = a; 514 out = buffer; 515 err = U_ZERO_ERROR; 516 ucnv_fromUnicode(cnv, 517 &out, buffer + sizeof(buffer), 518 &in, a + 1, 519 NULL, TRUE, &err); 520 ucnv_resetFromUnicode(cnv); 521 522 if (NULL != ucnv_detectUnicodeSignature(buffer, (int32_t)(out - buffer), NULL, &err) && 523 U_SUCCESS(err) 524 ) { 525 result = CNV_ADDS_FEFF; 526 } 527 } 528 529 return result; 530} 531 532class ConvertFile { 533public: 534 ConvertFile() : 535 buf(NULL), outbuf(NULL), fromoffsets(NULL), 536 bufsz(0), signature(0) {} 537 538 void 539 setBufferSize(size_t bufferSize) { 540 bufsz = bufferSize; 541 542 buf = new char[2 * bufsz]; 543 outbuf = buf + bufsz; 544 545 // +1 for an added U+FEFF in the intermediate Unicode buffer 546 fromoffsets = new int32_t[bufsz + 1]; 547 } 548 549 ~ConvertFile() { 550 delete [] buf; 551 delete [] fromoffsets; 552 } 553 554 UBool convertFile(const char *pname, 555 const char *fromcpage, 556 UConverterToUCallback toucallback, 557 const void *touctxt, 558 const char *tocpage, 559 UConverterFromUCallback fromucallback, 560 const void *fromuctxt, 561 UBool fallback, 562 const char *translit, 563 const char *infilestr, 564 FILE * outfile, int verbose); 565private: 566 friend int main(int argc, char **argv); 567 568 char *buf, *outbuf; 569 int32_t *fromoffsets; 570 571 size_t bufsz; 572 int8_t signature; // add (1) or remove (-1) a U+FEFF Unicode signature character 573}; 574 575// Convert a file from one encoding to another 576UBool 577ConvertFile::convertFile(const char *pname, 578 const char *fromcpage, 579 UConverterToUCallback toucallback, 580 const void *touctxt, 581 const char *tocpage, 582 UConverterFromUCallback fromucallback, 583 const void *fromuctxt, 584 UBool fallback, 585 const char *translit, 586 const char *infilestr, 587 FILE * outfile, int verbose) 588{ 589 FILE *infile; 590 UBool ret = TRUE; 591 UConverter *convfrom = 0; 592 UConverter *convto = 0; 593 UErrorCode err = U_ZERO_ERROR; 594 UBool flush; 595 UBool closeFile = FALSE; 596 const char *cbufp, *prevbufp; 597 char *bufp; 598 599 uint32_t infoffset = 0, outfoffset = 0; /* Where we are in the file, for error reporting. */ 600 601 const UChar *unibuf, *unibufbp; 602 UChar *unibufp; 603 604 size_t rd, wr; 605 606#if !UCONFIG_NO_TRANSLITERATION 607 Transliterator *t = 0; // Transliterator acting on Unicode data. 608 UnicodeString chunk; // One chunk of the text being collected for transformation. 609#endif 610 UnicodeString u; // String to do the transliteration. 611 int32_t ulen; 612 613 // use conversion offsets for error messages 614 // unless a transliterator is used - 615 // a text transformation will reorder characters in unpredictable ways 616 UBool useOffsets = TRUE; 617 618 // Open the correct input file or connect to stdin for reading input 619 620 if (infilestr != 0 && strcmp(infilestr, "-")) { 621 infile = fopen(infilestr, "rb"); 622 if (infile == 0) { 623 UnicodeString str1(infilestr, ""); 624 str1.append((UChar32) 0); 625 UnicodeString str2(strerror(errno), ""); 626 str2.append((UChar32) 0); 627 initMsg(pname); 628 u_wmsg(stderr, "cantOpenInputF", str1.getBuffer(), str2.getBuffer()); 629 return FALSE; 630 } 631 closeFile = TRUE; 632 } else { 633 infilestr = "-"; 634 infile = stdin; 635#ifdef USE_FILENO_BINARY_MODE 636 if (setmode(fileno(stdin), O_BINARY) == -1) { 637 initMsg(pname); 638 u_wmsg(stderr, "cantSetInBinMode"); 639 return FALSE; 640 } 641#endif 642 } 643 644 if (verbose) { 645 fprintf(stderr, "%s:\n", infilestr); 646 } 647 648#if !UCONFIG_NO_TRANSLITERATION 649 // Create transliterator as needed. 650 651 if (translit != NULL && *translit) { 652 UParseError parse; 653 UnicodeString str(translit), pestr; 654 655 /* Create from rules or by ID as needed. */ 656 657 parse.line = -1; 658 659 if (uprv_strchr(translit, ':') || uprv_strchr(translit, '>') || uprv_strchr(translit, '<') || uprv_strchr(translit, '>')) { 660 t = Transliterator::createFromRules(UNICODE_STRING_SIMPLE("Uconv"), str, UTRANS_FORWARD, parse, err); 661 } else { 662 t = Transliterator::createInstance(UnicodeString(translit, -1, US_INV), UTRANS_FORWARD, err); 663 } 664 665 if (U_FAILURE(err)) { 666 str.append((UChar32) 0); 667 initMsg(pname); 668 669 if (parse.line >= 0) { 670 UChar linebuf[20], offsetbuf[20]; 671 uprv_itou(linebuf, 20, parse.line, 10, 0); 672 uprv_itou(offsetbuf, 20, parse.offset, 10, 0); 673 u_wmsg(stderr, "cantCreateTranslitParseErr", str.getTerminatedBuffer(), 674 u_wmsg_errorName(err), linebuf, offsetbuf); 675 } else { 676 u_wmsg(stderr, "cantCreateTranslit", str.getTerminatedBuffer(), 677 u_wmsg_errorName(err)); 678 } 679 680 if (t) { 681 delete t; 682 t = 0; 683 } 684 goto error_exit; 685 } 686 687 useOffsets = FALSE; 688 } 689#endif 690 691 // Create codepage converter. If the codepage or its aliases weren't 692 // available, it returns NULL and a failure code. We also set the 693 // callbacks, and return errors in the same way. 694 695 convfrom = ucnv_open(fromcpage, &err); 696 if (U_FAILURE(err)) { 697 UnicodeString str(fromcpage, ""); 698 initMsg(pname); 699 u_wmsg(stderr, "cantOpenFromCodeset", str.getTerminatedBuffer(), 700 u_wmsg_errorName(err)); 701 goto error_exit; 702 } 703 ucnv_setToUCallBack(convfrom, toucallback, touctxt, 0, 0, &err); 704 if (U_FAILURE(err)) { 705 initMsg(pname); 706 u_wmsg(stderr, "cantSetCallback", u_wmsg_errorName(err)); 707 goto error_exit; 708 } 709 710 convto = ucnv_open(tocpage, &err); 711 if (U_FAILURE(err)) { 712 UnicodeString str(tocpage, ""); 713 initMsg(pname); 714 u_wmsg(stderr, "cantOpenToCodeset", str.getTerminatedBuffer(), 715 u_wmsg_errorName(err)); 716 goto error_exit; 717 } 718 ucnv_setFromUCallBack(convto, fromucallback, fromuctxt, 0, 0, &err); 719 if (U_FAILURE(err)) { 720 initMsg(pname); 721 u_wmsg(stderr, "cantSetCallback", u_wmsg_errorName(err)); 722 goto error_exit; 723 } 724 ucnv_setFallback(convto, fallback); 725 726 UBool willexit, fromSawEndOfBytes, toSawEndOfUnicode; 727 int8_t sig; 728 729 // OK, we can convert now. 730 sig = signature; 731 rd = 0; 732 733 do { 734 willexit = FALSE; 735 736 // input file offset at the beginning of the next buffer 737 infoffset += rd; 738 739 rd = fread(buf, 1, bufsz, infile); 740 if (ferror(infile) != 0) { 741 UnicodeString str(strerror(errno)); 742 initMsg(pname); 743 u_wmsg(stderr, "cantRead", str.getTerminatedBuffer()); 744 goto error_exit; 745 } 746 747 // Convert the read buffer into the new encoding via Unicode. 748 // After the call 'unibufp' will be placed behind the last 749 // character that was converted in the 'unibuf'. 750 // Also the 'cbufp' is positioned behind the last converted 751 // character. 752 // At the last conversion in the file, flush should be set to 753 // true so that we get all characters converted. 754 // 755 // The converter must be flushed at the end of conversion so 756 // that characters on hold also will be written. 757 758 cbufp = buf; 759 flush = (UBool)(rd != bufsz); 760 761 // convert until the input is consumed 762 do { 763 // remember the start of the current byte-to-Unicode conversion 764 prevbufp = cbufp; 765 766 unibuf = unibufp = u.getBuffer((int32_t)bufsz); 767 768 // Use bufsz instead of u.getCapacity() for the targetLimit 769 // so that we don't overflow fromoffsets[]. 770 ucnv_toUnicode(convfrom, &unibufp, unibuf + bufsz, &cbufp, 771 buf + rd, useOffsets ? fromoffsets : NULL, flush, &err); 772 773 ulen = (int32_t)(unibufp - unibuf); 774 u.releaseBuffer(U_SUCCESS(err) ? ulen : 0); 775 776 // fromSawEndOfBytes indicates that ucnv_toUnicode() is done 777 // converting all of the input bytes. 778 // It works like this because ucnv_toUnicode() returns only under the 779 // following conditions: 780 // - an error occurred during conversion (an error code is set) 781 // - the target buffer is filled (the error code indicates an overflow) 782 // - the source is consumed 783 // That is, if the error code does not indicate a failure, 784 // not even an overflow, then the source must be consumed entirely. 785 fromSawEndOfBytes = (UBool)U_SUCCESS(err); 786 787 if (err == U_BUFFER_OVERFLOW_ERROR) { 788 err = U_ZERO_ERROR; 789 } else if (U_FAILURE(err)) { 790 char pos[32], errorBytes[32]; 791 int8_t i, length, errorLength; 792 793 UErrorCode localError = U_ZERO_ERROR; 794 errorLength = (int8_t)sizeof(errorBytes); 795 ucnv_getInvalidChars(convfrom, errorBytes, &errorLength, &localError); 796 if (U_FAILURE(localError) || errorLength == 0) { 797 errorLength = 1; 798 } 799 800 // print the input file offset of the start of the error bytes: 801 // input file offset of the current byte buffer + 802 // length of the just consumed bytes - 803 // length of the error bytes 804 length = 805 (int8_t)sprintf(pos, "%d", 806 (int)(infoffset + (cbufp - buf) - errorLength)); 807 808 // output the bytes that caused the error 809 UnicodeString str; 810 for (i = 0; i < errorLength; ++i) { 811 if (i > 0) { 812 str.append((UChar)uSP); 813 } 814 str.append(nibbleToHex((uint8_t)errorBytes[i] >> 4)); 815 str.append(nibbleToHex((uint8_t)errorBytes[i])); 816 } 817 818 initMsg(pname); 819 u_wmsg(stderr, "problemCvtToU", 820 UnicodeString(pos, length, "").getTerminatedBuffer(), 821 str.getTerminatedBuffer(), 822 u_wmsg_errorName(err)); 823 824 willexit = TRUE; 825 err = U_ZERO_ERROR; /* reset the error for the rest of the conversion. */ 826 } 827 828 // Replaced a check for whether the input was consumed by 829 // looping until it is; message key "premEndInput" now obsolete. 830 831 if (ulen == 0) { 832 continue; 833 } 834 835 // remove a U+FEFF Unicode signature character if requested 836 if (sig < 0) { 837 if (u.charAt(0) == uSig) { 838 u.remove(0, 1); 839 840 // account for the removed UChar and offset 841 --ulen; 842 843 if (useOffsets) { 844 // remove an offset from fromoffsets[] as well 845 // to keep the array parallel with the UChars 846 memmove(fromoffsets, fromoffsets + 1, ulen * 4); 847 } 848 849 } 850 sig = 0; 851 } 852 853#if !UCONFIG_NO_TRANSLITERATION 854 // Transliterate/transform if needed. 855 856 // For transformation, we use chunking code - 857 // collect Unicode input until, for example, an end-of-line, 858 // then transform and output-convert that and continue collecting. 859 // This makes the transformation result independent of the buffer size 860 // while avoiding the slower keyboard mode. 861 // The end-of-chunk characters are completely included in the 862 // transformed string in case they are to be transformed themselves. 863 if (t != NULL) { 864 UnicodeString out; 865 int32_t chunkLimit; 866 867 do { 868 chunkLimit = getChunkLimit(chunk, u); 869 if (chunkLimit < 0 && flush && fromSawEndOfBytes) { 870 // use all of the rest at the end of the text 871 chunkLimit = u.length(); 872 } 873 if (chunkLimit >= 0) { 874 // complete the chunk and transform it 875 chunk.append(u, 0, chunkLimit); 876 u.remove(0, chunkLimit); 877 t->transliterate(chunk); 878 879 // append the transformation result to the result and empty the chunk 880 out.append(chunk); 881 chunk.remove(); 882 } else { 883 // continue collecting the chunk 884 chunk.append(u); 885 break; 886 } 887 } while (!u.isEmpty()); 888 889 u = out; 890 ulen = u.length(); 891 } 892#endif 893 894 // add a U+FEFF Unicode signature character if requested 895 // and possible/necessary 896 if (sig > 0) { 897 if (u.charAt(0) != uSig && cnvSigType(convto) == CNV_WITH_FEFF) { 898 u.insert(0, (UChar)uSig); 899 900 if (useOffsets) { 901 // insert a pseudo-offset into fromoffsets[] as well 902 // to keep the array parallel with the UChars 903 memmove(fromoffsets + 1, fromoffsets, ulen * 4); 904 fromoffsets[0] = -1; 905 } 906 907 // account for the additional UChar and offset 908 ++ulen; 909 } 910 sig = 0; 911 } 912 913 // Convert the Unicode buffer into the destination codepage 914 // Again 'bufp' will be placed behind the last converted character 915 // And 'unibufp' will be placed behind the last converted unicode character 916 // At the last conversion flush should be set to true to ensure that 917 // all characters left get converted 918 919 unibuf = unibufbp = u.getBuffer(); 920 921 do { 922 bufp = outbuf; 923 924 // Use fromSawEndOfBytes in addition to the flush flag - 925 // it indicates whether the intermediate Unicode string 926 // contains the very last UChars for the very last input bytes. 927 ucnv_fromUnicode(convto, &bufp, outbuf + bufsz, 928 &unibufbp, 929 unibuf + ulen, 930 NULL, (UBool)(flush && fromSawEndOfBytes), &err); 931 932 // toSawEndOfUnicode indicates that ucnv_fromUnicode() is done 933 // converting all of the intermediate UChars. 934 // See comment for fromSawEndOfBytes. 935 toSawEndOfUnicode = (UBool)U_SUCCESS(err); 936 937 if (err == U_BUFFER_OVERFLOW_ERROR) { 938 err = U_ZERO_ERROR; 939 } else if (U_FAILURE(err)) { 940 UChar errorUChars[4]; 941 const char *errtag; 942 char pos[32]; 943 UChar32 c; 944 int8_t i, length, errorLength; 945 946 UErrorCode localError = U_ZERO_ERROR; 947 errorLength = (int8_t)UPRV_LENGTHOF(errorUChars); 948 ucnv_getInvalidUChars(convto, errorUChars, &errorLength, &localError); 949 if (U_FAILURE(localError) || errorLength == 0) { 950 // need at least 1 so that we don't access beyond the length of fromoffsets[] 951 errorLength = 1; 952 } 953 954 int32_t ferroffset; 955 956 if (useOffsets) { 957 // Unicode buffer offset of the start of the error UChars 958 ferroffset = (int32_t)((unibufbp - unibuf) - errorLength); 959 if (ferroffset < 0) { 960 // approximation - the character started in the previous Unicode buffer 961 ferroffset = 0; 962 } 963 964 // get the corresponding byte offset out of fromoffsets[] 965 // go back if the offset is not known for some of the UChars 966 int32_t fromoffset; 967 do { 968 fromoffset = fromoffsets[ferroffset]; 969 } while (fromoffset < 0 && --ferroffset >= 0); 970 971 // total input file offset = 972 // input file offset of the current byte buffer + 973 // byte buffer offset of where the current Unicode buffer is converted from + 974 // fromoffsets[Unicode offset] 975 ferroffset = infoffset + (prevbufp - buf) + fromoffset; 976 errtag = "problemCvtFromU"; 977 } else { 978 // Do not use fromoffsets if (t != NULL) because the Unicode text may 979 // be different from what the offsets refer to. 980 981 // output file offset 982 ferroffset = (int32_t)(outfoffset + (bufp - outbuf)); 983 errtag = "problemCvtFromUOut"; 984 } 985 986 length = (int8_t)sprintf(pos, "%u", (int)ferroffset); 987 988 // output the code points that caused the error 989 UnicodeString str; 990 for (i = 0; i < errorLength;) { 991 if (i > 0) { 992 str.append((UChar)uSP); 993 } 994 U16_NEXT(errorUChars, i, errorLength, c); 995 if (c >= 0x100000) { 996 str.append(nibbleToHex((uint8_t)(c >> 20))); 997 } 998 if (c >= 0x10000) { 999 str.append(nibbleToHex((uint8_t)(c >> 16))); 1000 } 1001 str.append(nibbleToHex((uint8_t)(c >> 12))); 1002 str.append(nibbleToHex((uint8_t)(c >> 8))); 1003 str.append(nibbleToHex((uint8_t)(c >> 4))); 1004 str.append(nibbleToHex((uint8_t)c)); 1005 } 1006 1007 initMsg(pname); 1008 u_wmsg(stderr, errtag, 1009 UnicodeString(pos, length, "").getTerminatedBuffer(), 1010 str.getTerminatedBuffer(), 1011 u_wmsg_errorName(err)); 1012 u_wmsg(stderr, "errorUnicode", str.getTerminatedBuffer()); 1013 1014 willexit = TRUE; 1015 err = U_ZERO_ERROR; /* reset the error for the rest of the conversion. */ 1016 } 1017 1018 // Replaced a check for whether the intermediate Unicode characters were all consumed by 1019 // looping until they are; message key "premEnd" now obsolete. 1020 1021 // Finally, write the converted buffer to the output file 1022 size_t outlen = (size_t) (bufp - outbuf); 1023 outfoffset += (int32_t)(wr = fwrite(outbuf, 1, outlen, outfile)); 1024 if (wr != outlen) { 1025 UnicodeString str(strerror(errno)); 1026 initMsg(pname); 1027 u_wmsg(stderr, "cantWrite", str.getTerminatedBuffer()); 1028 willexit = TRUE; 1029 } 1030 1031 if (willexit) { 1032 goto error_exit; 1033 } 1034 } while (!toSawEndOfUnicode); 1035 } while (!fromSawEndOfBytes); 1036 } while (!flush); // Stop when we have flushed the 1037 // converters (this means that it's 1038 // the end of output) 1039 1040 goto normal_exit; 1041 1042error_exit: 1043 ret = FALSE; 1044 1045normal_exit: 1046 // Cleanup. 1047 1048 ucnv_close(convfrom); 1049 ucnv_close(convto); 1050 1051#if !UCONFIG_NO_TRANSLITERATION 1052 delete t; 1053#endif 1054 1055 if (closeFile) { 1056 fclose(infile); 1057 } 1058 1059 return ret; 1060} 1061 1062static void usage(const char *pname, int ecode) { 1063 const UChar *msg; 1064 int32_t msgLen; 1065 UErrorCode err = U_ZERO_ERROR; 1066 FILE *fp = ecode ? stderr : stdout; 1067 int res; 1068 1069 initMsg(pname); 1070 msg = 1071 ures_getStringByKey(gBundle, ecode ? "lcUsageWord" : "ucUsageWord", 1072 &msgLen, &err); 1073 UnicodeString upname(pname, (int32_t)(uprv_strlen(pname) + 1)); 1074 UnicodeString mname(msg, msgLen + 1); 1075 1076 res = u_wmsg(fp, "usage", mname.getBuffer(), upname.getBuffer()); 1077 if (!ecode) { 1078 if (!res) { 1079 fputc('\n', fp); 1080 } 1081 if (!u_wmsg(fp, "help")) { 1082 /* Now dump callbacks and finish. */ 1083 1084 int i, count = 1085 sizeof(transcode_callbacks) / sizeof(*transcode_callbacks); 1086 for (i = 0; i < count; ++i) { 1087 fprintf(fp, " %s", transcode_callbacks[i].name); 1088 } 1089 fputc('\n', fp); 1090 } 1091 } 1092 1093 exit(ecode); 1094} 1095 1096extern int 1097main(int argc, char **argv) 1098{ 1099 FILE *outfile; 1100 int ret = 0; 1101 1102 size_t bufsz = DEFAULT_BUFSZ; 1103 1104 const char *fromcpage = 0; 1105 const char *tocpage = 0; 1106 const char *translit = 0; 1107 const char *outfilestr = 0; 1108 UBool fallback = FALSE; 1109 1110 UConverterFromUCallback fromucallback = UCNV_FROM_U_CALLBACK_STOP; 1111 const void *fromuctxt = 0; 1112 UConverterToUCallback toucallback = UCNV_TO_U_CALLBACK_STOP; 1113 const void *touctxt = 0; 1114 1115 char **iter, **remainArgv, **remainArgvLimit; 1116 char **end = argv + argc; 1117 1118 const char *pname; 1119 1120 UBool printConvs = FALSE, printCanon = FALSE, printTranslits = FALSE; 1121 const char *printName = 0; 1122 1123 UBool verbose = FALSE; 1124 UErrorCode status = U_ZERO_ERROR; 1125 1126 ConvertFile cf; 1127 1128 /* Initialize ICU */ 1129 u_init(&status); 1130 if (U_FAILURE(status)) { 1131 fprintf(stderr, "%s: can not initialize ICU. status = %s\n", 1132 argv[0], u_errorName(status)); 1133 exit(1); 1134 } 1135 1136 // Get and prettify pname. 1137 pname = uprv_strrchr(*argv, U_FILE_SEP_CHAR); 1138#if U_PLATFORM_USES_ONLY_WIN32_API 1139 if (!pname) { 1140 pname = uprv_strrchr(*argv, '/'); 1141 } 1142#endif 1143 if (!pname) { 1144 pname = *argv; 1145 } else { 1146 ++pname; 1147 } 1148 1149 // First, get the arguments from command-line 1150 // to know the codepages to convert between 1151 1152 remainArgv = remainArgvLimit = argv + 1; 1153 for (iter = argv + 1; iter != end; iter++) { 1154 // Check for from charset 1155 if (strcmp("-f", *iter) == 0 || !strcmp("--from-code", *iter)) { 1156 iter++; 1157 if (iter != end) 1158 fromcpage = *iter; 1159 else 1160 usage(pname, 1); 1161 } else if (strcmp("-t", *iter) == 0 || !strcmp("--to-code", *iter)) { 1162 iter++; 1163 if (iter != end) 1164 tocpage = *iter; 1165 else 1166 usage(pname, 1); 1167 } else if (strcmp("-x", *iter) == 0) { 1168 iter++; 1169 if (iter != end) 1170 translit = *iter; 1171 else 1172 usage(pname, 1); 1173 } else if (!strcmp("--fallback", *iter)) { 1174 fallback = TRUE; 1175 } else if (!strcmp("--no-fallback", *iter)) { 1176 fallback = FALSE; 1177 } else if (strcmp("-b", *iter) == 0 || !strcmp("--block-size", *iter)) { 1178 iter++; 1179 if (iter != end) { 1180 bufsz = atoi(*iter); 1181 if ((int) bufsz <= 0) { 1182 initMsg(pname); 1183 UnicodeString str(*iter); 1184 initMsg(pname); 1185 u_wmsg(stderr, "badBlockSize", str.getTerminatedBuffer()); 1186 return 3; 1187 } 1188 } else { 1189 usage(pname, 1); 1190 } 1191 } else if (strcmp("-l", *iter) == 0 || !strcmp("--list", *iter)) { 1192 if (printTranslits) { 1193 usage(pname, 1); 1194 } 1195 printConvs = TRUE; 1196 } else if (strcmp("--default-code", *iter) == 0) { 1197 if (printTranslits) { 1198 usage(pname, 1); 1199 } 1200 printName = ucnv_getDefaultName(); 1201 } else if (strcmp("--list-code", *iter) == 0) { 1202 if (printTranslits) { 1203 usage(pname, 1); 1204 } 1205 1206 iter++; 1207 if (iter != end) { 1208 UErrorCode e = U_ZERO_ERROR; 1209 printName = ucnv_getAlias(*iter, 0, &e); 1210 if (U_FAILURE(e) || !printName) { 1211 UnicodeString str(*iter); 1212 initMsg(pname); 1213 u_wmsg(stderr, "noSuchCodeset", str.getTerminatedBuffer()); 1214 return 2; 1215 } 1216 } else 1217 usage(pname, 1); 1218 } else if (strcmp("--canon", *iter) == 0) { 1219 printCanon = TRUE; 1220 } else if (strcmp("-L", *iter) == 0 1221 || !strcmp("--list-transliterators", *iter)) { 1222 if (printConvs) { 1223 usage(pname, 1); 1224 } 1225 printTranslits = TRUE; 1226 } else if (strcmp("-h", *iter) == 0 || !strcmp("-?", *iter) 1227 || !strcmp("--help", *iter)) { 1228 usage(pname, 0); 1229 } else if (!strcmp("-c", *iter)) { 1230 fromucallback = UCNV_FROM_U_CALLBACK_SKIP; 1231 } else if (!strcmp("--to-callback", *iter)) { 1232 iter++; 1233 if (iter != end) { 1234 const struct callback_ent *cbe = findCallback(*iter); 1235 if (cbe) { 1236 fromucallback = cbe->fromu; 1237 fromuctxt = cbe->fromuctxt; 1238 } else { 1239 UnicodeString str(*iter); 1240 initMsg(pname); 1241 u_wmsg(stderr, "unknownCallback", str.getTerminatedBuffer()); 1242 return 4; 1243 } 1244 } else { 1245 usage(pname, 1); 1246 } 1247 } else if (!strcmp("--from-callback", *iter)) { 1248 iter++; 1249 if (iter != end) { 1250 const struct callback_ent *cbe = findCallback(*iter); 1251 if (cbe) { 1252 toucallback = cbe->tou; 1253 touctxt = cbe->touctxt; 1254 } else { 1255 UnicodeString str(*iter); 1256 initMsg(pname); 1257 u_wmsg(stderr, "unknownCallback", str.getTerminatedBuffer()); 1258 return 4; 1259 } 1260 } else { 1261 usage(pname, 1); 1262 } 1263 } else if (!strcmp("-i", *iter)) { 1264 toucallback = UCNV_TO_U_CALLBACK_SKIP; 1265 } else if (!strcmp("--callback", *iter)) { 1266 iter++; 1267 if (iter != end) { 1268 const struct callback_ent *cbe = findCallback(*iter); 1269 if (cbe) { 1270 fromucallback = cbe->fromu; 1271 fromuctxt = cbe->fromuctxt; 1272 toucallback = cbe->tou; 1273 touctxt = cbe->touctxt; 1274 } else { 1275 UnicodeString str(*iter); 1276 initMsg(pname); 1277 u_wmsg(stderr, "unknownCallback", str.getTerminatedBuffer()); 1278 return 4; 1279 } 1280 } else { 1281 usage(pname, 1); 1282 } 1283 } else if (!strcmp("-s", *iter) || !strcmp("--silent", *iter)) { 1284 verbose = FALSE; 1285 } else if (!strcmp("-v", *iter) || !strcmp("--verbose", *iter)) { 1286 verbose = TRUE; 1287 } else if (!strcmp("-V", *iter) || !strcmp("--version", *iter)) { 1288 printf("%s v2.1 ICU " U_ICU_VERSION "\n", pname); 1289 return 0; 1290 } else if (!strcmp("-o", *iter) || !strcmp("--output", *iter)) { 1291 ++iter; 1292 if (iter != end && !outfilestr) { 1293 outfilestr = *iter; 1294 } else { 1295 usage(pname, 1); 1296 } 1297 } else if (0 == strcmp("--add-signature", *iter)) { 1298 cf.signature = 1; 1299 } else if (0 == strcmp("--remove-signature", *iter)) { 1300 cf.signature = -1; 1301 } else if (**iter == '-' && (*iter)[1]) { 1302 usage(pname, 1); 1303 } else { 1304 // move a non-option up in argv[] 1305 *remainArgvLimit++ = *iter; 1306 } 1307 } 1308 1309 if (printConvs || printName) { 1310 return printConverters(pname, printName, printCanon) ? 2 : 0; 1311 } else if (printTranslits) { 1312 return printTransliterators(printCanon) ? 3 : 0; 1313 } 1314 1315 if (!fromcpage || !uprv_strcmp(fromcpage, "-")) { 1316 fromcpage = ucnv_getDefaultName(); 1317 } 1318 if (!tocpage || !uprv_strcmp(tocpage, "-")) { 1319 tocpage = ucnv_getDefaultName(); 1320 } 1321 1322 // Open the correct output file or connect to stdout for reading input 1323 if (outfilestr != 0 && strcmp(outfilestr, "-")) { 1324 outfile = fopen(outfilestr, "wb"); 1325 if (outfile == 0) { 1326 UnicodeString str1(outfilestr, ""); 1327 UnicodeString str2(strerror(errno), ""); 1328 initMsg(pname); 1329 u_wmsg(stderr, "cantCreateOutputF", 1330 str1.getBuffer(), str2.getBuffer()); 1331 return 1; 1332 } 1333 } else { 1334 outfilestr = "-"; 1335 outfile = stdout; 1336#ifdef USE_FILENO_BINARY_MODE 1337 if (setmode(fileno(outfile), O_BINARY) == -1) { 1338 u_wmsg(stderr, "cantSetOutBinMode"); 1339 exit(-1); 1340 } 1341#endif 1342 } 1343 1344 /* Loop again on the arguments to find all the input files, and 1345 convert them. */ 1346 1347 cf.setBufferSize(bufsz); 1348 1349 if(remainArgv < remainArgvLimit) { 1350 for (iter = remainArgv; iter != remainArgvLimit; iter++) { 1351 if (!cf.convertFile( 1352 pname, fromcpage, toucallback, touctxt, tocpage, 1353 fromucallback, fromuctxt, fallback, translit, *iter, 1354 outfile, verbose) 1355 ) { 1356 goto error_exit; 1357 } 1358 } 1359 } else { 1360 if (!cf.convertFile( 1361 pname, fromcpage, toucallback, touctxt, tocpage, 1362 fromucallback, fromuctxt, fallback, translit, 0, 1363 outfile, verbose) 1364 ) { 1365 goto error_exit; 1366 } 1367 } 1368 1369 goto normal_exit; 1370error_exit: 1371#if !UCONFIG_NO_LEGACY_CONVERSION 1372 ret = 1; 1373#else 1374 fprintf(stderr, "uconv error: UCONFIG_NO_LEGACY_CONVERSION is on. See uconfig.h\n"); 1375#endif 1376normal_exit: 1377 1378 if (outfile != stdout) { 1379 fclose(outfile); 1380 } 1381 1382 u_cleanup(); 1383 1384 return ret; 1385} 1386 1387 1388/* 1389 * Hey, Emacs, please set the following: 1390 * 1391 * Local Variables: 1392 * indent-tabs-mode: nil 1393 * End: 1394 * 1395 */ 1396