CharsetUtil.java revision 7dbeb7d91c7b3970426af6debe48301ba053fd79
1/**************************************************************** 2 * Licensed to the Apache Software Foundation (ASF) under one * 3 * or more contributor license agreements. See the NOTICE file * 4 * distributed with this work for additional information * 5 * regarding copyright ownership. The ASF licenses this file * 6 * to you under the Apache License, Version 2.0 (the * 7 * "License"); you may not use this file except in compliance * 8 * with the License. You may obtain a copy of the License at * 9 * * 10 * http://www.apache.org/licenses/LICENSE-2.0 * 11 * * 12 * Unless required by applicable law or agreed to in writing, * 13 * software distributed under the License is distributed on an * 14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * 15 * KIND, either express or implied. See the License for the * 16 * specific language governing permissions and limitations * 17 * under the License. * 18 ****************************************************************/ 19 20package org.apache.james.mime4j.util; 21 22import java.io.UnsupportedEncodingException; 23import java.nio.charset.IllegalCharsetNameException; 24import java.nio.charset.UnsupportedCharsetException; 25import java.util.HashMap; 26import java.util.TreeSet; 27 28import org.apache.commons.logging.Log; 29import org.apache.commons.logging.LogFactory; 30 31/** 32 * Utility class for working with character sets. It is somewhat similar to 33 * the Java 1.4 <code>java.nio.charset.Charset</code> class but knows many 34 * more aliases and is compatible with Java 1.3. It will use a simple detection 35 * mechanism to detect what character sets the current VM supports. This will 36 * be a sub-set of the character sets listed in the 37 * <a href="http://java.sun.com/j2se/1.5.0/docs/guide/intl/encoding.doc.html"> 38 * Java 1.5 (J2SE5.0) Supported Encodings</a> document. 39 * <p> 40 * The <a href="http://www.iana.org/assignments/character-sets"> 41 * IANA Character Sets</a> document has been used to determine the preferred 42 * MIME character set names and to get a list of known aliases. 43 * <p> 44 * This is a complete list of the character sets known to this class: 45 * <table> 46 * <tr> 47 * <td>Canonical (Java) name</td> 48 * <td>MIME preferred</td> 49 * <td>Aliases</td> 50 * </tr> 51 * <tr> 52 * <td>ASCII</td> 53 * <td>US-ASCII</td> 54 * <td>ANSI_X3.4-1968 iso-ir-6 ANSI_X3.4-1986 ISO_646.irv:1991 ISO646-US us IBM367 cp367 csASCII ascii7 646 iso_646.irv:1983 </td> 55 * </tr> 56 * <tr> 57 * <td>Big5</td> 58 * <td>Big5</td> 59 * <td>csBig5 CN-Big5 BIG-FIVE BIGFIVE </td> 60 * </tr> 61 * <tr> 62 * <td>Big5_HKSCS</td> 63 * <td>Big5-HKSCS</td> 64 * <td>big5hkscs </td> 65 * </tr> 66 * <tr> 67 * <td>Big5_Solaris</td> 68 * <td>?</td> 69 * <td></td> 70 * </tr> 71 * <tr> 72 * <td>Cp037</td> 73 * <td>IBM037</td> 74 * <td>ebcdic-cp-us ebcdic-cp-ca ebcdic-cp-wt ebcdic-cp-nl csIBM037 </td> 75 * </tr> 76 * <tr> 77 * <td>Cp1006</td> 78 * <td>?</td> 79 * <td></td> 80 * </tr> 81 * <tr> 82 * <td>Cp1025</td> 83 * <td>?</td> 84 * <td></td> 85 * </tr> 86 * <tr> 87 * <td>Cp1026</td> 88 * <td>IBM1026</td> 89 * <td>csIBM1026 </td> 90 * </tr> 91 * <tr> 92 * <td>Cp1046</td> 93 * <td>?</td> 94 * <td></td> 95 * </tr> 96 * <tr> 97 * <td>Cp1047</td> 98 * <td>IBM1047</td> 99 * <td>IBM-1047 </td> 100 * </tr> 101 * <tr> 102 * <td>Cp1097</td> 103 * <td>?</td> 104 * <td></td> 105 * </tr> 106 * <tr> 107 * <td>Cp1098</td> 108 * <td>?</td> 109 * <td></td> 110 * </tr> 111 * <tr> 112 * <td>Cp1112</td> 113 * <td>?</td> 114 * <td></td> 115 * </tr> 116 * <tr> 117 * <td>Cp1122</td> 118 * <td>?</td> 119 * <td></td> 120 * </tr> 121 * <tr> 122 * <td>Cp1123</td> 123 * <td>?</td> 124 * <td></td> 125 * </tr> 126 * <tr> 127 * <td>Cp1124</td> 128 * <td>?</td> 129 * <td></td> 130 * </tr> 131 * <tr> 132 * <td>Cp1140</td> 133 * <td>IBM01140</td> 134 * <td>CCSID01140 CP01140 ebcdic-us-37+euro </td> 135 * </tr> 136 * <tr> 137 * <td>Cp1141</td> 138 * <td>IBM01141</td> 139 * <td>CCSID01141 CP01141 ebcdic-de-273+euro </td> 140 * </tr> 141 * <tr> 142 * <td>Cp1142</td> 143 * <td>IBM01142</td> 144 * <td>CCSID01142 CP01142 ebcdic-dk-277+euro ebcdic-no-277+euro </td> 145 * </tr> 146 * <tr> 147 * <td>Cp1143</td> 148 * <td>IBM01143</td> 149 * <td>CCSID01143 CP01143 ebcdic-fi-278+euro ebcdic-se-278+euro </td> 150 * </tr> 151 * <tr> 152 * <td>Cp1144</td> 153 * <td>IBM01144</td> 154 * <td>CCSID01144 CP01144 ebcdic-it-280+euro </td> 155 * </tr> 156 * <tr> 157 * <td>Cp1145</td> 158 * <td>IBM01145</td> 159 * <td>CCSID01145 CP01145 ebcdic-es-284+euro </td> 160 * </tr> 161 * <tr> 162 * <td>Cp1146</td> 163 * <td>IBM01146</td> 164 * <td>CCSID01146 CP01146 ebcdic-gb-285+euro </td> 165 * </tr> 166 * <tr> 167 * <td>Cp1147</td> 168 * <td>IBM01147</td> 169 * <td>CCSID01147 CP01147 ebcdic-fr-297+euro </td> 170 * </tr> 171 * <tr> 172 * <td>Cp1148</td> 173 * <td>IBM01148</td> 174 * <td>CCSID01148 CP01148 ebcdic-international-500+euro </td> 175 * </tr> 176 * <tr> 177 * <td>Cp1149</td> 178 * <td>IBM01149</td> 179 * <td>CCSID01149 CP01149 ebcdic-is-871+euro </td> 180 * </tr> 181 * <tr> 182 * <td>Cp1250</td> 183 * <td>windows-1250</td> 184 * <td></td> 185 * </tr> 186 * <tr> 187 * <td>Cp1251</td> 188 * <td>windows-1251</td> 189 * <td></td> 190 * </tr> 191 * <tr> 192 * <td>Cp1252</td> 193 * <td>windows-1252</td> 194 * <td></td> 195 * </tr> 196 * <tr> 197 * <td>Cp1253</td> 198 * <td>windows-1253</td> 199 * <td></td> 200 * </tr> 201 * <tr> 202 * <td>Cp1254</td> 203 * <td>windows-1254</td> 204 * <td></td> 205 * </tr> 206 * <tr> 207 * <td>Cp1255</td> 208 * <td>windows-1255</td> 209 * <td></td> 210 * </tr> 211 * <tr> 212 * <td>Cp1256</td> 213 * <td>windows-1256</td> 214 * <td></td> 215 * </tr> 216 * <tr> 217 * <td>Cp1257</td> 218 * <td>windows-1257</td> 219 * <td></td> 220 * </tr> 221 * <tr> 222 * <td>Cp1258</td> 223 * <td>windows-1258</td> 224 * <td></td> 225 * </tr> 226 * <tr> 227 * <td>Cp1381</td> 228 * <td>?</td> 229 * <td></td> 230 * </tr> 231 * <tr> 232 * <td>Cp1383</td> 233 * <td>?</td> 234 * <td></td> 235 * </tr> 236 * <tr> 237 * <td>Cp273</td> 238 * <td>IBM273</td> 239 * <td>csIBM273 </td> 240 * </tr> 241 * <tr> 242 * <td>Cp277</td> 243 * <td>IBM277</td> 244 * <td>EBCDIC-CP-DK EBCDIC-CP-NO csIBM277 </td> 245 * </tr> 246 * <tr> 247 * <td>Cp278</td> 248 * <td>IBM278</td> 249 * <td>CP278 ebcdic-cp-fi ebcdic-cp-se csIBM278 </td> 250 * </tr> 251 * <tr> 252 * <td>Cp280</td> 253 * <td>IBM280</td> 254 * <td>ebcdic-cp-it csIBM280 </td> 255 * </tr> 256 * <tr> 257 * <td>Cp284</td> 258 * <td>IBM284</td> 259 * <td>ebcdic-cp-es csIBM284 </td> 260 * </tr> 261 * <tr> 262 * <td>Cp285</td> 263 * <td>IBM285</td> 264 * <td>ebcdic-cp-gb csIBM285 </td> 265 * </tr> 266 * <tr> 267 * <td>Cp297</td> 268 * <td>IBM297</td> 269 * <td>ebcdic-cp-fr csIBM297 </td> 270 * </tr> 271 * <tr> 272 * <td>Cp33722</td> 273 * <td>?</td> 274 * <td></td> 275 * </tr> 276 * <tr> 277 * <td>Cp420</td> 278 * <td>IBM420</td> 279 * <td>ebcdic-cp-ar1 csIBM420 </td> 280 * </tr> 281 * <tr> 282 * <td>Cp424</td> 283 * <td>IBM424</td> 284 * <td>ebcdic-cp-he csIBM424 </td> 285 * </tr> 286 * <tr> 287 * <td>Cp437</td> 288 * <td>IBM437</td> 289 * <td>437 csPC8CodePage437 </td> 290 * </tr> 291 * <tr> 292 * <td>Cp500</td> 293 * <td>IBM500</td> 294 * <td>ebcdic-cp-be ebcdic-cp-ch csIBM500 </td> 295 * </tr> 296 * <tr> 297 * <td>Cp737</td> 298 * <td>?</td> 299 * <td></td> 300 * </tr> 301 * <tr> 302 * <td>Cp775</td> 303 * <td>IBM775</td> 304 * <td>csPC775Baltic </td> 305 * </tr> 306 * <tr> 307 * <td>Cp838</td> 308 * <td>IBM-Thai</td> 309 * <td></td> 310 * </tr> 311 * <tr> 312 * <td>Cp850</td> 313 * <td>IBM850</td> 314 * <td>850 csPC850Multilingual </td> 315 * </tr> 316 * <tr> 317 * <td>Cp852</td> 318 * <td>IBM852</td> 319 * <td>852 csPCp852 </td> 320 * </tr> 321 * <tr> 322 * <td>Cp855</td> 323 * <td>IBM855</td> 324 * <td>855 csIBM855 </td> 325 * </tr> 326 * <tr> 327 * <td>Cp856</td> 328 * <td>?</td> 329 * <td></td> 330 * </tr> 331 * <tr> 332 * <td>Cp857</td> 333 * <td>IBM857</td> 334 * <td>857 csIBM857 </td> 335 * </tr> 336 * <tr> 337 * <td>Cp858</td> 338 * <td>IBM00858</td> 339 * <td>CCSID00858 CP00858 PC-Multilingual-850+euro </td> 340 * </tr> 341 * <tr> 342 * <td>Cp860</td> 343 * <td>IBM860</td> 344 * <td>860 csIBM860 </td> 345 * </tr> 346 * <tr> 347 * <td>Cp861</td> 348 * <td>IBM861</td> 349 * <td>861 cp-is csIBM861 </td> 350 * </tr> 351 * <tr> 352 * <td>Cp862</td> 353 * <td>IBM862</td> 354 * <td>862 csPC862LatinHebrew </td> 355 * </tr> 356 * <tr> 357 * <td>Cp863</td> 358 * <td>IBM863</td> 359 * <td>863 csIBM863 </td> 360 * </tr> 361 * <tr> 362 * <td>Cp864</td> 363 * <td>IBM864</td> 364 * <td>cp864 csIBM864 </td> 365 * </tr> 366 * <tr> 367 * <td>Cp865</td> 368 * <td>IBM865</td> 369 * <td>865 csIBM865 </td> 370 * </tr> 371 * <tr> 372 * <td>Cp866</td> 373 * <td>IBM866</td> 374 * <td>866 csIBM866 </td> 375 * </tr> 376 * <tr> 377 * <td>Cp868</td> 378 * <td>IBM868</td> 379 * <td>cp-ar csIBM868 </td> 380 * </tr> 381 * <tr> 382 * <td>Cp869</td> 383 * <td>IBM869</td> 384 * <td>cp-gr csIBM869 </td> 385 * </tr> 386 * <tr> 387 * <td>Cp870</td> 388 * <td>IBM870</td> 389 * <td>ebcdic-cp-roece ebcdic-cp-yu csIBM870 </td> 390 * </tr> 391 * <tr> 392 * <td>Cp871</td> 393 * <td>IBM871</td> 394 * <td>ebcdic-cp-is csIBM871 </td> 395 * </tr> 396 * <tr> 397 * <td>Cp875</td> 398 * <td>?</td> 399 * <td></td> 400 * </tr> 401 * <tr> 402 * <td>Cp918</td> 403 * <td>IBM918</td> 404 * <td>ebcdic-cp-ar2 csIBM918 </td> 405 * </tr> 406 * <tr> 407 * <td>Cp921</td> 408 * <td>?</td> 409 * <td></td> 410 * </tr> 411 * <tr> 412 * <td>Cp922</td> 413 * <td>?</td> 414 * <td></td> 415 * </tr> 416 * <tr> 417 * <td>Cp930</td> 418 * <td>?</td> 419 * <td></td> 420 * </tr> 421 * <tr> 422 * <td>Cp933</td> 423 * <td>?</td> 424 * <td></td> 425 * </tr> 426 * <tr> 427 * <td>Cp935</td> 428 * <td>?</td> 429 * <td></td> 430 * </tr> 431 * <tr> 432 * <td>Cp937</td> 433 * <td>?</td> 434 * <td></td> 435 * </tr> 436 * <tr> 437 * <td>Cp939</td> 438 * <td>?</td> 439 * <td></td> 440 * </tr> 441 * <tr> 442 * <td>Cp942</td> 443 * <td>?</td> 444 * <td></td> 445 * </tr> 446 * <tr> 447 * <td>Cp942C</td> 448 * <td>?</td> 449 * <td></td> 450 * </tr> 451 * <tr> 452 * <td>Cp943</td> 453 * <td>?</td> 454 * <td></td> 455 * </tr> 456 * <tr> 457 * <td>Cp943C</td> 458 * <td>?</td> 459 * <td></td> 460 * </tr> 461 * <tr> 462 * <td>Cp948</td> 463 * <td>?</td> 464 * <td></td> 465 * </tr> 466 * <tr> 467 * <td>Cp949</td> 468 * <td>?</td> 469 * <td></td> 470 * </tr> 471 * <tr> 472 * <td>Cp949C</td> 473 * <td>?</td> 474 * <td></td> 475 * </tr> 476 * <tr> 477 * <td>Cp950</td> 478 * <td>?</td> 479 * <td></td> 480 * </tr> 481 * <tr> 482 * <td>Cp964</td> 483 * <td>?</td> 484 * <td></td> 485 * </tr> 486 * <tr> 487 * <td>Cp970</td> 488 * <td>?</td> 489 * <td></td> 490 * </tr> 491 * <tr> 492 * <td>EUC_CN</td> 493 * <td>GB2312</td> 494 * <td>x-EUC-CN csGB2312 euccn euc-cn gb2312-80 gb2312-1980 CN-GB CN-GB-ISOIR165 </td> 495 * </tr> 496 * <tr> 497 * <td>EUC_JP</td> 498 * <td>EUC-JP</td> 499 * <td>csEUCPkdFmtJapanese Extended_UNIX_Code_Packed_Format_for_Japanese eucjis x-eucjp eucjp x-euc-jp </td> 500 * </tr> 501 * <tr> 502 * <td>EUC_JP_LINUX</td> 503 * <td>?</td> 504 * <td></td> 505 * </tr> 506 * <tr> 507 * <td>EUC_JP_Solaris</td> 508 * <td>?</td> 509 * <td></td> 510 * </tr> 511 * <tr> 512 * <td>EUC_KR</td> 513 * <td>EUC-KR</td> 514 * <td>csEUCKR ksc5601 5601 ksc5601_1987 ksc_5601 ksc5601-1987 ks_c_5601-1987 euckr </td> 515 * </tr> 516 * <tr> 517 * <td>EUC_TW</td> 518 * <td>EUC-TW</td> 519 * <td>x-EUC-TW cns11643 euctw </td> 520 * </tr> 521 * <tr> 522 * <td>GB18030</td> 523 * <td>GB18030</td> 524 * <td>gb18030-2000 </td> 525 * </tr> 526 * <tr> 527 * <td>GBK</td> 528 * <td>windows-936</td> 529 * <td>CP936 MS936 ms_936 x-mswin-936 </td> 530 * </tr> 531 * <tr> 532 * <td>ISCII91</td> 533 * <td>?</td> 534 * <td>x-ISCII91 iscii </td> 535 * </tr> 536 * <tr> 537 * <td>ISO2022CN</td> 538 * <td>ISO-2022-CN</td> 539 * <td></td> 540 * </tr> 541 * <tr> 542 * <td>ISO2022JP</td> 543 * <td>ISO-2022-JP</td> 544 * <td>csISO2022JP JIS jis_encoding csjisencoding </td> 545 * </tr> 546 * <tr> 547 * <td>ISO2022KR</td> 548 * <td>ISO-2022-KR</td> 549 * <td>csISO2022KR </td> 550 * </tr> 551 * <tr> 552 * <td>ISO2022_CN_CNS</td> 553 * <td>?</td> 554 * <td></td> 555 * </tr> 556 * <tr> 557 * <td>ISO2022_CN_GB</td> 558 * <td>?</td> 559 * <td></td> 560 * </tr> 561 * <tr> 562 * <td>ISO8859_1</td> 563 * <td>ISO-8859-1</td> 564 * <td>ISO_8859-1:1987 iso-ir-100 ISO_8859-1 latin1 l1 IBM819 CP819 csISOLatin1 8859_1 819 IBM-819 ISO8859-1 ISO_8859_1 </td> 565 * </tr> 566 * <tr> 567 * <td>ISO8859_13</td> 568 * <td>ISO-8859-13</td> 569 * <td></td> 570 * </tr> 571 * <tr> 572 * <td>ISO8859_15</td> 573 * <td>ISO-8859-15</td> 574 * <td>ISO_8859-15 Latin-9 8859_15 csISOlatin9 IBM923 cp923 923 L9 IBM-923 ISO8859-15 LATIN9 LATIN0 csISOlatin0 ISO8859_15_FDIS </td> 575 * </tr> 576 * <tr> 577 * <td>ISO8859_2</td> 578 * <td>ISO-8859-2</td> 579 * <td>ISO_8859-2:1987 iso-ir-101 ISO_8859-2 latin2 l2 csISOLatin2 8859_2 iso8859_2 </td> 580 * </tr> 581 * <tr> 582 * <td>ISO8859_3</td> 583 * <td>ISO-8859-3</td> 584 * <td>ISO_8859-3:1988 iso-ir-109 ISO_8859-3 latin3 l3 csISOLatin3 8859_3 </td> 585 * </tr> 586 * <tr> 587 * <td>ISO8859_4</td> 588 * <td>ISO-8859-4</td> 589 * <td>ISO_8859-4:1988 iso-ir-110 ISO_8859-4 latin4 l4 csISOLatin4 8859_4 </td> 590 * </tr> 591 * <tr> 592 * <td>ISO8859_5</td> 593 * <td>ISO-8859-5</td> 594 * <td>ISO_8859-5:1988 iso-ir-144 ISO_8859-5 cyrillic csISOLatinCyrillic 8859_5 </td> 595 * </tr> 596 * <tr> 597 * <td>ISO8859_6</td> 598 * <td>ISO-8859-6</td> 599 * <td>ISO_8859-6:1987 iso-ir-127 ISO_8859-6 ECMA-114 ASMO-708 arabic csISOLatinArabic 8859_6 </td> 600 * </tr> 601 * <tr> 602 * <td>ISO8859_7</td> 603 * <td>ISO-8859-7</td> 604 * <td>ISO_8859-7:1987 iso-ir-126 ISO_8859-7 ELOT_928 ECMA-118 greek greek8 csISOLatinGreek 8859_7 sun_eu_greek </td> 605 * </tr> 606 * <tr> 607 * <td>ISO8859_8</td> 608 * <td>ISO-8859-8</td> 609 * <td>ISO_8859-8:1988 iso-ir-138 ISO_8859-8 hebrew csISOLatinHebrew 8859_8 </td> 610 * </tr> 611 * <tr> 612 * <td>ISO8859_9</td> 613 * <td>ISO-8859-9</td> 614 * <td>ISO_8859-9:1989 iso-ir-148 ISO_8859-9 latin5 l5 csISOLatin5 8859_9 </td> 615 * </tr> 616 * <tr> 617 * <td>JISAutoDetect</td> 618 * <td>?</td> 619 * <td></td> 620 * </tr> 621 * <tr> 622 * <td>JIS_C6626-1983</td> 623 * <td>JIS_C6626-1983</td> 624 * <td>x-JIS0208 JIS0208 csISO87JISX0208 x0208 JIS_X0208-1983 iso-ir-87 </td> 625 * </tr> 626 * <tr> 627 * <td>JIS_X0201</td> 628 * <td>JIS_X0201</td> 629 * <td>X0201 JIS0201 csHalfWidthKatakana </td> 630 * </tr> 631 * <tr> 632 * <td>JIS_X0212-1990</td> 633 * <td>JIS_X0212-1990</td> 634 * <td>iso-ir-159 x0212 JIS0212 csISO159JISX02121990 </td> 635 * </tr> 636 * <tr> 637 * <td>KOI8_R</td> 638 * <td>KOI8-R</td> 639 * <td>csKOI8R koi8 </td> 640 * </tr> 641 * <tr> 642 * <td>MS874</td> 643 * <td>windows-874</td> 644 * <td>cp874 </td> 645 * </tr> 646 * <tr> 647 * <td>MS932</td> 648 * <td>Windows-31J</td> 649 * <td>windows-932 csWindows31J x-ms-cp932 </td> 650 * </tr> 651 * <tr> 652 * <td>MS949</td> 653 * <td>windows-949</td> 654 * <td>windows949 ms_949 x-windows-949 </td> 655 * </tr> 656 * <tr> 657 * <td>MS950</td> 658 * <td>windows-950</td> 659 * <td>x-windows-950 </td> 660 * </tr> 661 * <tr> 662 * <td>MS950_HKSCS</td> 663 * <td></td> 664 * <td></td> 665 * </tr> 666 * <tr> 667 * <td>MacArabic</td> 668 * <td>?</td> 669 * <td></td> 670 * </tr> 671 * <tr> 672 * <td>MacCentralEurope</td> 673 * <td>?</td> 674 * <td></td> 675 * </tr> 676 * <tr> 677 * <td>MacCroatian</td> 678 * <td>?</td> 679 * <td></td> 680 * </tr> 681 * <tr> 682 * <td>MacCyrillic</td> 683 * <td>?</td> 684 * <td></td> 685 * </tr> 686 * <tr> 687 * <td>MacDingbat</td> 688 * <td>?</td> 689 * <td></td> 690 * </tr> 691 * <tr> 692 * <td>MacGreek</td> 693 * <td>MacGreek</td> 694 * <td></td> 695 * </tr> 696 * <tr> 697 * <td>MacHebrew</td> 698 * <td>?</td> 699 * <td></td> 700 * </tr> 701 * <tr> 702 * <td>MacIceland</td> 703 * <td>?</td> 704 * <td></td> 705 * </tr> 706 * <tr> 707 * <td>MacRoman</td> 708 * <td>MacRoman</td> 709 * <td>Macintosh MAC csMacintosh </td> 710 * </tr> 711 * <tr> 712 * <td>MacRomania</td> 713 * <td>?</td> 714 * <td></td> 715 * </tr> 716 * <tr> 717 * <td>MacSymbol</td> 718 * <td>?</td> 719 * <td></td> 720 * </tr> 721 * <tr> 722 * <td>MacThai</td> 723 * <td>?</td> 724 * <td></td> 725 * </tr> 726 * <tr> 727 * <td>MacTurkish</td> 728 * <td>?</td> 729 * <td></td> 730 * </tr> 731 * <tr> 732 * <td>MacUkraine</td> 733 * <td>?</td> 734 * <td></td> 735 * </tr> 736 * <tr> 737 * <td>SJIS</td> 738 * <td>Shift_JIS</td> 739 * <td>MS_Kanji csShiftJIS shift-jis x-sjis pck </td> 740 * </tr> 741 * <tr> 742 * <td>TIS620</td> 743 * <td>TIS-620</td> 744 * <td></td> 745 * </tr> 746 * <tr> 747 * <td>UTF-16</td> 748 * <td>UTF-16</td> 749 * <td>UTF_16 </td> 750 * </tr> 751 * <tr> 752 * <td>UTF8</td> 753 * <td>UTF-8</td> 754 * <td></td> 755 * </tr> 756 * <tr> 757 * <td>UnicodeBig</td> 758 * <td>?</td> 759 * <td></td> 760 * </tr> 761 * <tr> 762 * <td>UnicodeBigUnmarked</td> 763 * <td>UTF-16BE</td> 764 * <td>X-UTF-16BE UTF_16BE ISO-10646-UCS-2 </td> 765 * </tr> 766 * <tr> 767 * <td>UnicodeLittle</td> 768 * <td>?</td> 769 * <td></td> 770 * </tr> 771 * <tr> 772 * <td>UnicodeLittleUnmarked</td> 773 * <td>UTF-16LE</td> 774 * <td>UTF_16LE X-UTF-16LE </td> 775 * </tr> 776 * <tr> 777 * <td>x-Johab</td> 778 * <td>johab</td> 779 * <td>johab cp1361 ms1361 ksc5601-1992 ksc5601_1992 </td> 780 * </tr> 781 * <tr> 782 * <td>x-iso-8859-11</td> 783 * <td>?</td> 784 * <td></td> 785 * </tr> 786 * </table> 787 * 788 * 789 * @version $Id: CharsetUtil.java,v 1.1 2004/10/25 07:26:46 ntherning Exp $ 790 */ 791public class CharsetUtil { 792 private static Log log = LogFactory.getLog(CharsetUtil.class); 793 794 private static class Charset implements Comparable { 795 private String canonical = null; 796 private String mime = null; 797 private String[] aliases = null; 798 799 private Charset(String canonical, String mime, String[] aliases) { 800 this.canonical = canonical; 801 this.mime = mime; 802 this.aliases = aliases; 803 } 804 805 public int compareTo(Object o) { 806 Charset c = (Charset) o; 807 return this.canonical.compareTo(c.canonical); 808 } 809 } 810 811 private static Charset[] JAVA_CHARSETS = { 812 new Charset("ISO8859_1", "ISO-8859-1", 813 new String[] {"ISO_8859-1:1987", "iso-ir-100", "ISO_8859-1", 814 "latin1", "l1", "IBM819", "CP819", 815 "csISOLatin1", "8859_1", "819", "IBM-819", 816 "ISO8859-1", "ISO_8859_1"}), 817 new Charset("ISO8859_2", "ISO-8859-2", 818 new String[] {"ISO_8859-2:1987", "iso-ir-101", "ISO_8859-2", 819 "latin2", "l2", "csISOLatin2", "8859_2", 820 "iso8859_2"}), 821 new Charset("ISO8859_3", "ISO-8859-3", new String[] {"ISO_8859-3:1988", "iso-ir-109", "ISO_8859-3", "latin3", "l3", "csISOLatin3", "8859_3"}), 822 new Charset("ISO8859_4", "ISO-8859-4", 823 new String[] {"ISO_8859-4:1988", "iso-ir-110", "ISO_8859-4", 824 "latin4", "l4", "csISOLatin4", "8859_4"}), 825 new Charset("ISO8859_5", "ISO-8859-5", 826 new String[] {"ISO_8859-5:1988", "iso-ir-144", "ISO_8859-5", 827 "cyrillic", "csISOLatinCyrillic", "8859_5"}), 828 new Charset("ISO8859_6", "ISO-8859-6", new String[] {"ISO_8859-6:1987", "iso-ir-127", "ISO_8859-6", "ECMA-114", "ASMO-708", "arabic", "csISOLatinArabic", "8859_6"}), 829 new Charset("ISO8859_7", "ISO-8859-7", 830 new String[] {"ISO_8859-7:1987", "iso-ir-126", "ISO_8859-7", 831 "ELOT_928", "ECMA-118", "greek", "greek8", 832 "csISOLatinGreek", "8859_7", "sun_eu_greek"}), 833 new Charset("ISO8859_8", "ISO-8859-8", new String[] {"ISO_8859-8:1988", "iso-ir-138", "ISO_8859-8", "hebrew", "csISOLatinHebrew", "8859_8"}), 834 new Charset("ISO8859_9", "ISO-8859-9", 835 new String[] {"ISO_8859-9:1989", "iso-ir-148", "ISO_8859-9", 836 "latin5", "l5", "csISOLatin5", "8859_9"}), 837 838 new Charset("ISO8859_13", "ISO-8859-13", new String[] {}), 839 new Charset("ISO8859_15", "ISO-8859-15", 840 new String[] {"ISO_8859-15", "Latin-9", "8859_15", 841 "csISOlatin9", "IBM923", "cp923", "923", "L9", 842 "IBM-923", "ISO8859-15", "LATIN9", "LATIN0", 843 "csISOlatin0", "ISO8859_15_FDIS"}), 844 new Charset("KOI8_R", "KOI8-R", new String[] {"csKOI8R", "koi8"}), 845 new Charset("ASCII", "US-ASCII", 846 new String[] {"ANSI_X3.4-1968", "iso-ir-6", 847 "ANSI_X3.4-1986", "ISO_646.irv:1991", 848 "ISO646-US", "us", "IBM367", "cp367", 849 "csASCII", "ascii7", "646", "iso_646.irv:1983"}), 850 new Charset("UTF8", "UTF-8", new String[] {}), 851 new Charset("UTF-16", "UTF-16", new String[] {"UTF_16"}), 852 new Charset("UnicodeBigUnmarked", "UTF-16BE", new String[] {"X-UTF-16BE", "UTF_16BE", "ISO-10646-UCS-2"}), 853 new Charset("UnicodeLittleUnmarked", "UTF-16LE", new String[] {"UTF_16LE", "X-UTF-16LE"}), 854 new Charset("Big5", "Big5", new String[] {"csBig5", "CN-Big5", "BIG-FIVE", "BIGFIVE"}), 855 new Charset("Big5_HKSCS", "Big5-HKSCS", new String[] {"big5hkscs"}), 856 new Charset("EUC_JP", "EUC-JP", 857 new String[] {"csEUCPkdFmtJapanese", 858 "Extended_UNIX_Code_Packed_Format_for_Japanese", 859 "eucjis", "x-eucjp", "eucjp", "x-euc-jp"}), 860 new Charset("EUC_KR", "EUC-KR", 861 new String[] {"csEUCKR", "ksc5601", "5601", "ksc5601_1987", 862 "ksc_5601", "ksc5601-1987", "ks_c_5601-1987", 863 "euckr"}), 864 new Charset("GB18030", "GB18030", new String[] {"gb18030-2000"}), 865 new Charset("EUC_CN", "GB2312", new String[] {"x-EUC-CN", "csGB2312", "euccn", "euc-cn", "gb2312-80", "gb2312-1980", "CN-GB", "CN-GB-ISOIR165"}), 866 new Charset("GBK", "windows-936", new String[] {"CP936", "MS936", "ms_936", "x-mswin-936"}), 867 868 new Charset("Cp037", "IBM037", new String[] {"ebcdic-cp-us", "ebcdic-cp-ca", "ebcdic-cp-wt", "ebcdic-cp-nl", "csIBM037"}), 869 new Charset("Cp273", "IBM273", new String[] {"csIBM273"}), 870 new Charset("Cp277", "IBM277", new String[] {"EBCDIC-CP-DK", "EBCDIC-CP-NO", "csIBM277"}), 871 new Charset("Cp278", "IBM278", new String[] {"CP278", "ebcdic-cp-fi", "ebcdic-cp-se", "csIBM278"}), 872 new Charset("Cp280", "IBM280", new String[] {"ebcdic-cp-it", "csIBM280"}), 873 new Charset("Cp284", "IBM284", new String[] {"ebcdic-cp-es", "csIBM284"}), 874 new Charset("Cp285", "IBM285", new String[] {"ebcdic-cp-gb", "csIBM285"}), 875 new Charset("Cp297", "IBM297", new String[] {"ebcdic-cp-fr", "csIBM297"}), 876 new Charset("Cp420", "IBM420", new String[] {"ebcdic-cp-ar1", "csIBM420"}), 877 new Charset("Cp424", "IBM424", new String[] {"ebcdic-cp-he", "csIBM424"}), 878 new Charset("Cp437", "IBM437", new String[] {"437", "csPC8CodePage437"}), 879 new Charset("Cp500", "IBM500", new String[] {"ebcdic-cp-be", "ebcdic-cp-ch", "csIBM500"}), 880 new Charset("Cp775", "IBM775", new String[] {"csPC775Baltic"}), 881 new Charset("Cp838", "IBM-Thai", new String[] {}), 882 new Charset("Cp850", "IBM850", new String[] {"850", "csPC850Multilingual"}), 883 new Charset("Cp852", "IBM852", new String[] {"852", "csPCp852"}), 884 new Charset("Cp855", "IBM855", new String[] {"855", "csIBM855"}), 885 new Charset("Cp857", "IBM857", new String[] {"857", "csIBM857"}), 886 new Charset("Cp858", "IBM00858", 887 new String[] {"CCSID00858", "CP00858", 888 "PC-Multilingual-850+euro"}), 889 new Charset("Cp860", "IBM860", new String[] {"860", "csIBM860"}), 890 new Charset("Cp861", "IBM861", new String[] {"861", "cp-is", "csIBM861"}), 891 new Charset("Cp862", "IBM862", new String[] {"862", "csPC862LatinHebrew"}), 892 new Charset("Cp863", "IBM863", new String[] {"863", "csIBM863"}), 893 new Charset("Cp864", "IBM864", new String[] {"cp864", "csIBM864"}), 894 new Charset("Cp865", "IBM865", new String[] {"865", "csIBM865"}), 895 new Charset("Cp866", "IBM866", new String[] {"866", "csIBM866"}), 896 new Charset("Cp868", "IBM868", new String[] {"cp-ar", "csIBM868"}), 897 new Charset("Cp869", "IBM869", new String[] {"cp-gr", "csIBM869"}), 898 new Charset("Cp870", "IBM870", new String[] {"ebcdic-cp-roece", "ebcdic-cp-yu", "csIBM870"}), 899 new Charset("Cp871", "IBM871", new String[] {"ebcdic-cp-is", "csIBM871"}), 900 new Charset("Cp918", "IBM918", new String[] {"ebcdic-cp-ar2", "csIBM918"}), 901 new Charset("Cp1026", "IBM1026", new String[] {"csIBM1026"}), 902 new Charset("Cp1047", "IBM1047", new String[] {"IBM-1047"}), 903 new Charset("Cp1140", "IBM01140", 904 new String[] {"CCSID01140", "CP01140", 905 "ebcdic-us-37+euro"}), 906 new Charset("Cp1141", "IBM01141", 907 new String[] {"CCSID01141", "CP01141", 908 "ebcdic-de-273+euro"}), 909 new Charset("Cp1142", "IBM01142", new String[] {"CCSID01142", "CP01142", "ebcdic-dk-277+euro", "ebcdic-no-277+euro"}), 910 new Charset("Cp1143", "IBM01143", new String[] {"CCSID01143", "CP01143", "ebcdic-fi-278+euro", "ebcdic-se-278+euro"}), 911 new Charset("Cp1144", "IBM01144", new String[] {"CCSID01144", "CP01144", "ebcdic-it-280+euro"}), 912 new Charset("Cp1145", "IBM01145", new String[] {"CCSID01145", "CP01145", "ebcdic-es-284+euro"}), 913 new Charset("Cp1146", "IBM01146", new String[] {"CCSID01146", "CP01146", "ebcdic-gb-285+euro"}), 914 new Charset("Cp1147", "IBM01147", new String[] {"CCSID01147", "CP01147", "ebcdic-fr-297+euro"}), 915 new Charset("Cp1148", "IBM01148", new String[] {"CCSID01148", "CP01148", "ebcdic-international-500+euro"}), 916 new Charset("Cp1149", "IBM01149", new String[] {"CCSID01149", "CP01149", "ebcdic-is-871+euro"}), 917 new Charset("Cp1250", "windows-1250", new String[] {}), 918 new Charset("Cp1251", "windows-1251", new String[] {}), 919 new Charset("Cp1252", "windows-1252", new String[] {}), 920 new Charset("Cp1253", "windows-1253", new String[] {}), 921 new Charset("Cp1254", "windows-1254", new String[] {}), 922 new Charset("Cp1255", "windows-1255", new String[] {}), 923 new Charset("Cp1256", "windows-1256", new String[] {}), 924 new Charset("Cp1257", "windows-1257", new String[] {}), 925 new Charset("Cp1258", "windows-1258", new String[] {}), 926 new Charset("ISO2022CN", "ISO-2022-CN", new String[] {}), 927 new Charset("ISO2022JP", "ISO-2022-JP", new String[] {"csISO2022JP", "JIS", "jis_encoding", "csjisencoding"}), 928 new Charset("ISO2022KR", "ISO-2022-KR", new String[] {"csISO2022KR"}), 929 new Charset("JIS_X0201", "JIS_X0201", new String[] {"X0201", "JIS0201", "csHalfWidthKatakana"}), 930 new Charset("JIS_X0212-1990", "JIS_X0212-1990", new String[] {"iso-ir-159", "x0212", "JIS0212", "csISO159JISX02121990"}), 931 new Charset("JIS_C6626-1983", "JIS_C6626-1983", new String[] {"x-JIS0208", "JIS0208", "csISO87JISX0208", "x0208", "JIS_X0208-1983", "iso-ir-87"}), 932 new Charset("SJIS", "Shift_JIS", new String[] {"MS_Kanji", "csShiftJIS", "shift-jis", "x-sjis", "pck"}), 933 new Charset("TIS620", "TIS-620", new String[] {}), 934 new Charset("MS932", "Windows-31J", new String[] {"windows-932", "csWindows31J", "x-ms-cp932"}), 935 new Charset("EUC_TW", "EUC-TW", new String[] {"x-EUC-TW", "cns11643", "euctw"}), 936 new Charset("x-Johab", "johab", new String[] {"johab", "cp1361", "ms1361", "ksc5601-1992", "ksc5601_1992"}), 937 new Charset("MS950_HKSCS", "", new String[] {}), 938 new Charset("MS874", "windows-874", new String[] {"cp874"}), 939 new Charset("MS949", "windows-949", new String[] {"windows949", "ms_949", "x-windows-949"}), 940 new Charset("MS950", "windows-950", new String[] {"x-windows-950"}), 941 942 new Charset("Cp737", null, new String[] {}), 943 new Charset("Cp856", null, new String[] {}), 944 new Charset("Cp875", null, new String[] {}), 945 new Charset("Cp921", null, new String[] {}), 946 new Charset("Cp922", null, new String[] {}), 947 new Charset("Cp930", null, new String[] {}), 948 new Charset("Cp933", null, new String[] {}), 949 new Charset("Cp935", null, new String[] {}), 950 new Charset("Cp937", null, new String[] {}), 951 new Charset("Cp939", null, new String[] {}), 952 new Charset("Cp942", null, new String[] {}), 953 new Charset("Cp942C", null, new String[] {}), 954 new Charset("Cp943", null, new String[] {}), 955 new Charset("Cp943C", null, new String[] {}), 956 new Charset("Cp948", null, new String[] {}), 957 new Charset("Cp949", null, new String[] {}), 958 new Charset("Cp949C", null, new String[] {}), 959 new Charset("Cp950", null, new String[] {}), 960 new Charset("Cp964", null, new String[] {}), 961 new Charset("Cp970", null, new String[] {}), 962 new Charset("Cp1006", null, new String[] {}), 963 new Charset("Cp1025", null, new String[] {}), 964 new Charset("Cp1046", null, new String[] {}), 965 new Charset("Cp1097", null, new String[] {}), 966 new Charset("Cp1098", null, new String[] {}), 967 new Charset("Cp1112", null, new String[] {}), 968 new Charset("Cp1122", null, new String[] {}), 969 new Charset("Cp1123", null, new String[] {}), 970 new Charset("Cp1124", null, new String[] {}), 971 new Charset("Cp1381", null, new String[] {}), 972 new Charset("Cp1383", null, new String[] {}), 973 new Charset("Cp33722", null, new String[] {}), 974 new Charset("Big5_Solaris", null, new String[] {}), 975 new Charset("EUC_JP_LINUX", null, new String[] {}), 976 new Charset("EUC_JP_Solaris", null, new String[] {}), 977 new Charset("ISCII91", null, new String[] {"x-ISCII91", "iscii"}), 978 new Charset("ISO2022_CN_CNS", null, new String[] {}), 979 new Charset("ISO2022_CN_GB", null, new String[] {}), 980 new Charset("x-iso-8859-11", null, new String[] {}), 981 new Charset("JISAutoDetect", null, new String[] {}), 982 new Charset("MacArabic", null, new String[] {}), 983 new Charset("MacCentralEurope", null, new String[] {}), 984 new Charset("MacCroatian", null, new String[] {}), 985 new Charset("MacCyrillic", null, new String[] {}), 986 new Charset("MacDingbat", null, new String[] {}), 987 new Charset("MacGreek", "MacGreek", new String[] {}), 988 new Charset("MacHebrew", null, new String[] {}), 989 new Charset("MacIceland", null, new String[] {}), 990 new Charset("MacRoman", "MacRoman", new String[] {"Macintosh", "MAC", "csMacintosh"}), 991 new Charset("MacRomania", null, new String[] {}), 992 new Charset("MacSymbol", null, new String[] {}), 993 new Charset("MacThai", null, new String[] {}), 994 new Charset("MacTurkish", null, new String[] {}), 995 new Charset("MacUkraine", null, new String[] {}), 996 new Charset("UnicodeBig", null, new String[] {}), 997 new Charset("UnicodeLittle", null, new String[] {}) 998 }; 999 1000 /** 1001 * Contains the canonical names of character sets which can be used to 1002 * decode bytes into Java chars. 1003 */ 1004 private static TreeSet decodingSupported = null; 1005 1006 /** 1007 * Contains the canonical names of character sets which can be used to 1008 * encode Java chars into bytes. 1009 */ 1010 private static TreeSet encodingSupported = null; 1011 1012 /** 1013 * Maps character set names to Charset objects. All possible names of 1014 * a charset will be mapped to the Charset. 1015 */ 1016 private static HashMap charsetMap = null; 1017 1018 static { 1019 decodingSupported = new TreeSet(); 1020 encodingSupported = new TreeSet(); 1021 byte[] dummy = new byte[] {'d', 'u', 'm', 'm', 'y'}; 1022 for (int i = 0; i < JAVA_CHARSETS.length; i++) { 1023 try { 1024 String s = new String(dummy, JAVA_CHARSETS[i].canonical); 1025 decodingSupported.add(JAVA_CHARSETS[i].canonical.toLowerCase()); 1026 } catch (UnsupportedOperationException e) { 1027 } catch (UnsupportedEncodingException e) { 1028 } 1029 try { 1030 "dummy".getBytes(JAVA_CHARSETS[i].canonical); 1031 encodingSupported.add(JAVA_CHARSETS[i].canonical.toLowerCase()); 1032 } catch (UnsupportedOperationException e) { 1033 } catch (UnsupportedEncodingException e) { 1034 } 1035 } 1036 1037 charsetMap = new HashMap(); 1038 for (int i = 0; i < JAVA_CHARSETS.length; i++) { 1039 Charset c = JAVA_CHARSETS[i]; 1040 charsetMap.put(c.canonical.toLowerCase(), c); 1041 if (c.mime != null) { 1042 charsetMap.put(c.mime.toLowerCase(), c); 1043 } 1044 if (c.aliases != null) { 1045 for (int j = 0; j < c.aliases.length; j++) { 1046 charsetMap.put(c.aliases[j].toLowerCase(), c); 1047 } 1048 } 1049 } 1050 1051 if (log.isDebugEnabled()) { 1052 log.debug("Character sets which support decoding: " 1053 + decodingSupported); 1054 log.debug("Character sets which support encoding: " 1055 + encodingSupported); 1056 } 1057 } 1058 1059 /** 1060 * ANDROID: THE FOLLOWING SET OF STATIC STRINGS ARE COPIED FROM A NEWER VERSION OF MIME4J 1061 */ 1062 1063 /** carriage return - line feed sequence */ 1064 public static final String CRLF = "\r\n"; 1065 1066 /** US-ASCII CR, carriage return (13) */ 1067 public static final int CR = '\r'; 1068 1069 /** US-ASCII LF, line feed (10) */ 1070 public static final int LF = '\n'; 1071 1072 /** US-ASCII SP, space (32) */ 1073 public static final int SP = ' '; 1074 1075 /** US-ASCII HT, horizontal-tab (9)*/ 1076 public static final int HT = '\t'; 1077 1078 public static final java.nio.charset.Charset US_ASCII = java.nio.charset.Charset 1079 .forName("US-ASCII"); 1080 1081 public static final java.nio.charset.Charset ISO_8859_1 = java.nio.charset.Charset 1082 .forName("ISO-8859-1"); 1083 1084 public static final java.nio.charset.Charset UTF_8 = java.nio.charset.Charset 1085 .forName("UTF-8"); 1086 1087 /** 1088 * Returns <code>true</code> if the specified character is a whitespace 1089 * character (CR, LF, SP or HT). 1090 * 1091 * ANDROID: COPIED FROM A NEWER VERSION OF MIME4J 1092 * 1093 * @param ch 1094 * character to test. 1095 * @return <code>true</code> if the specified character is a whitespace 1096 * character, <code>false</code> otherwise. 1097 */ 1098 public static boolean isWhitespace(char ch) { 1099 return ch == SP || ch == HT || ch == CR || ch == LF; 1100 } 1101 1102 /** 1103 * Returns <code>true</code> if the specified string consists entirely of 1104 * whitespace characters. 1105 * 1106 * ANDROID: COPIED FROM A NEWER VERSION OF MIME4J 1107 * 1108 * @param s 1109 * string to test. 1110 * @return <code>true</code> if the specified string consists entirely of 1111 * whitespace characters, <code>false</code> otherwise. 1112 */ 1113 public static boolean isWhitespace(final String s) { 1114 if (s == null) { 1115 throw new IllegalArgumentException("String may not be null"); 1116 } 1117 final int len = s.length(); 1118 for (int i = 0; i < len; i++) { 1119 if (!isWhitespace(s.charAt(i))) { 1120 return false; 1121 } 1122 } 1123 return true; 1124 } 1125 1126 /** 1127 * Determines if the VM supports encoding (chars to bytes) the 1128 * specified character set. NOTE: the given character set name may 1129 * not be known to the VM even if this method returns <code>true</code>. 1130 * Use {@link #toJavaCharset(String)} to get the canonical Java character 1131 * set name. 1132 * 1133 * @param charsetName the characters set name. 1134 * @return <code>true</code> if encoding is supported, <code>false</code> 1135 * otherwise. 1136 */ 1137 public static boolean isEncodingSupported(String charsetName) { 1138 return encodingSupported.contains(charsetName.toLowerCase()); 1139 } 1140 1141 /** 1142 * Determines if the VM supports decoding (bytes to chars) the 1143 * specified character set. NOTE: the given character set name may 1144 * not be known to the VM even if this method returns <code>true</code>. 1145 * Use {@link #toJavaCharset(String)} to get the canonical Java character 1146 * set name. 1147 * 1148 * @param charsetName the characters set name. 1149 * @return <code>true</code> if decoding is supported, <code>false</code> 1150 * otherwise. 1151 */ 1152 public static boolean isDecodingSupported(String charsetName) { 1153 return decodingSupported.contains(charsetName.toLowerCase()); 1154 } 1155 1156 /** 1157 * Gets the preferred MIME character set name for the specified 1158 * character set or <code>null</code> if not known. 1159 * 1160 * @param charsetName the character set name to look for. 1161 * @return the MIME preferred name or <code>null</code> if not known. 1162 */ 1163 public static String toMimeCharset(String charsetName) { 1164 Charset c = (Charset) charsetMap.get(charsetName.toLowerCase()); 1165 if (c != null) { 1166 return c.mime; 1167 } 1168 return null; 1169 } 1170 1171 /** 1172 * Gets the canonical Java character set name for the specified 1173 * character set or <code>null</code> if not known. This should be 1174 * called before doing any conversions using the Java API. NOTE: 1175 * you must use {@link #isEncodingSupported(String)} or 1176 * {@link #isDecodingSupported(String)} to make sure the returned 1177 * Java character set is supported by the current VM. 1178 * 1179 * @param charsetName the character set name to look for. 1180 * @return the canonical Java name or <code>null</code> if not known. 1181 */ 1182 public static String toJavaCharset(String charsetName) { 1183 Charset c = (Charset) charsetMap.get(charsetName.toLowerCase()); 1184 if (c != null) { 1185 return c.canonical; 1186 } 1187 return null; 1188 } 1189 1190 public static java.nio.charset.Charset getCharset(String charsetName) { 1191 String defaultCharset = "ISO-8859-1"; 1192 1193 // Use the default chareset if given charset is null 1194 if(charsetName == null) charsetName = defaultCharset; 1195 1196 try { 1197 return java.nio.charset.Charset.forName(charsetName); 1198 } catch (IllegalCharsetNameException e) { 1199 log.info("Illegal charset " + charsetName + ", fallback to " + defaultCharset + ": " + e); 1200 // Use default charset on exception 1201 return java.nio.charset.Charset.forName(defaultCharset); 1202 } catch (UnsupportedCharsetException ex) { 1203 log.info("Unsupported charset " + charsetName + ", fallback to " + defaultCharset + ": " + ex); 1204 // Use default charset on exception 1205 return java.nio.charset.Charset.forName(defaultCharset); 1206 } 1207 1208 } 1209 /* 1210 * Uncomment the code below and run the main method to regenerate the 1211 * Javadoc table above when the known charsets change. 1212 */ 1213 1214 /* 1215 private static String dumpHtmlTable() { 1216 LinkedList l = new LinkedList(Arrays.asList(JAVA_CHARSETS)); 1217 Collections.sort(l); 1218 StringBuffer sb = new StringBuffer(); 1219 sb.append(" * <table>\n"); 1220 sb.append(" * <tr>\n"); 1221 sb.append(" * <td>Canonical (Java) name</td>\n"); 1222 sb.append(" * <td>MIME preferred</td>\n"); 1223 sb.append(" * <td>Aliases</td>\n"); 1224 sb.append(" * </tr>\n"); 1225 1226 for (Iterator it = l.iterator(); it.hasNext();) { 1227 Charset c = (Charset) it.next(); 1228 sb.append(" * <tr>\n"); 1229 sb.append(" * <td>" + c.canonical + "</td>\n"); 1230 sb.append(" * <td>" + (c.mime == null ? "?" : c.mime)+ "</td>\n"); 1231 sb.append(" * <td>"); 1232 for (int i = 0; c.aliases != null && i < c.aliases.length; i++) { 1233 sb.append(c.aliases[i] + " "); 1234 } 1235 sb.append("</td>\n"); 1236 sb.append(" * </tr>\n"); 1237 } 1238 sb.append(" * </table>\n"); 1239 return sb.toString(); 1240 } 1241 1242 public static void main(String[] args) { 1243 System.out.println(dumpHtmlTable()); 1244 }*/ 1245} 1246