1/* 2 * Licensed to the Apache Software Foundation (ASF) under one 3 * or more contributor license agreements. See the NOTICE file 4 * distributed with this work for additional information 5 * regarding copyright ownership. The ASF licenses this file 6 * to you under the Apache License, Version 2.0 (the "License"); 7 * you may not use this file except in compliance with the License. 8 * You may obtain a copy of the License at 9 * 10 * http://www.apache.org/licenses/LICENSE-2.0 11 * 12 * Unless required by applicable law or agreed to in writing, software 13 * distributed under the License is distributed on an "AS IS" BASIS, 14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 * See the License for the specific language governing permissions and 16 * limitations under the License. 17 */ 18/* 19 * $Id: Encodings.java 471981 2006-11-07 04:28:00Z minchau $ 20 */ 21package org.apache.xml.serializer; 22 23import java.io.InputStream; 24import java.io.OutputStream; 25import java.io.OutputStreamWriter; 26import java.io.UnsupportedEncodingException; 27import java.io.Writer; 28import java.util.ArrayList; 29import java.util.Enumeration; 30import java.util.Hashtable; 31import java.util.List; 32import java.util.Properties; 33import java.util.StringTokenizer; 34 35 36/** 37 * Provides information about encodings. Depends on the Java runtime 38 * to provides writers for the different encodings. 39 * <p> 40 * This class is not a public API. It is only public because it 41 * is used outside of this package. 42 * 43 * @xsl.usage internal 44 */ 45 46public final class Encodings extends Object 47{ 48 /** 49 * Standard filename for properties file with encodings data. 50 */ 51 private static final String ENCODINGS_FILE = SerializerBase.PKG_PATH+"/Encodings.properties"; 52 53 /** 54 * Returns a writer for the specified encoding based on 55 * an output stream. 56 * <p> 57 * This is not a public API. 58 * @param output The output stream 59 * @param encoding The encoding MIME name, not a Java name for the encoding. 60 * @return A suitable writer 61 * @throws UnsupportedEncodingException There is no convertor 62 * to support this encoding 63 * @xsl.usage internal 64 */ 65 static Writer getWriter(OutputStream output, String encoding) 66 throws UnsupportedEncodingException 67 { 68 69 for (int i = 0; i < _encodings.length; ++i) 70 { 71 if (_encodings[i].name.equalsIgnoreCase(encoding)) 72 { 73 try 74 { 75 String javaName = _encodings[i].javaName; 76 OutputStreamWriter osw = new OutputStreamWriter(output,javaName); 77 return osw; 78 } 79 catch (java.lang.IllegalArgumentException iae) // java 1.1.8 80 { 81 // keep trying 82 } 83 catch (UnsupportedEncodingException usee) 84 { 85 86 // keep trying 87 } 88 } 89 } 90 91 try 92 { 93 return new OutputStreamWriter(output, encoding); 94 } 95 catch (java.lang.IllegalArgumentException iae) // java 1.1.8 96 { 97 throw new UnsupportedEncodingException(encoding); 98 } 99 } 100 101 /** 102 * Returns the EncodingInfo object for the specified 103 * encoding, never null, although the encoding name 104 * inside the returned EncodingInfo object will be if 105 * we can't find a "real" EncodingInfo for the encoding. 106 * <p> 107 * This is not a public API. 108 * 109 * @param encoding The encoding 110 * @return The object that is used to determine if 111 * characters are in the given encoding. 112 * @xsl.usage internal 113 */ 114 static EncodingInfo getEncodingInfo(String encoding) 115 { 116 EncodingInfo ei; 117 118 String normalizedEncoding = toUpperCaseFast(encoding); 119 ei = (EncodingInfo) _encodingTableKeyJava.get(normalizedEncoding); 120 if (ei == null) 121 ei = (EncodingInfo) _encodingTableKeyMime.get(normalizedEncoding); 122 if (ei == null) { 123 // We shouldn't have to do this, but just in case. 124 ei = new EncodingInfo(null,null, '\u0000'); 125 } 126 127 return ei; 128 } 129 130 /** 131 * Determines if the encoding specified was recognized by the 132 * serializer or not. 133 * 134 * @param encoding The encoding 135 * @return boolean - true if the encoding was recognized else false 136 */ 137 public static boolean isRecognizedEncoding(String encoding) 138 { 139 EncodingInfo ei; 140 141 String normalizedEncoding = encoding.toUpperCase(); 142 ei = (EncodingInfo) _encodingTableKeyJava.get(normalizedEncoding); 143 if (ei == null) 144 ei = (EncodingInfo) _encodingTableKeyMime.get(normalizedEncoding); 145 if (ei != null) 146 return true; 147 return false; 148 } 149 150 /** 151 * A fast and cheap way to uppercase a String that is 152 * only made of printable ASCII characters. 153 * <p> 154 * This is not a public API. 155 * @param s a String of ASCII characters 156 * @return an uppercased version of the input String, 157 * possibly the same String. 158 * @xsl.usage internal 159 */ 160 static private String toUpperCaseFast(final String s) { 161 162 boolean different = false; 163 final int mx = s.length(); 164 char[] chars = new char[mx]; 165 for (int i=0; i < mx; i++) { 166 char ch = s.charAt(i); 167 // is the character a lower case ASCII one? 168 if ('a' <= ch && ch <= 'z') { 169 // a cheap and fast way to uppercase that is good enough 170 ch = (char) (ch + ('A' - 'a')); 171 different = true; // the uppercased String is different 172 } 173 chars[i] = ch; 174 } 175 176 // A little optimization, don't call String.valueOf() if 177 // the uppercased string is the same as the input string. 178 final String upper; 179 if (different) 180 upper = String.valueOf(chars); 181 else 182 upper = s; 183 184 return upper; 185 } 186 187 /** The default encoding, ISO style, ISO style. */ 188 static final String DEFAULT_MIME_ENCODING = "UTF-8"; 189 190 /** 191 * Get the proper mime encoding. From the XSLT recommendation: "The encoding 192 * attribute specifies the preferred encoding to use for outputting the result 193 * tree. XSLT processors are required to respect values of UTF-8 and UTF-16. 194 * For other values, if the XSLT processor does not support the specified 195 * encoding it may signal an error; if it does not signal an error it should 196 * use UTF-8 or UTF-16 instead. The XSLT processor must not use an encoding 197 * whose name does not match the EncName production of the XML Recommendation 198 * [XML]. If no encoding attribute is specified, then the XSLT processor should 199 * use either UTF-8 or UTF-16." 200 * <p> 201 * This is not a public API. 202 * 203 * @param encoding Reference to java-style encoding string, which may be null, 204 * in which case a default will be found. 205 * 206 * @return The ISO-style encoding string, or null if failure. 207 * @xsl.usage internal 208 */ 209 static String getMimeEncoding(String encoding) 210 { 211 212 if (null == encoding) 213 { 214 try 215 { 216 217 // Get the default system character encoding. This may be 218 // incorrect if they passed in a writer, but right now there 219 // seems to be no way to get the encoding from a writer. 220 encoding = System.getProperty("file.encoding", "UTF8"); 221 222 if (null != encoding) 223 { 224 225 /* 226 * See if the mime type is equal to UTF8. If you don't 227 * do that, then convertJava2MimeEncoding will convert 228 * 8859_1 to "ISO-8859-1", which is not what we want, 229 * I think, and I don't think I want to alter the tables 230 * to convert everything to UTF-8. 231 */ 232 String jencoding = 233 (encoding.equalsIgnoreCase("Cp1252") 234 || encoding.equalsIgnoreCase("ISO8859_1") 235 || encoding.equalsIgnoreCase("8859_1") 236 || encoding.equalsIgnoreCase("UTF8")) 237 ? DEFAULT_MIME_ENCODING 238 : convertJava2MimeEncoding(encoding); 239 240 encoding = 241 (null != jencoding) ? jencoding : DEFAULT_MIME_ENCODING; 242 } 243 else 244 { 245 encoding = DEFAULT_MIME_ENCODING; 246 } 247 } 248 catch (SecurityException se) 249 { 250 encoding = DEFAULT_MIME_ENCODING; 251 } 252 } 253 else 254 { 255 encoding = convertJava2MimeEncoding(encoding); 256 } 257 258 return encoding; 259 } 260 261 /** 262 * Try the best we can to convert a Java encoding to a XML-style encoding. 263 * <p> 264 * This is not a public API. 265 * @param encoding non-null reference to encoding string, java style. 266 * 267 * @return ISO-style encoding string. 268 * @xsl.usage internal 269 */ 270 private static String convertJava2MimeEncoding(String encoding) 271 { 272 EncodingInfo enc = 273 (EncodingInfo) _encodingTableKeyJava.get(toUpperCaseFast(encoding)); 274 if (null != enc) 275 return enc.name; 276 return encoding; 277 } 278 279 /** 280 * Try the best we can to convert a Java encoding to a XML-style encoding. 281 * <p> 282 * This is not a public API. 283 * 284 * @param encoding non-null reference to encoding string, java style. 285 * 286 * @return ISO-style encoding string. 287 * <p> 288 * This method is not a public API. 289 * @xsl.usage internal 290 */ 291 public static String convertMime2JavaEncoding(String encoding) 292 { 293 294 for (int i = 0; i < _encodings.length; ++i) 295 { 296 if (_encodings[i].name.equalsIgnoreCase(encoding)) 297 { 298 return _encodings[i].javaName; 299 } 300 } 301 302 return encoding; 303 } 304 305 /** 306 * Load a list of all the supported encodings. 307 * 308 * System property "encodings" formatted using URL syntax may define an 309 * external encodings list. Thanks to Sergey Ushakov for the code 310 * contribution! 311 * @xsl.usage internal 312 */ 313 private static EncodingInfo[] loadEncodingInfo() 314 { 315 try 316 { 317 final InputStream is; 318 319 SecuritySupport ss = SecuritySupport.getInstance(); 320 is = ss.getResourceAsStream(ObjectFactory.findClassLoader(), 321 ENCODINGS_FILE); 322 323 Properties props = new Properties(); 324 if (is != null) { 325 props.load(is); 326 is.close(); 327 } else { 328 // Seems to be no real need to force failure here, let the 329 // system do its best... The issue is not really very critical, 330 // and the output will be in any case _correct_ though maybe not 331 // always human-friendly... :) 332 // But maybe report/log the resource problem? 333 // Any standard ways to report/log errors (in static context)? 334 } 335 336 int totalEntries = props.size(); 337 338 List encodingInfo_list = new ArrayList(); 339 Enumeration keys = props.keys(); 340 for (int i = 0; i < totalEntries; ++i) 341 { 342 String javaName = (String) keys.nextElement(); 343 String val = props.getProperty(javaName); 344 int len = lengthOfMimeNames(val); 345 346 String mimeName; 347 char highChar; 348 if (len == 0) 349 { 350 // There is no property value, only the javaName, so try and recover 351 mimeName = javaName; 352 highChar = '\u0000'; // don't know the high code point, will need to test every character 353 } 354 else 355 { 356 try { 357 // Get the substring after the Mime names 358 final String highVal = val.substring(len).trim(); 359 highChar = (char) Integer.decode(highVal).intValue(); 360 } 361 catch( NumberFormatException e) { 362 highChar = 0; 363 } 364 String mimeNames = val.substring(0, len); 365 StringTokenizer st = 366 new StringTokenizer(mimeNames, ","); 367 for (boolean first = true; 368 st.hasMoreTokens(); 369 first = false) 370 { 371 mimeName = st.nextToken(); 372 EncodingInfo ei = new EncodingInfo(mimeName, javaName, highChar); 373 encodingInfo_list.add(ei); 374 _encodingTableKeyMime.put(mimeName.toUpperCase(), ei); 375 if (first) 376 _encodingTableKeyJava.put(javaName.toUpperCase(), ei); 377 } 378 } 379 } 380 // Convert the Vector of EncodingInfo objects into an array of them, 381 // as that is the kind of thing this method returns. 382 EncodingInfo[] ret_ei = new EncodingInfo[encodingInfo_list.size()]; 383 encodingInfo_list.toArray(ret_ei); 384 return ret_ei; 385 } 386 catch (java.net.MalformedURLException mue) 387 { 388 throw new org.apache.xml.serializer.utils.WrappedRuntimeException(mue); 389 } 390 catch (java.io.IOException ioe) 391 { 392 throw new org.apache.xml.serializer.utils.WrappedRuntimeException(ioe); 393 } 394 } 395 396 /** 397 * Get the length of the Mime names within the property value 398 * @param val The value of the property, which should contain a comma 399 * separated list of Mime names, followed optionally by a space and the 400 * high char value 401 * @return 402 */ 403 private static int lengthOfMimeNames(String val) { 404 // look for the space preceding the optional high char 405 int len = val.indexOf(' '); 406 // If len is zero it means the optional part is not there, so 407 // the value must be all Mime names, so set the length appropriately 408 if (len < 0) 409 len = val.length(); 410 411 return len; 412 } 413 414 /** 415 * Return true if the character is the high member of a surrogate pair. 416 * <p> 417 * This is not a public API. 418 * @param ch the character to test 419 * @xsl.usage internal 420 */ 421 static boolean isHighUTF16Surrogate(char ch) { 422 return ('\uD800' <= ch && ch <= '\uDBFF'); 423 } 424 /** 425 * Return true if the character is the low member of a surrogate pair. 426 * <p> 427 * This is not a public API. 428 * @param ch the character to test 429 * @xsl.usage internal 430 */ 431 static boolean isLowUTF16Surrogate(char ch) { 432 return ('\uDC00' <= ch && ch <= '\uDFFF'); 433 } 434 /** 435 * Return the unicode code point represented by the high/low surrogate pair. 436 * <p> 437 * This is not a public API. 438 * @param highSurrogate the high char of the high/low pair 439 * @param lowSurrogate the low char of the high/low pair 440 * @xsl.usage internal 441 */ 442 static int toCodePoint(char highSurrogate, char lowSurrogate) { 443 int codePoint = 444 ((highSurrogate - 0xd800) << 10) 445 + (lowSurrogate - 0xdc00) 446 + 0x10000; 447 return codePoint; 448 } 449 /** 450 * Return the unicode code point represented by the char. 451 * A bit of a dummy method, since all it does is return the char, 452 * but as an int value. 453 * <p> 454 * This is not a public API. 455 * @param ch the char. 456 * @xsl.usage internal 457 */ 458 static int toCodePoint(char ch) { 459 int codePoint = ch; 460 return codePoint; 461 } 462 463 /** 464 * Characters with values at or below the high code point are 465 * in the encoding. Code point values above this one may or may 466 * not be in the encoding, but lower ones certainly are. 467 * <p> 468 * This is for performance. 469 * 470 * @param encoding The encoding 471 * @return The code point for which characters at or below this code point 472 * are in the encoding. Characters with higher code point may or may not be 473 * in the encoding. A value of zero is returned if the high code point is unknown. 474 * <p> 475 * This method is not a public API. 476 * @xsl.usage internal 477 */ 478 static public char getHighChar(String encoding) 479 { 480 final char highCodePoint; 481 EncodingInfo ei; 482 483 String normalizedEncoding = toUpperCaseFast(encoding); 484 ei = (EncodingInfo) _encodingTableKeyJava.get(normalizedEncoding); 485 if (ei == null) 486 ei = (EncodingInfo) _encodingTableKeyMime.get(normalizedEncoding); 487 if (ei != null) 488 highCodePoint = ei.getHighChar(); 489 else 490 highCodePoint = 0; 491 return highCodePoint; 492 } 493 494 private static final Hashtable _encodingTableKeyJava = new Hashtable(); 495 private static final Hashtable _encodingTableKeyMime = new Hashtable(); 496 private static final EncodingInfo[] _encodings = loadEncodingInfo(); 497} 498