DecoderUtil.java revision 8978aac1977408b05e386ae846c30920c7faa0a6
1/**************************************************************** 2 * Licensed to the Apache Software Foundation (ASF) under one * 3 * or more contributor license agreements. See the NOTICE file * 4 * distributed with this work for additional information * 5 * regarding copyright ownership. The ASF licenses this file * 6 * to you under the Apache License, Version 2.0 (the * 7 * "License"); you may not use this file except in compliance * 8 * with the License. You may obtain a copy of the License at * 9 * * 10 * http://www.apache.org/licenses/LICENSE-2.0 * 11 * * 12 * Unless required by applicable law or agreed to in writing, * 13 * software distributed under the License is distributed on an * 14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * 15 * KIND, either express or implied. See the License for the * 16 * specific language governing permissions and limitations * 17 * under the License. * 18 ****************************************************************/ 19 20package org.apache.james.mime4j.decoder; 21 22import java.io.ByteArrayInputStream; 23import java.io.ByteArrayOutputStream; 24import java.io.IOException; 25import java.io.UnsupportedEncodingException; 26 27import org.apache.commons.logging.Log; 28import org.apache.commons.logging.LogFactory; 29import org.apache.james.mime4j.util.CharsetUtil; 30 31/** 32 * Static methods for decoding strings, byte arrays and encoded words. 33 * 34 * 35 * @version $Id: DecoderUtil.java,v 1.3 2005/02/07 15:33:59 ntherning Exp $ 36 */ 37public class DecoderUtil { 38 private static Log log = LogFactory.getLog(DecoderUtil.class); 39 40 /** 41 * Decodes a string containing quoted-printable encoded data. 42 * 43 * @param s the string to decode. 44 * @return the decoded bytes. 45 */ 46 public static byte[] decodeBaseQuotedPrintable(String s) { 47 ByteArrayOutputStream baos = new ByteArrayOutputStream(); 48 49 try { 50 byte[] bytes = s.getBytes("US-ASCII"); 51 52 QuotedPrintableInputStream is = new QuotedPrintableInputStream( 53 new ByteArrayInputStream(bytes)); 54 55 int b = 0; 56 while ((b = is.read()) != -1) { 57 baos.write(b); 58 } 59 } catch (IOException e) { 60 /* 61 * This should never happen! 62 */ 63 log.error(e); 64 } 65 66 return baos.toByteArray(); 67 } 68 69 /** 70 * Decodes a string containing base64 encoded data. 71 * 72 * @param s the string to decode. 73 * @return the decoded bytes. 74 */ 75 public static byte[] decodeBase64(String s) { 76 ByteArrayOutputStream baos = new ByteArrayOutputStream(); 77 78 try { 79 byte[] bytes = s.getBytes("US-ASCII"); 80 81 Base64InputStream is = new Base64InputStream( 82 new ByteArrayInputStream(bytes)); 83 84 int b = 0; 85 while ((b = is.read()) != -1) { 86 baos.write(b); 87 } 88 } catch (IOException e) { 89 /* 90 * This should never happen! 91 */ 92 log.error(e); 93 } 94 95 return baos.toByteArray(); 96 } 97 98 /** 99 * Decodes an encoded word encoded with the 'B' encoding (described in 100 * RFC 2047) found in a header field body. 101 * 102 * @param encodedWord the encoded word to decode. 103 * @param charset the Java charset to use. 104 * @return the decoded string. 105 * @throws UnsupportedEncodingException if the given Java charset isn't 106 * supported. 107 */ 108 public static String decodeB(String encodedWord, String charset) 109 throws UnsupportedEncodingException { 110 111 return new String(decodeBase64(encodedWord), charset); 112 } 113 114 /** 115 * Decodes an encoded word encoded with the 'Q' encoding (described in 116 * RFC 2047) found in a header field body. 117 * 118 * @param encodedWord the encoded word to decode. 119 * @param charset the Java charset to use. 120 * @return the decoded string. 121 * @throws UnsupportedEncodingException if the given Java charset isn't 122 * supported. 123 */ 124 public static String decodeQ(String encodedWord, String charset) 125 throws UnsupportedEncodingException { 126 127 /* 128 * Replace _ with =20 129 */ 130 StringBuffer sb = new StringBuffer(); 131 for (int i = 0; i < encodedWord.length(); i++) { 132 char c = encodedWord.charAt(i); 133 if (c == '_') { 134 sb.append("=20"); 135 } else { 136 sb.append(c); 137 } 138 } 139 140 return new String(decodeBaseQuotedPrintable(sb.toString()), charset); 141 } 142 143 /** 144 * Decodes a string containing encoded words as defined by RFC 2047. 145 * Encoded words in have the form 146 * =?charset?enc?Encoded word?= where enc is either 'Q' or 'q' for 147 * quoted-printable and 'B' or 'b' for Base64. 148 * 149 * @param body the string to decode. 150 * @return the decoded string. 151 */ 152 public static String decodeEncodedWords(String body) { 153 StringBuffer sb = new StringBuffer(); 154 155 int p1 = 0; 156 int p2 = 0; 157 158 try { 159 160 /* 161 * Encoded words in headers have the form 162 * =?charset?enc?Encoded word?= where enc is either 'Q' or 'q' for 163 * quoted printable and 'B' and 'b' for Base64 164 */ 165 166 while (p2 < body.length()) { 167 /* 168 * Find beginning of first encoded word 169 */ 170 p1 = body.indexOf("=?", p2); 171 if (p1 == -1) { 172 /* 173 * None found. Emit the rest of the header and exit. 174 */ 175 sb.append(body.substring(p2)); 176 break; 177 } 178 179 /* 180 * p2 points to the previously found end marker or the start 181 * of the entire header text. Append the text between that 182 * marker and the one pointed to by p1. 183 */ 184 if (p1 - p2 > 0) { 185 sb.append(body.substring(p2, p1)); 186 } 187 188 /* 189 * Find the first and second '?':s after the marker pointed to 190 * by p1. 191 */ 192 int t1 = body.indexOf('?', p1 + 2); 193 int t2 = t1 != -1 ? body.indexOf('?', t1 + 1) : -1; 194 195 /* 196 * Find this words end marker. 197 */ 198 p2 = t2 != -1 ? body.indexOf("?=", t2 + 1) : -1; 199 if (p2 == -1) { 200 if (t2 != -1 && (body.length() - 1 == t2 || body.charAt(t2 + 1) == '=')) { 201 /* 202 * Treat "=?charset?enc?" and "=?charset?enc?=" as 203 * empty strings. 204 */ 205 p2 = t2; 206 } else { 207 /* 208 * No end marker was found. Append the rest of the 209 * header and exit. 210 */ 211 sb.append(body.substring(p1)); 212 break; 213 } 214 } 215 216 /* 217 * [p1+2, t1] -> charset 218 * [t1+1, t2] -> encoding 219 * [t2+1, p2] -> encoded word 220 */ 221 222 String decodedWord = null; 223 if (t2 == p2) { 224 /* 225 * The text is empty 226 */ 227 decodedWord = ""; 228 } else { 229 230 String mimeCharset = body.substring(p1 + 2, t1); 231 String enc = body.substring(t1 + 1, t2); 232 String encodedWord = body.substring(t2 + 1, p2); 233 234 /* 235 * Convert the MIME charset to a corresponding Java one. 236 */ 237 String charset = CharsetUtil.toJavaCharset(mimeCharset); 238 if (charset == null) { 239 decodedWord = body.substring(p1, p2 + 2); 240 if (log.isWarnEnabled()) { 241 log.warn("MIME charset '" + mimeCharset 242 + "' in header field doesn't have a " 243 +"corresponding Java charset"); 244 } 245 } else if (!CharsetUtil.isDecodingSupported(charset)) { 246 decodedWord = body.substring(p1, p2 + 2); 247 if (log.isWarnEnabled()) { 248 log.warn("Current JDK doesn't support decoding " 249 + "of charset '" + charset 250 + "' (MIME charset '" 251 + mimeCharset + "')"); 252 } 253 } else { 254 if (enc.equalsIgnoreCase("Q")) { 255 decodedWord = DecoderUtil.decodeQ(encodedWord, charset); 256 } else if (enc.equalsIgnoreCase("B")) { 257 decodedWord = DecoderUtil.decodeB(encodedWord, charset); 258 } else { 259 decodedWord = encodedWord; 260 if (log.isWarnEnabled()) { 261 log.warn("Warning: Unknown encoding in " 262 + "header field '" + enc + "'"); 263 } 264 } 265 } 266 } 267 p2 += 2; 268 sb.append(decodedWord); 269 } 270 } catch (Throwable t) { 271 log.error("Decoding header field body '" + body + "'", t); 272 } 273 274 return sb.toString(); 275 } 276} 277