DecoderUtil.java revision 8978aac1977408b05e386ae846c30920c7faa0a6
1/****************************************************************
2 * Licensed to the Apache Software Foundation (ASF) under one   *
3 * or more contributor license agreements.  See the NOTICE file *
4 * distributed with this work for additional information        *
5 * regarding copyright ownership.  The ASF licenses this file   *
6 * to you under the Apache License, Version 2.0 (the            *
7 * "License"); you may not use this file except in compliance   *
8 * with the License.  You may obtain a copy of the License at   *
9 *                                                              *
10 *   http://www.apache.org/licenses/LICENSE-2.0                 *
11 *                                                              *
12 * Unless required by applicable law or agreed to in writing,   *
13 * software distributed under the License is distributed on an  *
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
15 * KIND, either express or implied.  See the License for the    *
16 * specific language governing permissions and limitations      *
17 * under the License.                                           *
18 ****************************************************************/
19
20package org.apache.james.mime4j.decoder;
21
22import java.io.ByteArrayInputStream;
23import java.io.ByteArrayOutputStream;
24import java.io.IOException;
25import java.io.UnsupportedEncodingException;
26
27import org.apache.commons.logging.Log;
28import org.apache.commons.logging.LogFactory;
29import org.apache.james.mime4j.util.CharsetUtil;
30
31/**
32 * Static methods for decoding strings, byte arrays and encoded words.
33 *
34 *
35 * @version $Id: DecoderUtil.java,v 1.3 2005/02/07 15:33:59 ntherning Exp $
36 */
37public class DecoderUtil {
38    private static Log log = LogFactory.getLog(DecoderUtil.class);
39
40    /**
41     * Decodes a string containing quoted-printable encoded data.
42     *
43     * @param s the string to decode.
44     * @return the decoded bytes.
45     */
46    public static byte[] decodeBaseQuotedPrintable(String s) {
47        ByteArrayOutputStream baos = new ByteArrayOutputStream();
48
49        try {
50            byte[] bytes = s.getBytes("US-ASCII");
51
52            QuotedPrintableInputStream is = new QuotedPrintableInputStream(
53                                               new ByteArrayInputStream(bytes));
54
55            int b = 0;
56            while ((b = is.read()) != -1) {
57                baos.write(b);
58            }
59        } catch (IOException e) {
60            /*
61             * This should never happen!
62             */
63            log.error(e);
64        }
65
66        return baos.toByteArray();
67    }
68
69    /**
70     * Decodes a string containing base64 encoded data.
71     *
72     * @param s the string to decode.
73     * @return the decoded bytes.
74     */
75    public static byte[] decodeBase64(String s) {
76        ByteArrayOutputStream baos = new ByteArrayOutputStream();
77
78        try {
79            byte[] bytes = s.getBytes("US-ASCII");
80
81            Base64InputStream is = new Base64InputStream(
82                                        new ByteArrayInputStream(bytes));
83
84            int b = 0;
85            while ((b = is.read()) != -1) {
86                baos.write(b);
87            }
88        } catch (IOException e) {
89            /*
90             * This should never happen!
91             */
92            log.error(e);
93        }
94
95        return baos.toByteArray();
96    }
97
98    /**
99     * Decodes an encoded word encoded with the 'B' encoding (described in
100     * RFC 2047) found in a header field body.
101     *
102     * @param encodedWord the encoded word to decode.
103     * @param charset the Java charset to use.
104     * @return the decoded string.
105     * @throws UnsupportedEncodingException if the given Java charset isn't
106     *         supported.
107     */
108    public static String decodeB(String encodedWord, String charset)
109            throws UnsupportedEncodingException {
110
111        return new String(decodeBase64(encodedWord), charset);
112    }
113
114    /**
115     * Decodes an encoded word encoded with the 'Q' encoding (described in
116     * RFC 2047) found in a header field body.
117     *
118     * @param encodedWord the encoded word to decode.
119     * @param charset the Java charset to use.
120     * @return the decoded string.
121     * @throws UnsupportedEncodingException if the given Java charset isn't
122     *         supported.
123     */
124    public static String decodeQ(String encodedWord, String charset)
125            throws UnsupportedEncodingException {
126
127        /*
128         * Replace _ with =20
129         */
130        StringBuffer sb = new StringBuffer();
131        for (int i = 0; i < encodedWord.length(); i++) {
132            char c = encodedWord.charAt(i);
133            if (c == '_') {
134                sb.append("=20");
135            } else {
136                sb.append(c);
137            }
138        }
139
140        return new String(decodeBaseQuotedPrintable(sb.toString()), charset);
141    }
142
143    /**
144     * Decodes a string containing encoded words as defined by RFC 2047.
145     * Encoded words in have the form
146     * =?charset?enc?Encoded word?= where enc is either 'Q' or 'q' for
147     * quoted-printable and 'B' or 'b' for Base64.
148     *
149     * @param body the string to decode.
150     * @return the decoded string.
151     */
152    public static String decodeEncodedWords(String body) {
153        StringBuffer sb = new StringBuffer();
154
155        int p1 = 0;
156        int p2 = 0;
157
158        try {
159
160            /*
161             * Encoded words in headers have the form
162             * =?charset?enc?Encoded word?= where enc is either 'Q' or 'q' for
163             * quoted printable and 'B' and 'b' for Base64
164             */
165
166            while (p2 < body.length()) {
167                /*
168                 * Find beginning of first encoded word
169                 */
170                p1 = body.indexOf("=?", p2);
171                if (p1 == -1) {
172                    /*
173                     * None found. Emit the rest of the header and exit.
174                     */
175                    sb.append(body.substring(p2));
176                    break;
177                }
178
179                /*
180                 * p2 points to the previously found end marker or the start
181                 * of the entire header text. Append the text between that
182                 * marker and the one pointed to by p1.
183                 */
184                if (p1 - p2 > 0) {
185                    sb.append(body.substring(p2, p1));
186                }
187
188                /*
189                 * Find the first and second '?':s after the marker pointed to
190                 * by p1.
191                 */
192                int t1 = body.indexOf('?', p1 + 2);
193                int t2 = t1 != -1 ? body.indexOf('?', t1 + 1) : -1;
194
195                /*
196                 * Find this words end marker.
197                 */
198                p2 = t2 != -1 ? body.indexOf("?=", t2 + 1) : -1;
199                if (p2 == -1) {
200                    if (t2 != -1 && (body.length() - 1 == t2 || body.charAt(t2 + 1) == '=')) {
201                        /*
202                         * Treat "=?charset?enc?" and "=?charset?enc?=" as
203                         * empty strings.
204                         */
205                        p2 = t2;
206                    } else {
207                        /*
208                         * No end marker was found. Append the rest of the
209                         * header and exit.
210                         */
211                        sb.append(body.substring(p1));
212                        break;
213                    }
214                }
215
216                /*
217                 * [p1+2, t1] -> charset
218                 * [t1+1, t2] -> encoding
219                 * [t2+1, p2] -> encoded word
220                 */
221
222                String decodedWord = null;
223                if (t2 == p2) {
224                    /*
225                     * The text is empty
226                     */
227                    decodedWord = "";
228                } else {
229
230                    String mimeCharset = body.substring(p1 + 2, t1);
231                    String enc = body.substring(t1 + 1, t2);
232                    String encodedWord = body.substring(t2 + 1, p2);
233
234                    /*
235                     * Convert the MIME charset to a corresponding Java one.
236                     */
237                    String charset = CharsetUtil.toJavaCharset(mimeCharset);
238                    if (charset == null) {
239                        decodedWord = body.substring(p1, p2 + 2);
240                        if (log.isWarnEnabled()) {
241                            log.warn("MIME charset '" + mimeCharset
242                                    + "' in header field doesn't have a "
243                                    +"corresponding Java charset");
244                        }
245                    } else if (!CharsetUtil.isDecodingSupported(charset)) {
246                        decodedWord = body.substring(p1, p2 + 2);
247                        if (log.isWarnEnabled()) {
248                            log.warn("Current JDK doesn't support decoding "
249                                   + "of charset '" + charset
250                                   + "' (MIME charset '"
251                                   + mimeCharset + "')");
252                        }
253                    } else {
254                        if (enc.equalsIgnoreCase("Q")) {
255                            decodedWord = DecoderUtil.decodeQ(encodedWord, charset);
256                        } else if (enc.equalsIgnoreCase("B")) {
257                            decodedWord = DecoderUtil.decodeB(encodedWord, charset);
258                        } else {
259                            decodedWord = encodedWord;
260                            if (log.isWarnEnabled()) {
261                                log.warn("Warning: Unknown encoding in "
262                                        + "header field '" + enc + "'");
263                            }
264                        }
265                    }
266                }
267                p2 += 2;
268                sb.append(decodedWord);
269            }
270        } catch (Throwable t) {
271            log.error("Decoding header field body '" + body + "'", t);
272        }
273
274        return sb.toString();
275    }
276}
277