MimeStreamParser.java revision bc47398187c6ffd132435e51d8d61e6ec79a79db
1/****************************************************************
2 * Licensed to the Apache Software Foundation (ASF) under one   *
3 * or more contributor license agreements.  See the NOTICE file *
4 * distributed with this work for additional information        *
5 * regarding copyright ownership.  The ASF licenses this file   *
6 * to you under the Apache License, Version 2.0 (the            *
7 * "License"); you may not use this file except in compliance   *
8 * with the License.  You may obtain a copy of the License at   *
9 *                                                              *
10 *   http://www.apache.org/licenses/LICENSE-2.0                 *
11 *                                                              *
12 * Unless required by applicable law or agreed to in writing,   *
13 * software distributed under the License is distributed on an  *
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
15 * KIND, either express or implied.  See the License for the    *
16 * specific language governing permissions and limitations      *
17 * under the License.                                           *
18 ****************************************************************/
19
20package org.apache.james.mime4j;
21
22import com.android.mail.utils.LoggingInputStream;
23
24import org.apache.james.mime4j.decoder.Base64InputStream;
25import org.apache.james.mime4j.decoder.QuotedPrintableInputStream;
26
27import java.io.IOException;
28import java.io.InputStream;
29import java.util.BitSet;
30import java.util.LinkedList;
31
32/**
33 * <p>
34 * Parses MIME (or RFC822) message streams of bytes or characters and reports
35 * parsing events to a <code>ContentHandler</code> instance.
36 * </p>
37 * <p>
38 * Typical usage:<br/>
39 * <pre>
40 *      ContentHandler handler = new MyHandler();
41 *      MimeStreamParser parser = new MimeStreamParser();
42 *      parser.setContentHandler(handler);
43 *      parser.parse(new BufferedInputStream(new FileInputStream("mime.msg")));
44 * </pre>
45 * <strong>NOTE:</strong> All lines must end with CRLF
46 * (<code>\r\n</code>). If you are unsure of the line endings in your stream
47 * you should wrap it in a {@link org.apache.james.mime4j.EOLConvertingInputStream} instance.
48 *
49 *
50 * @version $Id: MimeStreamParser.java,v 1.8 2005/02/11 10:12:02 ntherning Exp $
51 */
52public class MimeStreamParser {
53    private static final Log log = LogFactory.getLog(MimeStreamParser.class);
54
55    private static final boolean DEBUG_LOG_MESSAGE = false; //DO NOT RELEASE AS 'TRUE'
56
57    private static BitSet fieldChars = null;
58
59    private RootInputStream rootStream = null;
60    private LinkedList<BodyDescriptor> bodyDescriptors = new LinkedList<BodyDescriptor>();
61    private ContentHandler handler = null;
62    private boolean raw = false;
63
64    static {
65        fieldChars = new BitSet();
66        for (int i = 0x21; i <= 0x39; i++) {
67            fieldChars.set(i);
68        }
69        for (int i = 0x3b; i <= 0x7e; i++) {
70            fieldChars.set(i);
71        }
72    }
73
74    /**
75     * Creates a new <code>MimeStreamParser</code> instance.
76     */
77    public MimeStreamParser() {
78    }
79
80    /**
81     * Parses a stream of bytes containing a MIME message.
82     *
83     * @param is the stream to parse.
84     * @throws IOException on I/O errors.
85     */
86    public void parse(InputStream is) throws IOException {
87        if (DEBUG_LOG_MESSAGE) {
88            is = new LoggingInputStream(is, "MIME", true);
89        }
90        rootStream = new RootInputStream(is);
91        parseMessage(rootStream);
92    }
93
94    /**
95     * Determines if this parser is currently in raw mode.
96     *
97     * @return <code>true</code> if in raw mode, <code>false</code>
98     *         otherwise.
99     * @see #setRaw(boolean)
100     */
101    public boolean isRaw() {
102        return raw;
103    }
104
105    /**
106     * Enables or disables raw mode. In raw mode all future entities
107     * (messages or body parts) in the stream will be reported to the
108     * {@link ContentHandler#raw(InputStream)} handler method only.
109     * The stream will contain the entire unparsed entity contents
110     * including header fields and whatever is in the body.
111     *
112     * @param raw <code>true</code> enables raw mode, <code>false</code>
113     *        disables it.
114     */
115    public void setRaw(boolean raw) {
116        this.raw = raw;
117    }
118
119    /**
120     * Finishes the parsing and stops reading lines.
121     * NOTE: No more lines will be parsed but the parser
122     * will still call
123     * {@link ContentHandler#endMultipart()},
124     * {@link ContentHandler#endBodyPart()},
125     * {@link ContentHandler#endMessage()}, etc to match previous calls
126     * to
127     * {@link ContentHandler#startMultipart(BodyDescriptor)},
128     * {@link ContentHandler#startBodyPart()},
129     * {@link ContentHandler#startMessage()}, etc.
130     */
131    public void stop() {
132        rootStream.truncate();
133    }
134
135    /**
136     * Parses an entity which consists of a header followed by a body containing
137     * arbitrary data, body parts or an embedded message.
138     *
139     * @param is the stream to parse.
140     * @throws IOException on I/O errors.
141     */
142    private void parseEntity(InputStream is) throws IOException {
143        BodyDescriptor bd = parseHeader(is);
144
145        if (bd.isMultipart()) {
146            bodyDescriptors.addFirst(bd);
147
148            handler.startMultipart(bd);
149
150            MimeBoundaryInputStream tempIs =
151                new MimeBoundaryInputStream(is, bd.getBoundary());
152            handler.preamble(new CloseShieldInputStream(tempIs));
153            tempIs.consume();
154
155            while (tempIs.hasMoreParts()) {
156                tempIs = new MimeBoundaryInputStream(is, bd.getBoundary());
157                parseBodyPart(tempIs);
158                tempIs.consume();
159                if (tempIs.parentEOF()) {
160//                    if (log.isWarnEnabled()) {
161//                        log.warn("Line " + rootStream.getLineNumber()
162//                                + ": Body part ended prematurely. "
163//                                + "Higher level boundary detected or "
164//                                + "EOF reached.");
165//                    }
166                    break;
167                }
168            }
169
170            handler.epilogue(new CloseShieldInputStream(is));
171
172            handler.endMultipart();
173
174            bodyDescriptors.removeFirst();
175
176        } else if (bd.isMessage()) {
177            if (bd.isBase64Encoded()) {
178                log.warn("base64 encoded message/rfc822 detected");
179                is = new EOLConvertingInputStream(
180                        new Base64InputStream(is));
181            } else if (bd.isQuotedPrintableEncoded()) {
182                log.warn("quoted-printable encoded message/rfc822 detected");
183                is = new EOLConvertingInputStream(
184                        new QuotedPrintableInputStream(is));
185            }
186            bodyDescriptors.addFirst(bd);
187            parseMessage(is);
188            bodyDescriptors.removeFirst();
189        } else {
190            handler.body(bd, new CloseShieldInputStream(is));
191        }
192
193        /*
194         * Make sure the stream has been consumed.
195         */
196        while (is.read() != -1) {
197        }
198    }
199
200    private void parseMessage(InputStream is) throws IOException {
201        if (raw) {
202            handler.raw(new CloseShieldInputStream(is));
203        } else {
204            handler.startMessage();
205            parseEntity(is);
206            handler.endMessage();
207        }
208    }
209
210    private void parseBodyPart(InputStream is) throws IOException {
211        if (raw) {
212            handler.raw(new CloseShieldInputStream(is));
213        } else {
214            handler.startBodyPart();
215            parseEntity(is);
216            handler.endBodyPart();
217        }
218    }
219
220    /**
221     * Parses a header.
222     *
223     * @param is the stream to parse.
224     * @return a <code>BodyDescriptor</code> describing the body following
225     *         the header.
226     */
227    private BodyDescriptor parseHeader(InputStream is) throws IOException {
228        BodyDescriptor bd = new BodyDescriptor(bodyDescriptors.isEmpty()
229                        ? null : (BodyDescriptor) bodyDescriptors.getFirst());
230
231        handler.startHeader();
232
233        int lineNumber = rootStream.getLineNumber();
234
235        StringBuffer sb = new StringBuffer();
236        int curr = 0;
237        int prev = 0;
238        while ((curr = is.read()) != -1) {
239            if (curr == '\n' && (prev == '\n' || prev == 0)) {
240                /*
241                 * [\r]\n[\r]\n or an immediate \r\n have been seen.
242                 */
243                sb.deleteCharAt(sb.length() - 1);
244                break;
245            }
246            sb.append((char) curr);
247            prev = curr == '\r' ? prev : curr;
248        }
249
250//        if (curr == -1 && log.isWarnEnabled()) {
251//            log.warn("Line " + rootStream.getLineNumber()
252//                    + ": Unexpected end of headers detected. "
253//                    + "Boundary detected in header or EOF reached.");
254//        }
255
256        int start = 0;
257        int pos = 0;
258        int startLineNumber = lineNumber;
259        while (pos < sb.length()) {
260            while (pos < sb.length() && sb.charAt(pos) != '\r') {
261                pos++;
262            }
263            if (pos < sb.length() - 1 && sb.charAt(pos + 1) != '\n') {
264                pos++;
265                continue;
266            }
267
268            if (pos >= sb.length() - 2 || fieldChars.get(sb.charAt(pos + 2))) {
269
270                /*
271                 * field should be the complete field data excluding the
272                 * trailing \r\n.
273                 */
274                String field = sb.substring(start, pos);
275                start = pos + 2;
276
277                /*
278                 * Check for a valid field.
279                 */
280                int index = field.indexOf(':');
281                boolean valid = false;
282                if (index != -1 && fieldChars.get(field.charAt(0))) {
283                    valid = true;
284                    String fieldName = field.substring(0, index).trim();
285                    for (int i = 0; i < fieldName.length(); i++) {
286                        if (!fieldChars.get(fieldName.charAt(i))) {
287                            valid = false;
288                            break;
289                        }
290                    }
291
292                    if (valid) {
293                        handler.field(field);
294                        bd.addField(fieldName, field.substring(index + 1));
295                    }
296                }
297
298                if (!valid && log.isWarnEnabled()) {
299                    log.warn("Line " + startLineNumber
300                            + ": Ignoring invalid field: '" + field.trim() + "'");
301                }
302
303                startLineNumber = lineNumber;
304            }
305
306            pos += 2;
307            lineNumber++;
308        }
309
310        handler.endHeader();
311
312        return bd;
313    }
314
315    /**
316     * Sets the <code>ContentHandler</code> to use when reporting
317     * parsing events.
318     *
319     * @param h the <code>ContentHandler</code>.
320     */
321    public void setContentHandler(ContentHandler h) {
322        this.handler = h;
323    }
324
325}
326