MimeStreamParser.java revision 8546e21e1e127845071c595beda16fc23eb0f58e
1/****************************************************************
2 * Licensed to the Apache Software Foundation (ASF) under one   *
3 * or more contributor license agreements.  See the NOTICE file *
4 * distributed with this work for additional information        *
5 * regarding copyright ownership.  The ASF licenses this file   *
6 * to you under the Apache License, Version 2.0 (the            *
7 * "License"); you may not use this file except in compliance   *
8 * with the License.  You may obtain a copy of the License at   *
9 *                                                              *
10 *   http://www.apache.org/licenses/LICENSE-2.0                 *
11 *                                                              *
12 * Unless required by applicable law or agreed to in writing,   *
13 * software distributed under the License is distributed on an  *
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
15 * KIND, either express or implied.  See the License for the    *
16 * specific language governing permissions and limitations      *
17 * under the License.                                           *
18 ****************************************************************/
19
20package org.apache.james.mime4j;
21
22import com.android.email.Email;
23import com.android.email.mail.transport.LoggingInputStream;
24
25import org.apache.james.mime4j.decoder.Base64InputStream;
26import org.apache.james.mime4j.decoder.QuotedPrintableInputStream;
27
28import java.io.IOException;
29import java.io.InputStream;
30import java.util.BitSet;
31import java.util.LinkedList;
32
33/**
34 * <p>
35 * Parses MIME (or RFC822) message streams of bytes or characters and reports
36 * parsing events to a <code>ContentHandler</code> instance.
37 * </p>
38 * <p>
39 * Typical usage:<br/>
40 * <pre>
41 *      ContentHandler handler = new MyHandler();
42 *      MimeStreamParser parser = new MimeStreamParser();
43 *      parser.setContentHandler(handler);
44 *      parser.parse(new BufferedInputStream(new FileInputStream("mime.msg")));
45 * </pre>
46 * <strong>NOTE:</strong> All lines must end with CRLF
47 * (<code>\r\n</code>). If you are unsure of the line endings in your stream
48 * you should wrap it in a {@link org.apache.james.mime4j.EOLConvertingInputStream} instance.
49 *
50 *
51 * @version $Id: MimeStreamParser.java,v 1.8 2005/02/11 10:12:02 ntherning Exp $
52 */
53public class MimeStreamParser {
54    private static final Log log = LogFactory.getLog(MimeStreamParser.class);
55
56    private static final boolean DEBUG_LOG_MESSAGE = false; //DO NOT RELEASE AS 'TRUE'
57
58    private static BitSet fieldChars = null;
59
60    private RootInputStream rootStream = null;
61    private LinkedList<BodyDescriptor> bodyDescriptors = new LinkedList<BodyDescriptor>();
62    private ContentHandler handler = null;
63    private boolean raw = false;
64
65    static {
66        fieldChars = new BitSet();
67        for (int i = 0x21; i <= 0x39; i++) {
68            fieldChars.set(i);
69        }
70        for (int i = 0x3b; i <= 0x7e; i++) {
71            fieldChars.set(i);
72        }
73    }
74
75    /**
76     * Creates a new <code>MimeStreamParser</code> instance.
77     */
78    public MimeStreamParser() {
79    }
80
81    /**
82     * Parses a stream of bytes containing a MIME message.
83     *
84     * @param is the stream to parse.
85     * @throws IOException on I/O errors.
86     */
87    public void parse(InputStream is) throws IOException {
88        if (DEBUG_LOG_MESSAGE && Email.DEBUG) {
89            is = new LoggingInputStream(is, "MIME", true);
90        }
91        rootStream = new RootInputStream(is);
92        parseMessage(rootStream);
93    }
94
95    /**
96     * Determines if this parser is currently in raw mode.
97     *
98     * @return <code>true</code> if in raw mode, <code>false</code>
99     *         otherwise.
100     * @see #setRaw(boolean)
101     */
102    public boolean isRaw() {
103        return raw;
104    }
105
106    /**
107     * Enables or disables raw mode. In raw mode all future entities
108     * (messages or body parts) in the stream will be reported to the
109     * {@link ContentHandler#raw(InputStream)} handler method only.
110     * The stream will contain the entire unparsed entity contents
111     * including header fields and whatever is in the body.
112     *
113     * @param raw <code>true</code> enables raw mode, <code>false</code>
114     *        disables it.
115     */
116    public void setRaw(boolean raw) {
117        this.raw = raw;
118    }
119
120    /**
121     * Finishes the parsing and stops reading lines.
122     * NOTE: No more lines will be parsed but the parser
123     * will still call
124     * {@link ContentHandler#endMultipart()},
125     * {@link ContentHandler#endBodyPart()},
126     * {@link ContentHandler#endMessage()}, etc to match previous calls
127     * to
128     * {@link ContentHandler#startMultipart(BodyDescriptor)},
129     * {@link ContentHandler#startBodyPart()},
130     * {@link ContentHandler#startMessage()}, etc.
131     */
132    public void stop() {
133        rootStream.truncate();
134    }
135
136    /**
137     * Parses an entity which consists of a header followed by a body containing
138     * arbitrary data, body parts or an embedded message.
139     *
140     * @param is the stream to parse.
141     * @throws IOException on I/O errors.
142     */
143    private void parseEntity(InputStream is) throws IOException {
144        BodyDescriptor bd = parseHeader(is);
145
146        if (bd.isMultipart()) {
147            bodyDescriptors.addFirst(bd);
148
149            handler.startMultipart(bd);
150
151            MimeBoundaryInputStream tempIs =
152                new MimeBoundaryInputStream(is, bd.getBoundary());
153            handler.preamble(new CloseShieldInputStream(tempIs));
154            tempIs.consume();
155
156            while (tempIs.hasMoreParts()) {
157                tempIs = new MimeBoundaryInputStream(is, bd.getBoundary());
158                parseBodyPart(tempIs);
159                tempIs.consume();
160                if (tempIs.parentEOF()) {
161                    if (log.isWarnEnabled()) {
162                        log.warn("Line " + rootStream.getLineNumber()
163                                + ": Body part ended prematurely. "
164                                + "Higher level boundary detected or "
165                                + "EOF reached.");
166                    }
167                    break;
168                }
169            }
170
171            handler.epilogue(new CloseShieldInputStream(is));
172
173            handler.endMultipart();
174
175            bodyDescriptors.removeFirst();
176
177        } else if (bd.isMessage()) {
178            if (bd.isBase64Encoded()) {
179                log.warn("base64 encoded message/rfc822 detected");
180                is = new EOLConvertingInputStream(
181                        new Base64InputStream(is));
182            } else if (bd.isQuotedPrintableEncoded()) {
183                log.warn("quoted-printable encoded message/rfc822 detected");
184                is = new EOLConvertingInputStream(
185                        new QuotedPrintableInputStream(is));
186            }
187            bodyDescriptors.addFirst(bd);
188            parseMessage(is);
189            bodyDescriptors.removeFirst();
190        } else {
191            handler.body(bd, new CloseShieldInputStream(is));
192        }
193
194        /*
195         * Make sure the stream has been consumed.
196         */
197        while (is.read() != -1) {
198        }
199    }
200
201    private void parseMessage(InputStream is) throws IOException {
202        if (raw) {
203            handler.raw(new CloseShieldInputStream(is));
204        } else {
205            handler.startMessage();
206            parseEntity(is);
207            handler.endMessage();
208        }
209    }
210
211    private void parseBodyPart(InputStream is) throws IOException {
212        if (raw) {
213            handler.raw(new CloseShieldInputStream(is));
214        } else {
215            handler.startBodyPart();
216            parseEntity(is);
217            handler.endBodyPart();
218        }
219    }
220
221    /**
222     * Parses a header.
223     *
224     * @param is the stream to parse.
225     * @return a <code>BodyDescriptor</code> describing the body following
226     *         the header.
227     */
228    private BodyDescriptor parseHeader(InputStream is) throws IOException {
229        BodyDescriptor bd = new BodyDescriptor(bodyDescriptors.isEmpty()
230                        ? null : (BodyDescriptor) bodyDescriptors.getFirst());
231
232        handler.startHeader();
233
234        int lineNumber = rootStream.getLineNumber();
235
236        StringBuffer sb = new StringBuffer();
237        int curr = 0;
238        int prev = 0;
239        while ((curr = is.read()) != -1) {
240            if (curr == '\n' && (prev == '\n' || prev == 0)) {
241                /*
242                 * [\r]\n[\r]\n or an immediate \r\n have been seen.
243                 */
244                sb.deleteCharAt(sb.length() - 1);
245                break;
246            }
247            sb.append((char) curr);
248            prev = curr == '\r' ? prev : curr;
249        }
250
251        if (curr == -1 && log.isWarnEnabled()) {
252            log.warn("Line " + rootStream.getLineNumber()
253                    + ": Unexpected end of headers detected. "
254                    + "Boundary detected in header or EOF reached.");
255        }
256
257        int start = 0;
258        int pos = 0;
259        int startLineNumber = lineNumber;
260        while (pos < sb.length()) {
261            while (pos < sb.length() && sb.charAt(pos) != '\r') {
262                pos++;
263            }
264            if (pos < sb.length() - 1 && sb.charAt(pos + 1) != '\n') {
265                pos++;
266                continue;
267            }
268
269            if (pos >= sb.length() - 2 || fieldChars.get(sb.charAt(pos + 2))) {
270
271                /*
272                 * field should be the complete field data excluding the
273                 * trailing \r\n.
274                 */
275                String field = sb.substring(start, pos);
276                start = pos + 2;
277
278                /*
279                 * Check for a valid field.
280                 */
281                int index = field.indexOf(':');
282                boolean valid = false;
283                if (index != -1 && fieldChars.get(field.charAt(0))) {
284                    valid = true;
285                    String fieldName = field.substring(0, index).trim();
286                    for (int i = 0; i < fieldName.length(); i++) {
287                        if (!fieldChars.get(fieldName.charAt(i))) {
288                            valid = false;
289                            break;
290                        }
291                    }
292
293                    if (valid) {
294                        handler.field(field);
295                        bd.addField(fieldName, field.substring(index + 1));
296                    }
297                }
298
299                if (!valid && log.isWarnEnabled()) {
300                    log.warn("Line " + startLineNumber
301                            + ": Ignoring invalid field: '" + field.trim() + "'");
302                }
303
304                startLineNumber = lineNumber;
305            }
306
307            pos += 2;
308            lineNumber++;
309        }
310
311        handler.endHeader();
312
313        return bd;
314    }
315
316    /**
317     * Sets the <code>ContentHandler</code> to use when reporting
318     * parsing events.
319     *
320     * @param h the <code>ContentHandler</code>.
321     */
322    public void setContentHandler(ContentHandler h) {
323        this.handler = h;
324    }
325
326}
327