1/****************************************************************
2 * Licensed to the Apache Software Foundation (ASF) under one   *
3 * or more contributor license agreements.  See the NOTICE file *
4 * distributed with this work for additional information        *
5 * regarding copyright ownership.  The ASF licenses this file   *
6 * to you under the Apache License, Version 2.0 (the            *
7 * "License"); you may not use this file except in compliance   *
8 * with the License.  You may obtain a copy of the License at   *
9 *                                                              *
10 *   http://www.apache.org/licenses/LICENSE-2.0                 *
11 *                                                              *
12 * Unless required by applicable law or agreed to in writing,   *
13 * software distributed under the License is distributed on an  *
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
15 * KIND, either express or implied.  See the License for the    *
16 * specific language governing permissions and limitations      *
17 * under the License.                                           *
18 ****************************************************************/
19
20package org.apache.james.mime4j;
21
22import org.apache.james.mime4j.decoder.Base64InputStream;
23import org.apache.james.mime4j.decoder.QuotedPrintableInputStream;
24
25import java.io.IOException;
26import java.io.InputStream;
27import java.util.BitSet;
28import java.util.LinkedList;
29
30/**
31 * <p>
32 * Parses MIME (or RFC822) message streams of bytes or characters and reports
33 * parsing events to a <code>ContentHandler</code> instance.
34 * </p>
 * <p>
 * Typical usage:
 * </p>
 * <pre>
 *      ContentHandler handler = new MyHandler();
 *      MimeStreamParser parser = new MimeStreamParser();
 *      parser.setContentHandler(handler);
 *      parser.parse(new BufferedInputStream(new FileInputStream("mime.msg")));
 * </pre>
 * <p>
 * <strong>NOTE:</strong> All lines must end with CRLF
 * (<code>\r\n</code>). If you are unsure of the line endings in your stream
 * you should wrap it in a {@link org.apache.james.mime4j.EOLConvertingInputStream} instance.
 * </p>
46 *
47 *
48 * @version $Id: MimeStreamParser.java,v 1.8 2005/02/11 10:12:02 ntherning Exp $
49 */
50public class MimeStreamParser {
51    private static final Log log = LogFactory.getLog(MimeStreamParser.class);
52
53    private static BitSet fieldChars = null;
54
55    private RootInputStream rootStream = null;
56    private LinkedList<BodyDescriptor> bodyDescriptors = new LinkedList<BodyDescriptor>();
57    private ContentHandler handler = null;
58    private boolean raw = false;
59    private boolean prematureEof = false;
60
61    static {
62        fieldChars = new BitSet();
63        for (int i = 0x21; i <= 0x39; i++) {
64            fieldChars.set(i);
65        }
66        for (int i = 0x3b; i <= 0x7e; i++) {
67            fieldChars.set(i);
68        }
69    }
70
71    /**
72     * Creates a new <code>MimeStreamParser</code> instance.
73     */
74    public MimeStreamParser() {
75    }
76
77    /**
78     * Parses a stream of bytes containing a MIME message.
79     *
80     * @param is the stream to parse.
81     * @throws IOException on I/O errors.
82     */
83    public void parse(InputStream is) throws IOException {
84        rootStream = new RootInputStream(is);
85        parseMessage(rootStream);
86    }
87
88    /**
89     * Determines if this parser is currently in raw mode.
90     *
91     * @return <code>true</code> if in raw mode, <code>false</code>
92     *         otherwise.
93     * @see #setRaw(boolean)
94     */
95    public boolean isRaw() {
96        return raw;
97    }
98
99    /**
100     * Enables or disables raw mode. In raw mode all future entities
101     * (messages or body parts) in the stream will be reported to the
102     * {@link ContentHandler#raw(InputStream)} handler method only.
103     * The stream will contain the entire unparsed entity contents
104     * including header fields and whatever is in the body.
105     *
106     * @param raw <code>true</code> enables raw mode, <code>false</code>
107     *        disables it.
108     */
109    public void setRaw(boolean raw) {
110        this.raw = raw;
111    }
112
113    /**
114     * Finishes the parsing and stops reading lines.
115     * NOTE: No more lines will be parsed but the parser
116     * will still call
117     * {@link ContentHandler#endMultipart()},
118     * {@link ContentHandler#endBodyPart()},
119     * {@link ContentHandler#endMessage()}, etc to match previous calls
120     * to
121     * {@link ContentHandler#startMultipart(BodyDescriptor)},
122     * {@link ContentHandler#startBodyPart()},
123     * {@link ContentHandler#startMessage()}, etc.
124     */
125    public void stop() {
126        rootStream.truncate();
127    }
128
129    /**
130     * Parses an entity which consists of a header followed by a body containing
131     * arbitrary data, body parts or an embedded message.
132     *
133     * @param is the stream to parse.
134     * @throws IOException on I/O errors.
135     */
136    private void parseEntity(InputStream is) throws IOException {
137        BodyDescriptor bd = parseHeader(is);
138
139        if (bd.isMultipart()) {
140            bodyDescriptors.addFirst(bd);
141
142            handler.startMultipart(bd);
143
144            MimeBoundaryInputStream tempIs =
145                new MimeBoundaryInputStream(is, bd.getBoundary());
146            handler.preamble(new CloseShieldInputStream(tempIs));
147            tempIs.consume();
148
149            while (tempIs.hasMoreParts()) {
150                tempIs = new MimeBoundaryInputStream(is, bd.getBoundary());
151                parseBodyPart(tempIs);
152                tempIs.consume();
153                if (tempIs.parentEOF()) {
154                    prematureEof = true;
155//                    if (log.isWarnEnabled()) {
156//                        log.warn("Line " + rootStream.getLineNumber()
157//                                + ": Body part ended prematurely. "
158//                                + "Higher level boundary detected or "
159//                                + "EOF reached.");
160//                    }
161                    break;
162                }
163            }
164
165            handler.epilogue(new CloseShieldInputStream(is));
166
167            handler.endMultipart();
168
169            bodyDescriptors.removeFirst();
170
171        } else if (bd.isMessage()) {
172            if (bd.isBase64Encoded()) {
173                log.warn("base64 encoded message/rfc822 detected");
174                is = new EOLConvertingInputStream(
175                        new Base64InputStream(is));
176            } else if (bd.isQuotedPrintableEncoded()) {
177                log.warn("quoted-printable encoded message/rfc822 detected");
178                is = new EOLConvertingInputStream(
179                        new QuotedPrintableInputStream(is));
180            }
181            bodyDescriptors.addFirst(bd);
182            parseMessage(is);
183            bodyDescriptors.removeFirst();
184        } else {
185            handler.body(bd, new CloseShieldInputStream(is));
186        }
187
188        /*
189         * Make sure the stream has been consumed.
190         */
191        while (is.read() != -1) {
192        }
193    }
194
195    private void parseMessage(InputStream is) throws IOException {
196        if (raw) {
197            handler.raw(new CloseShieldInputStream(is));
198        } else {
199            handler.startMessage();
200            parseEntity(is);
201            handler.endMessage();
202        }
203    }
204
205    public boolean getPrematureEof() {
206        return prematureEof;
207    }
208
209    private void parseBodyPart(InputStream is) throws IOException {
210        if (raw) {
211            handler.raw(new CloseShieldInputStream(is));
212        } else {
213            handler.startBodyPart();
214            parseEntity(is);
215            handler.endBodyPart();
216        }
217    }
218
219    /**
220     * Parses a header.
221     *
222     * @param is the stream to parse.
223     * @return a <code>BodyDescriptor</code> describing the body following
224     *         the header.
225     */
226    private BodyDescriptor parseHeader(InputStream is) throws IOException {
227        BodyDescriptor bd = new BodyDescriptor(bodyDescriptors.isEmpty()
228                        ? null : (BodyDescriptor) bodyDescriptors.getFirst());
229
230        handler.startHeader();
231
232        int lineNumber = rootStream.getLineNumber();
233
234        StringBuffer sb = new StringBuffer();
235        int curr = 0;
236        int prev = 0;
237        while ((curr = is.read()) != -1) {
238            if (curr == '\n' && (prev == '\n' || prev == 0)) {
239                /*
240                 * [\r]\n[\r]\n or an immediate \r\n have been seen.
241                 */
242                sb.deleteCharAt(sb.length() - 1);
243                break;
244            }
245            sb.append((char) curr);
246            prev = curr == '\r' ? prev : curr;
247        }
248
249//        if (curr == -1 && log.isWarnEnabled()) {
250//            log.warn("Line " + rootStream.getLineNumber()
251//                    + ": Unexpected end of headers detected. "
252//                    + "Boundary detected in header or EOF reached.");
253//        }
254
255        int start = 0;
256        int pos = 0;
257        int startLineNumber = lineNumber;
258        while (pos < sb.length()) {
259            while (pos < sb.length() && sb.charAt(pos) != '\r') {
260                pos++;
261            }
262            if (pos < sb.length() - 1 && sb.charAt(pos + 1) != '\n') {
263                pos++;
264                continue;
265            }
266
267            if (pos >= sb.length() - 2 || fieldChars.get(sb.charAt(pos + 2))) {
268
269                /*
270                 * field should be the complete field data excluding the
271                 * trailing \r\n.
272                 */
273                String field = sb.substring(start, pos);
274                start = pos + 2;
275
276                /*
277                 * Check for a valid field.
278                 */
279                int index = field.indexOf(':');
280                boolean valid = false;
281                if (index != -1 && fieldChars.get(field.charAt(0))) {
282                    valid = true;
283                    String fieldName = field.substring(0, index).trim();
284                    for (int i = 0; i < fieldName.length(); i++) {
285                        if (!fieldChars.get(fieldName.charAt(i))) {
286                            valid = false;
287                            break;
288                        }
289                    }
290
291                    if (valid) {
292                        handler.field(field);
293                        bd.addField(fieldName, field.substring(index + 1));
294                    }
295                }
296
297                if (!valid && log.isWarnEnabled()) {
298                    log.warn("Line " + startLineNumber
299                            + ": Ignoring invalid field: '" + field.trim() + "'");
300                }
301
302                startLineNumber = lineNumber;
303            }
304
305            pos += 2;
306            lineNumber++;
307        }
308
309        handler.endHeader();
310
311        return bd;
312    }
313
314    /**
315     * Sets the <code>ContentHandler</code> to use when reporting
316     * parsing events.
317     *
318     * @param h the <code>ContentHandler</code>.
319     */
320    public void setContentHandler(ContentHandler h) {
321        this.handler = h;
322    }
323
324}