MimeStreamParser.java revision 8546e21e1e127845071c595beda16fc23eb0f58e
1/**************************************************************** 2 * Licensed to the Apache Software Foundation (ASF) under one * 3 * or more contributor license agreements. See the NOTICE file * 4 * distributed with this work for additional information * 5 * regarding copyright ownership. The ASF licenses this file * 6 * to you under the Apache License, Version 2.0 (the * 7 * "License"); you may not use this file except in compliance * 8 * with the License. You may obtain a copy of the License at * 9 * * 10 * http://www.apache.org/licenses/LICENSE-2.0 * 11 * * 12 * Unless required by applicable law or agreed to in writing, * 13 * software distributed under the License is distributed on an * 14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * 15 * KIND, either express or implied. See the License for the * 16 * specific language governing permissions and limitations * 17 * under the License. * 18 ****************************************************************/ 19 20package org.apache.james.mime4j; 21 22import com.android.email.Email; 23import com.android.email.mail.transport.LoggingInputStream; 24 25import org.apache.james.mime4j.decoder.Base64InputStream; 26import org.apache.james.mime4j.decoder.QuotedPrintableInputStream; 27 28import java.io.IOException; 29import java.io.InputStream; 30import java.util.BitSet; 31import java.util.LinkedList; 32 33/** 34 * <p> 35 * Parses MIME (or RFC822) message streams of bytes or characters and reports 36 * parsing events to a <code>ContentHandler</code> instance. 37 * </p> 38 * <p> 39 * Typical usage:<br/> 40 * <pre> 41 * ContentHandler handler = new MyHandler(); 42 * MimeStreamParser parser = new MimeStreamParser(); 43 * parser.setContentHandler(handler); 44 * parser.parse(new BufferedInputStream(new FileInputStream("mime.msg"))); 45 * </pre> 46 * <strong>NOTE:</strong> All lines must end with CRLF 47 * (<code>\r\n</code>). If you are unsure of the line endings in your stream 48 * you should wrap it in a {@link org.apache.james.mime4j.EOLConvertingInputStream} instance. 49 * 50 * 51 * @version $Id: MimeStreamParser.java,v 1.8 2005/02/11 10:12:02 ntherning Exp $ 52 */ 53public class MimeStreamParser { 54 private static final Log log = LogFactory.getLog(MimeStreamParser.class); 55 56 private static final boolean DEBUG_LOG_MESSAGE = false; //DO NOT RELEASE AS 'TRUE' 57 58 private static BitSet fieldChars = null; 59 60 private RootInputStream rootStream = null; 61 private LinkedList<BodyDescriptor> bodyDescriptors = new LinkedList<BodyDescriptor>(); 62 private ContentHandler handler = null; 63 private boolean raw = false; 64 65 static { 66 fieldChars = new BitSet(); 67 for (int i = 0x21; i <= 0x39; i++) { 68 fieldChars.set(i); 69 } 70 for (int i = 0x3b; i <= 0x7e; i++) { 71 fieldChars.set(i); 72 } 73 } 74 75 /** 76 * Creates a new <code>MimeStreamParser</code> instance. 77 */ 78 public MimeStreamParser() { 79 } 80 81 /** 82 * Parses a stream of bytes containing a MIME message. 83 * 84 * @param is the stream to parse. 85 * @throws IOException on I/O errors. 86 */ 87 public void parse(InputStream is) throws IOException { 88 if (DEBUG_LOG_MESSAGE && Email.DEBUG) { 89 is = new LoggingInputStream(is, "MIME", true); 90 } 91 rootStream = new RootInputStream(is); 92 parseMessage(rootStream); 93 } 94 95 /** 96 * Determines if this parser is currently in raw mode. 97 * 98 * @return <code>true</code> if in raw mode, <code>false</code> 99 * otherwise. 100 * @see #setRaw(boolean) 101 */ 102 public boolean isRaw() { 103 return raw; 104 } 105 106 /** 107 * Enables or disables raw mode. In raw mode all future entities 108 * (messages or body parts) in the stream will be reported to the 109 * {@link ContentHandler#raw(InputStream)} handler method only. 110 * The stream will contain the entire unparsed entity contents 111 * including header fields and whatever is in the body. 112 * 113 * @param raw <code>true</code> enables raw mode, <code>false</code> 114 * disables it. 115 */ 116 public void setRaw(boolean raw) { 117 this.raw = raw; 118 } 119 120 /** 121 * Finishes the parsing and stops reading lines. 122 * NOTE: No more lines will be parsed but the parser 123 * will still call 124 * {@link ContentHandler#endMultipart()}, 125 * {@link ContentHandler#endBodyPart()}, 126 * {@link ContentHandler#endMessage()}, etc to match previous calls 127 * to 128 * {@link ContentHandler#startMultipart(BodyDescriptor)}, 129 * {@link ContentHandler#startBodyPart()}, 130 * {@link ContentHandler#startMessage()}, etc. 131 */ 132 public void stop() { 133 rootStream.truncate(); 134 } 135 136 /** 137 * Parses an entity which consists of a header followed by a body containing 138 * arbitrary data, body parts or an embedded message. 139 * 140 * @param is the stream to parse. 141 * @throws IOException on I/O errors. 142 */ 143 private void parseEntity(InputStream is) throws IOException { 144 BodyDescriptor bd = parseHeader(is); 145 146 if (bd.isMultipart()) { 147 bodyDescriptors.addFirst(bd); 148 149 handler.startMultipart(bd); 150 151 MimeBoundaryInputStream tempIs = 152 new MimeBoundaryInputStream(is, bd.getBoundary()); 153 handler.preamble(new CloseShieldInputStream(tempIs)); 154 tempIs.consume(); 155 156 while (tempIs.hasMoreParts()) { 157 tempIs = new MimeBoundaryInputStream(is, bd.getBoundary()); 158 parseBodyPart(tempIs); 159 tempIs.consume(); 160 if (tempIs.parentEOF()) { 161 if (log.isWarnEnabled()) { 162 log.warn("Line " + rootStream.getLineNumber() 163 + ": Body part ended prematurely. " 164 + "Higher level boundary detected or " 165 + "EOF reached."); 166 } 167 break; 168 } 169 } 170 171 handler.epilogue(new CloseShieldInputStream(is)); 172 173 handler.endMultipart(); 174 175 bodyDescriptors.removeFirst(); 176 177 } else if (bd.isMessage()) { 178 if (bd.isBase64Encoded()) { 179 log.warn("base64 encoded message/rfc822 detected"); 180 is = new EOLConvertingInputStream( 181 new Base64InputStream(is)); 182 } else if (bd.isQuotedPrintableEncoded()) { 183 log.warn("quoted-printable encoded message/rfc822 detected"); 184 is = new EOLConvertingInputStream( 185 new QuotedPrintableInputStream(is)); 186 } 187 bodyDescriptors.addFirst(bd); 188 parseMessage(is); 189 bodyDescriptors.removeFirst(); 190 } else { 191 handler.body(bd, new CloseShieldInputStream(is)); 192 } 193 194 /* 195 * Make sure the stream has been consumed. 196 */ 197 while (is.read() != -1) { 198 } 199 } 200 201 private void parseMessage(InputStream is) throws IOException { 202 if (raw) { 203 handler.raw(new CloseShieldInputStream(is)); 204 } else { 205 handler.startMessage(); 206 parseEntity(is); 207 handler.endMessage(); 208 } 209 } 210 211 private void parseBodyPart(InputStream is) throws IOException { 212 if (raw) { 213 handler.raw(new CloseShieldInputStream(is)); 214 } else { 215 handler.startBodyPart(); 216 parseEntity(is); 217 handler.endBodyPart(); 218 } 219 } 220 221 /** 222 * Parses a header. 223 * 224 * @param is the stream to parse. 225 * @return a <code>BodyDescriptor</code> describing the body following 226 * the header. 227 */ 228 private BodyDescriptor parseHeader(InputStream is) throws IOException { 229 BodyDescriptor bd = new BodyDescriptor(bodyDescriptors.isEmpty() 230 ? null : (BodyDescriptor) bodyDescriptors.getFirst()); 231 232 handler.startHeader(); 233 234 int lineNumber = rootStream.getLineNumber(); 235 236 StringBuffer sb = new StringBuffer(); 237 int curr = 0; 238 int prev = 0; 239 while ((curr = is.read()) != -1) { 240 if (curr == '\n' && (prev == '\n' || prev == 0)) { 241 /* 242 * [\r]\n[\r]\n or an immediate \r\n have been seen. 243 */ 244 sb.deleteCharAt(sb.length() - 1); 245 break; 246 } 247 sb.append((char) curr); 248 prev = curr == '\r' ? prev : curr; 249 } 250 251 if (curr == -1 && log.isWarnEnabled()) { 252 log.warn("Line " + rootStream.getLineNumber() 253 + ": Unexpected end of headers detected. " 254 + "Boundary detected in header or EOF reached."); 255 } 256 257 int start = 0; 258 int pos = 0; 259 int startLineNumber = lineNumber; 260 while (pos < sb.length()) { 261 while (pos < sb.length() && sb.charAt(pos) != '\r') { 262 pos++; 263 } 264 if (pos < sb.length() - 1 && sb.charAt(pos + 1) != '\n') { 265 pos++; 266 continue; 267 } 268 269 if (pos >= sb.length() - 2 || fieldChars.get(sb.charAt(pos + 2))) { 270 271 /* 272 * field should be the complete field data excluding the 273 * trailing \r\n. 274 */ 275 String field = sb.substring(start, pos); 276 start = pos + 2; 277 278 /* 279 * Check for a valid field. 280 */ 281 int index = field.indexOf(':'); 282 boolean valid = false; 283 if (index != -1 && fieldChars.get(field.charAt(0))) { 284 valid = true; 285 String fieldName = field.substring(0, index).trim(); 286 for (int i = 0; i < fieldName.length(); i++) { 287 if (!fieldChars.get(fieldName.charAt(i))) { 288 valid = false; 289 break; 290 } 291 } 292 293 if (valid) { 294 handler.field(field); 295 bd.addField(fieldName, field.substring(index + 1)); 296 } 297 } 298 299 if (!valid && log.isWarnEnabled()) { 300 log.warn("Line " + startLineNumber 301 + ": Ignoring invalid field: '" + field.trim() + "'"); 302 } 303 304 startLineNumber = lineNumber; 305 } 306 307 pos += 2; 308 lineNumber++; 309 } 310 311 handler.endHeader(); 312 313 return bd; 314 } 315 316 /** 317 * Sets the <code>ContentHandler</code> to use when reporting 318 * parsing events. 319 * 320 * @param h the <code>ContentHandler</code>. 321 */ 322 public void setContentHandler(ContentHandler h) { 323 this.handler = h; 324 } 325 326} 327