1/**************************************************************** 2 * Licensed to the Apache Software Foundation (ASF) under one * 3 * or more contributor license agreements. See the NOTICE file * 4 * distributed with this work for additional information * 5 * regarding copyright ownership. The ASF licenses this file * 6 * to you under the Apache License, Version 2.0 (the * 7 * "License"); you may not use this file except in compliance * 8 * with the License. You may obtain a copy of the License at * 9 * * 10 * http://www.apache.org/licenses/LICENSE-2.0 * 11 * * 12 * Unless required by applicable law or agreed to in writing, * 13 * software distributed under the License is distributed on an * 14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * 15 * KIND, either express or implied. See the License for the * 16 * specific language governing permissions and limitations * 17 * under the License. * 18 ****************************************************************/ 19 20package org.apache.james.mime4j; 21 22import org.apache.james.mime4j.decoder.Base64InputStream; 23import org.apache.james.mime4j.decoder.QuotedPrintableInputStream; 24 25import java.io.IOException; 26import java.io.InputStream; 27import java.util.BitSet; 28import java.util.LinkedList; 29 30/** 31 * <p> 32 * Parses MIME (or RFC822) message streams of bytes or characters and reports 33 * parsing events to a <code>ContentHandler</code> instance. 34 * </p> 35 * <p> 36 * Typical usage:<br/> 37 * <pre> 38 * ContentHandler handler = new MyHandler(); 39 * MimeStreamParser parser = new MimeStreamParser(); 40 * parser.setContentHandler(handler); 41 * parser.parse(new BufferedInputStream(new FileInputStream("mime.msg"))); 42 * </pre> 43 * <strong>NOTE:</strong> All lines must end with CRLF 44 * (<code>\r\n</code>). If you are unsure of the line endings in your stream 45 * you should wrap it in a {@link org.apache.james.mime4j.EOLConvertingInputStream} instance. 46 * 47 * 48 * @version $Id: MimeStreamParser.java,v 1.8 2005/02/11 10:12:02 ntherning Exp $ 49 */ 50public class MimeStreamParser { 51 private static final Log log = LogFactory.getLog(MimeStreamParser.class); 52 53 private static BitSet fieldChars = null; 54 55 private RootInputStream rootStream = null; 56 private LinkedList<BodyDescriptor> bodyDescriptors = new LinkedList<BodyDescriptor>(); 57 private ContentHandler handler = null; 58 private boolean raw = false; 59 private boolean prematureEof = false; 60 61 static { 62 fieldChars = new BitSet(); 63 for (int i = 0x21; i <= 0x39; i++) { 64 fieldChars.set(i); 65 } 66 for (int i = 0x3b; i <= 0x7e; i++) { 67 fieldChars.set(i); 68 } 69 } 70 71 /** 72 * Creates a new <code>MimeStreamParser</code> instance. 73 */ 74 public MimeStreamParser() { 75 } 76 77 /** 78 * Parses a stream of bytes containing a MIME message. 79 * 80 * @param is the stream to parse. 81 * @throws IOException on I/O errors. 82 */ 83 public void parse(InputStream is) throws IOException { 84 rootStream = new RootInputStream(is); 85 parseMessage(rootStream); 86 } 87 88 /** 89 * Determines if this parser is currently in raw mode. 90 * 91 * @return <code>true</code> if in raw mode, <code>false</code> 92 * otherwise. 93 * @see #setRaw(boolean) 94 */ 95 public boolean isRaw() { 96 return raw; 97 } 98 99 /** 100 * Enables or disables raw mode. In raw mode all future entities 101 * (messages or body parts) in the stream will be reported to the 102 * {@link ContentHandler#raw(InputStream)} handler method only. 103 * The stream will contain the entire unparsed entity contents 104 * including header fields and whatever is in the body. 105 * 106 * @param raw <code>true</code> enables raw mode, <code>false</code> 107 * disables it. 108 */ 109 public void setRaw(boolean raw) { 110 this.raw = raw; 111 } 112 113 /** 114 * Finishes the parsing and stops reading lines. 115 * NOTE: No more lines will be parsed but the parser 116 * will still call 117 * {@link ContentHandler#endMultipart()}, 118 * {@link ContentHandler#endBodyPart()}, 119 * {@link ContentHandler#endMessage()}, etc to match previous calls 120 * to 121 * {@link ContentHandler#startMultipart(BodyDescriptor)}, 122 * {@link ContentHandler#startBodyPart()}, 123 * {@link ContentHandler#startMessage()}, etc. 124 */ 125 public void stop() { 126 rootStream.truncate(); 127 } 128 129 /** 130 * Parses an entity which consists of a header followed by a body containing 131 * arbitrary data, body parts or an embedded message. 132 * 133 * @param is the stream to parse. 134 * @throws IOException on I/O errors. 135 */ 136 private void parseEntity(InputStream is) throws IOException { 137 BodyDescriptor bd = parseHeader(is); 138 139 if (bd.isMultipart()) { 140 bodyDescriptors.addFirst(bd); 141 142 handler.startMultipart(bd); 143 144 MimeBoundaryInputStream tempIs = 145 new MimeBoundaryInputStream(is, bd.getBoundary()); 146 handler.preamble(new CloseShieldInputStream(tempIs)); 147 tempIs.consume(); 148 149 while (tempIs.hasMoreParts()) { 150 tempIs = new MimeBoundaryInputStream(is, bd.getBoundary()); 151 parseBodyPart(tempIs); 152 tempIs.consume(); 153 if (tempIs.parentEOF()) { 154 prematureEof = true; 155// if (log.isWarnEnabled()) { 156// log.warn("Line " + rootStream.getLineNumber() 157// + ": Body part ended prematurely. " 158// + "Higher level boundary detected or " 159// + "EOF reached."); 160// } 161 break; 162 } 163 } 164 165 handler.epilogue(new CloseShieldInputStream(is)); 166 167 handler.endMultipart(); 168 169 bodyDescriptors.removeFirst(); 170 171 } else if (bd.isMessage()) { 172 if (bd.isBase64Encoded()) { 173 log.warn("base64 encoded message/rfc822 detected"); 174 is = new EOLConvertingInputStream( 175 new Base64InputStream(is)); 176 } else if (bd.isQuotedPrintableEncoded()) { 177 log.warn("quoted-printable encoded message/rfc822 detected"); 178 is = new EOLConvertingInputStream( 179 new QuotedPrintableInputStream(is)); 180 } 181 bodyDescriptors.addFirst(bd); 182 parseMessage(is); 183 bodyDescriptors.removeFirst(); 184 } else { 185 handler.body(bd, new CloseShieldInputStream(is)); 186 } 187 188 /* 189 * Make sure the stream has been consumed. 190 */ 191 while (is.read() != -1) { 192 } 193 } 194 195 private void parseMessage(InputStream is) throws IOException { 196 if (raw) { 197 handler.raw(new CloseShieldInputStream(is)); 198 } else { 199 handler.startMessage(); 200 parseEntity(is); 201 handler.endMessage(); 202 } 203 } 204 205 public boolean getPrematureEof() { 206 return prematureEof; 207 } 208 209 private void parseBodyPart(InputStream is) throws IOException { 210 if (raw) { 211 handler.raw(new CloseShieldInputStream(is)); 212 } else { 213 handler.startBodyPart(); 214 parseEntity(is); 215 handler.endBodyPart(); 216 } 217 } 218 219 /** 220 * Parses a header. 221 * 222 * @param is the stream to parse. 223 * @return a <code>BodyDescriptor</code> describing the body following 224 * the header. 225 */ 226 private BodyDescriptor parseHeader(InputStream is) throws IOException { 227 BodyDescriptor bd = new BodyDescriptor(bodyDescriptors.isEmpty() 228 ? null : (BodyDescriptor) bodyDescriptors.getFirst()); 229 230 handler.startHeader(); 231 232 int lineNumber = rootStream.getLineNumber(); 233 234 StringBuffer sb = new StringBuffer(); 235 int curr = 0; 236 int prev = 0; 237 while ((curr = is.read()) != -1) { 238 if (curr == '\n' && (prev == '\n' || prev == 0)) { 239 /* 240 * [\r]\n[\r]\n or an immediate \r\n have been seen. 241 */ 242 sb.deleteCharAt(sb.length() - 1); 243 break; 244 } 245 sb.append((char) curr); 246 prev = curr == '\r' ? prev : curr; 247 } 248 249// if (curr == -1 && log.isWarnEnabled()) { 250// log.warn("Line " + rootStream.getLineNumber() 251// + ": Unexpected end of headers detected. " 252// + "Boundary detected in header or EOF reached."); 253// } 254 255 int start = 0; 256 int pos = 0; 257 int startLineNumber = lineNumber; 258 while (pos < sb.length()) { 259 while (pos < sb.length() && sb.charAt(pos) != '\r') { 260 pos++; 261 } 262 if (pos < sb.length() - 1 && sb.charAt(pos + 1) != '\n') { 263 pos++; 264 continue; 265 } 266 267 if (pos >= sb.length() - 2 || fieldChars.get(sb.charAt(pos + 2))) { 268 269 /* 270 * field should be the complete field data excluding the 271 * trailing \r\n. 272 */ 273 String field = sb.substring(start, pos); 274 start = pos + 2; 275 276 /* 277 * Check for a valid field. 278 */ 279 int index = field.indexOf(':'); 280 boolean valid = false; 281 if (index != -1 && fieldChars.get(field.charAt(0))) { 282 valid = true; 283 String fieldName = field.substring(0, index).trim(); 284 for (int i = 0; i < fieldName.length(); i++) { 285 if (!fieldChars.get(fieldName.charAt(i))) { 286 valid = false; 287 break; 288 } 289 } 290 291 if (valid) { 292 handler.field(field); 293 bd.addField(fieldName, field.substring(index + 1)); 294 } 295 } 296 297 if (!valid && log.isWarnEnabled()) { 298 log.warn("Line " + startLineNumber 299 + ": Ignoring invalid field: '" + field.trim() + "'"); 300 } 301 302 startLineNumber = lineNumber; 303 } 304 305 pos += 2; 306 lineNumber++; 307 } 308 309 handler.endHeader(); 310 311 return bd; 312 } 313 314 /** 315 * Sets the <code>ContentHandler</code> to use when reporting 316 * parsing events. 317 * 318 * @param h the <code>ContentHandler</code>. 319 */ 320 public void setContentHandler(ContentHandler h) { 321 this.handler = h; 322 } 323 324}