1 /****************************************************************
2  * Licensed to the Apache Software Foundation (ASF) under one   *
3  * or more contributor license agreements.  See the NOTICE file *
4  * distributed with this work for additional information        *
5  * regarding copyright ownership.  The ASF licenses this file   *
6  * to you under the Apache License, Version 2.0 (the            *
7  * "License"); you may not use this file except in compliance   *
8  * with the License.  You may obtain a copy of the License at   *
9  *                                                              *
10  *   http://www.apache.org/licenses/LICENSE-2.0                 *
11  *                                                              *
12  * Unless required by applicable law or agreed to in writing,   *
13  * software distributed under the License is distributed on an  *
14  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
15  * KIND, either express or implied.  See the License for the    *
16  * specific language governing permissions and limitations      *
17  * under the License.                                           *
18  ****************************************************************/
19 
20 package org.apache.james.mime4j;
21 
22 import org.apache.james.mime4j.decoder.Base64InputStream;
23 import org.apache.james.mime4j.decoder.QuotedPrintableInputStream;
24 
25 import java.io.IOException;
26 import java.io.InputStream;
27 import java.util.BitSet;
28 import java.util.LinkedList;
29 
30 /**
31  * <p>
32  * Parses MIME (or RFC822) message streams of bytes or characters and reports
33  * parsing events to a <code>ContentHandler</code> instance.
34  * </p>
35  * <p>
36  * Typical usage:<br/>
37  * <pre>
38  *      ContentHandler handler = new MyHandler();
39  *      MimeStreamParser parser = new MimeStreamParser();
40  *      parser.setContentHandler(handler);
41  *      parser.parse(new BufferedInputStream(new FileInputStream("mime.msg")));
42  * </pre>
43  * <strong>NOTE:</strong> All lines must end with CRLF
44  * (<code>\r\n</code>). If you are unsure of the line endings in your stream
45  * you should wrap it in a {@link org.apache.james.mime4j.EOLConvertingInputStream} instance.
46  *
47  *
48  * @version $Id: MimeStreamParser.java,v 1.8 2005/02/11 10:12:02 ntherning Exp $
49  */
50 public class MimeStreamParser {
51     private static final Log log = LogFactory.getLog(MimeStreamParser.class);
52 
53     private static BitSet fieldChars = null;
54 
55     private RootInputStream rootStream = null;
56     private LinkedList<BodyDescriptor> bodyDescriptors = new LinkedList<BodyDescriptor>();
57     private ContentHandler handler = null;
58     private boolean raw = false;
59     private boolean prematureEof = false;
60 
61     static {
62         fieldChars = new BitSet();
63         for (int i = 0x21; i <= 0x39; i++) {
64             fieldChars.set(i);
65         }
66         for (int i = 0x3b; i <= 0x7e; i++) {
67             fieldChars.set(i);
68         }
69     }
70 
71     /**
72      * Creates a new <code>MimeStreamParser</code> instance.
73      */
MimeStreamParser()74     public MimeStreamParser() {
75     }
76 
77     /**
78      * Parses a stream of bytes containing a MIME message.
79      *
80      * @param is the stream to parse.
81      * @throws IOException on I/O errors.
82      */
parse(InputStream is)83     public void parse(InputStream is) throws IOException {
84         rootStream = new RootInputStream(is);
85         parseMessage(rootStream);
86     }
87 
88     /**
89      * Determines if this parser is currently in raw mode.
90      *
91      * @return <code>true</code> if in raw mode, <code>false</code>
92      *         otherwise.
93      * @see #setRaw(boolean)
94      */
isRaw()95     public boolean isRaw() {
96         return raw;
97     }
98 
99     /**
100      * Enables or disables raw mode. In raw mode all future entities
101      * (messages or body parts) in the stream will be reported to the
102      * {@link ContentHandler#raw(InputStream)} handler method only.
103      * The stream will contain the entire unparsed entity contents
104      * including header fields and whatever is in the body.
105      *
106      * @param raw <code>true</code> enables raw mode, <code>false</code>
107      *        disables it.
108      */
setRaw(boolean raw)109     public void setRaw(boolean raw) {
110         this.raw = raw;
111     }
112 
113     /**
114      * Finishes the parsing and stops reading lines.
115      * NOTE: No more lines will be parsed but the parser
116      * will still call
117      * {@link ContentHandler#endMultipart()},
118      * {@link ContentHandler#endBodyPart()},
119      * {@link ContentHandler#endMessage()}, etc to match previous calls
120      * to
121      * {@link ContentHandler#startMultipart(BodyDescriptor)},
122      * {@link ContentHandler#startBodyPart()},
123      * {@link ContentHandler#startMessage()}, etc.
124      */
stop()125     public void stop() {
126         rootStream.truncate();
127     }
128 
129     /**
130      * Parses an entity which consists of a header followed by a body containing
131      * arbitrary data, body parts or an embedded message.
132      *
133      * @param is the stream to parse.
134      * @throws IOException on I/O errors.
135      */
parseEntity(InputStream is)136     private void parseEntity(InputStream is) throws IOException {
137         BodyDescriptor bd = parseHeader(is);
138 
139         if (bd.isMultipart()) {
140             bodyDescriptors.addFirst(bd);
141 
142             handler.startMultipart(bd);
143 
144             MimeBoundaryInputStream tempIs =
145                 new MimeBoundaryInputStream(is, bd.getBoundary());
146             handler.preamble(new CloseShieldInputStream(tempIs));
147             tempIs.consume();
148 
149             while (tempIs.hasMoreParts()) {
150                 tempIs = new MimeBoundaryInputStream(is, bd.getBoundary());
151                 parseBodyPart(tempIs);
152                 tempIs.consume();
153                 if (tempIs.parentEOF()) {
154                     prematureEof = true;
155 //                    if (log.isWarnEnabled()) {
156 //                        log.warn("Line " + rootStream.getLineNumber()
157 //                                + ": Body part ended prematurely. "
158 //                                + "Higher level boundary detected or "
159 //                                + "EOF reached.");
160 //                    }
161                     break;
162                 }
163             }
164 
165             handler.epilogue(new CloseShieldInputStream(is));
166 
167             handler.endMultipart();
168 
169             bodyDescriptors.removeFirst();
170 
171         } else if (bd.isMessage()) {
172             if (bd.isBase64Encoded()) {
173                 log.warn("base64 encoded message/rfc822 detected");
174                 is = new EOLConvertingInputStream(
175                         new Base64InputStream(is));
176             } else if (bd.isQuotedPrintableEncoded()) {
177                 log.warn("quoted-printable encoded message/rfc822 detected");
178                 is = new EOLConvertingInputStream(
179                         new QuotedPrintableInputStream(is));
180             }
181             bodyDescriptors.addFirst(bd);
182             parseMessage(is);
183             bodyDescriptors.removeFirst();
184         } else {
185             handler.body(bd, new CloseShieldInputStream(is));
186         }
187 
188         /*
189          * Make sure the stream has been consumed.
190          */
191         while (is.read() != -1) {
192         }
193     }
194 
parseMessage(InputStream is)195     private void parseMessage(InputStream is) throws IOException {
196         if (raw) {
197             handler.raw(new CloseShieldInputStream(is));
198         } else {
199             handler.startMessage();
200             parseEntity(is);
201             handler.endMessage();
202         }
203     }
204 
getPrematureEof()205     public boolean getPrematureEof() {
206         return prematureEof;
207     }
208 
parseBodyPart(InputStream is)209     private void parseBodyPart(InputStream is) throws IOException {
210         if (raw) {
211             handler.raw(new CloseShieldInputStream(is));
212         } else {
213             handler.startBodyPart();
214             parseEntity(is);
215             handler.endBodyPart();
216         }
217     }
218 
219     /**
220      * Parses a header.
221      *
222      * @param is the stream to parse.
223      * @return a <code>BodyDescriptor</code> describing the body following
224      *         the header.
225      */
parseHeader(InputStream is)226     private BodyDescriptor parseHeader(InputStream is) throws IOException {
227         BodyDescriptor bd = new BodyDescriptor(bodyDescriptors.isEmpty()
228                         ? null : (BodyDescriptor) bodyDescriptors.getFirst());
229 
230         handler.startHeader();
231 
232         int lineNumber = rootStream.getLineNumber();
233 
234         StringBuffer sb = new StringBuffer();
235         int curr = 0;
236         int prev = 0;
237         while ((curr = is.read()) != -1) {
238             if (curr == '\n' && (prev == '\n' || prev == 0)) {
239                 /*
240                  * [\r]\n[\r]\n or an immediate \r\n have been seen.
241                  */
242                 sb.deleteCharAt(sb.length() - 1);
243                 break;
244             }
245             sb.append((char) curr);
246             prev = curr == '\r' ? prev : curr;
247         }
248 
249 //        if (curr == -1 && log.isWarnEnabled()) {
250 //            log.warn("Line " + rootStream.getLineNumber()
251 //                    + ": Unexpected end of headers detected. "
252 //                    + "Boundary detected in header or EOF reached.");
253 //        }
254 
255         int start = 0;
256         int pos = 0;
257         int startLineNumber = lineNumber;
258         while (pos < sb.length()) {
259             while (pos < sb.length() && sb.charAt(pos) != '\r') {
260                 pos++;
261             }
262             if (pos < sb.length() - 1 && sb.charAt(pos + 1) != '\n') {
263                 pos++;
264                 continue;
265             }
266 
267             if (pos >= sb.length() - 2 || fieldChars.get(sb.charAt(pos + 2))) {
268 
269                 /*
270                  * field should be the complete field data excluding the
271                  * trailing \r\n.
272                  */
273                 String field = sb.substring(start, pos);
274                 start = pos + 2;
275 
276                 /*
277                  * Check for a valid field.
278                  */
279                 int index = field.indexOf(':');
280                 boolean valid = false;
281                 if (index != -1 && fieldChars.get(field.charAt(0))) {
282                     valid = true;
283                     String fieldName = field.substring(0, index).trim();
284                     for (int i = 0; i < fieldName.length(); i++) {
285                         if (!fieldChars.get(fieldName.charAt(i))) {
286                             valid = false;
287                             break;
288                         }
289                     }
290 
291                     if (valid) {
292                         handler.field(field);
293                         bd.addField(fieldName, field.substring(index + 1));
294                     }
295                 }
296 
297                 if (!valid && log.isWarnEnabled()) {
298                     log.warn("Line " + startLineNumber
299                             + ": Ignoring invalid field: '" + field.trim() + "'");
300                 }
301 
302                 startLineNumber = lineNumber;
303             }
304 
305             pos += 2;
306             lineNumber++;
307         }
308 
309         handler.endHeader();
310 
311         return bd;
312     }
313 
314     /**
315      * Sets the <code>ContentHandler</code> to use when reporting
316      * parsing events.
317      *
318      * @param h the <code>ContentHandler</code>.
319      */
setContentHandler(ContentHandler h)320     public void setContentHandler(ContentHandler h) {
321         this.handler = h;
322     }
323 
324 }
325