View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.parser.mbox;
18  
19  import java.io.BufferedReader;
20  import java.io.IOException;
21  import java.io.InputStream;
22  import java.io.InputStreamReader;
23  import java.io.UnsupportedEncodingException;
24  import java.util.Collections;
25  import java.util.Set;
26  import java.util.regex.Matcher;
27  import java.util.regex.Pattern;
28  
29  import org.apache.log4j.Logger;
30  import org.apache.tika.exception.TikaException;
31  import org.apache.tika.metadata.Metadata;
32  import org.apache.tika.mime.MediaType;
33  import org.apache.tika.parser.ParseContext;
34  import org.apache.tika.parser.Parser;
35  import org.apache.tika.sax.XHTMLContentHandler;
36  import org.xml.sax.ContentHandler;
37  import org.xml.sax.SAXException;
38  
39  /**
40   * Mbox (mailbox) parser. This version returns the headers for the first email
41   * via metadata, which means headers from subsequent emails will be lost.
42   */
43  public class MboxParser implements Parser {
44  
45      private static final Logger LOGGER = Logger.getLogger(MboxParser.class);
46  
47      private static final Set<MediaType> SUPPORTED_TYPES =
48          Collections.singleton(MediaType.application("mbox"));
49  
50      public static final String MBOX_MIME_TYPE = "application/mbox";
51      public static final String MBOX_RECORD_DIVIDER = "From ";
52      private static final Pattern EMAIL_HEADER_PATTERN = Pattern.compile("([^ ]+):[ \t]*(.*)");
53  
54      private static final String EMAIL_HEADER_METADATA_PREFIX = MboxParser.class.getSimpleName() + "-";
55      private static final String EMAIL_FROMLINE_METADATA = EMAIL_HEADER_METADATA_PREFIX + "from";
56  
57      private enum ParseStates {
58          START, IN_HEADER, IN_CONTENT
59      }
60  
61      public Set<MediaType> getSupportedTypes(ParseContext context) {
62          return SUPPORTED_TYPES;
63      }
64  
65      public void parse(
66              InputStream stream, ContentHandler handler,
67              Metadata metadata, ParseContext context)
68              throws IOException, TikaException, SAXException {
69  
70          InputStreamReader isr;
71          try {
72              // Headers are going to be 7-bit ascii
73              isr = new InputStreamReader(stream, "us-ascii");
74          } catch (UnsupportedEncodingException e) {
75              LOGGER.error("Unexpected exception setting up MboxParser", e);
76              isr = new InputStreamReader(stream);
77          }
78  
79          BufferedReader reader = new BufferedReader(isr);
80  
81          metadata.set(Metadata.CONTENT_TYPE, MBOX_MIME_TYPE);
82          metadata.set(Metadata.CONTENT_ENCODING, "us-ascii");
83  
84          XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
85          xhtml.startDocument();
86  
87          ParseStates parseState = ParseStates.START;
88          String multiLine = null;
89          boolean inQuote = false;
90          int numEmails = 0;
91  
92          // We're going to scan, line-by-line, for a line that starts with
93          // "From "
94          for (String curLine = reader.readLine(); curLine != null; curLine = reader.readLine()) {
95              boolean newMessage = curLine.startsWith(MBOX_RECORD_DIVIDER);
96              if (newMessage) {
97                  numEmails += 1;
98              }
99  
100             switch (parseState) {
101             case START:
102                 if (newMessage) {
103                     parseState = ParseStates.IN_HEADER;
104                     newMessage = false;
105                     // Fall through to IN_HEADER
106                 } else {
107                     break;
108                 }
109 
110             case IN_HEADER:
111                 if (newMessage) {
112                     saveHeaderInMetadata(numEmails, metadata, multiLine);
113                     multiLine = curLine;
114                 } else if (curLine.length() == 0) {
115                     // Blank line is signal that we're transitioning to the content.
116                     saveHeaderInMetadata(numEmails, metadata, multiLine);
117                     parseState = ParseStates.IN_CONTENT;
118 
119                     // Mimic what PackageParser does between entries.
120                     xhtml.startElement("div", "class", "email-entry");
121                     xhtml.startElement("p");
122                     inQuote = false;
123                 } else if (curLine.startsWith(" ") || curLine.startsWith("\t")) {
124                     multiLine += " " + curLine.trim();
125                 } else {
126                     saveHeaderInMetadata(numEmails, metadata, multiLine);
127                     multiLine = curLine;
128                 }
129 
130                 break;
131 
132                 // TODO - use real email parsing support so we can correctly handle
133                 // things like multipart messages and quoted-printable encoding.
134                 // We'd also want this for charset handling, where content isn't 7-bit
135                 // ascii.
136             case IN_CONTENT:
137                 if (newMessage) {
138                     endMessage(xhtml, inQuote);
139                     parseState = ParseStates.IN_HEADER;
140                     multiLine = curLine;
141                 } else {
142                     boolean quoted = curLine.startsWith(">");
143                     if (inQuote) {
144                         if (!quoted) {
145                             xhtml.endElement("q");
146                             inQuote = false;
147                         }
148                     } else if (quoted) {
149                         xhtml.startElement("q");
150                         inQuote = true;
151                     }
152 
153                     xhtml.characters(curLine);
154 
155                     // For plain text email, each line is a real break position.
156                     xhtml.element("br", "");
157                 }
158             }
159         }
160 
161         if (parseState == ParseStates.IN_HEADER) {
162             saveHeaderInMetadata(numEmails, metadata, multiLine);
163         } else if (parseState == ParseStates.IN_CONTENT) {
164             endMessage(xhtml, inQuote);
165         }
166 
167         xhtml.endDocument();
168     }
169 
170     private void endMessage(XHTMLContentHandler xhtml, boolean inQuote) throws SAXException {
171         if (inQuote) {
172             xhtml.endElement("q");
173         }
174 
175         xhtml.endElement("p");
176         xhtml.endElement("div");
177     }
178 
179     private void saveHeaderInMetadata(int numEmails, Metadata metadata, String curLine) {
180         if ((curLine == null) || (numEmails > 1)) {
181             return;
182         } else if (curLine.startsWith(MBOX_RECORD_DIVIDER)) {
183             metadata.add(EMAIL_FROMLINE_METADATA, curLine.substring(MBOX_RECORD_DIVIDER.length()));
184             return;
185         }
186 
187         Matcher headerMatcher = EMAIL_HEADER_PATTERN.matcher(curLine);
188         if (!headerMatcher.matches()) {
189             LOGGER.warn("Malformed email header in mbox file: " + curLine);
190             return;
191         }
192 
193         String headerTag = headerMatcher.group(1).toLowerCase();
194         String headerContent = headerMatcher.group(2);
195 
196         if (headerTag.equalsIgnoreCase("From")) {
197             metadata.add(Metadata.AUTHOR, headerContent);
198             metadata.add(Metadata.CREATOR, headerContent);
199         } else if (headerTag.equalsIgnoreCase("Subject")) {
200             metadata.add(Metadata.SUBJECT, headerContent);
201             metadata.add(Metadata.TITLE, headerContent);
202         } else if (headerTag.equalsIgnoreCase("Date")) {
203             // TODO - parse and convert to ISO format YYYY-MM-DD
204             metadata.add(Metadata.DATE, headerContent);
205         } else if (headerTag.equalsIgnoreCase("Message-Id")) {
206             metadata.add(Metadata.IDENTIFIER, headerContent);
207         } else if (headerTag.equalsIgnoreCase("In-Reply-To")) {
208             metadata.add(Metadata.RELATION, headerContent);
209         } else if (headerTag.equalsIgnoreCase("Content-Type")) {
210             // TODO - key off content-type in headers to
211             // set mapping to use for content and convert if necessary.
212 
213             metadata.add(Metadata.CONTENT_TYPE, headerContent);
214             metadata.add(Metadata.FORMAT, headerContent);
215         } else {
216             metadata.add(EMAIL_HEADER_METADATA_PREFIX + headerTag, headerContent);
217         }
218     }
219 
220     public void parse(
221             InputStream stream, ContentHandler handler, Metadata metadata)
222             throws IOException, SAXException, TikaException {
223         parse(stream, handler, metadata, new ParseContext());
224     }
225 
226 }