1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.tika.parser.mbox;
18
19 import java.io.BufferedReader;
20 import java.io.IOException;
21 import java.io.InputStream;
22 import java.io.InputStreamReader;
23 import java.io.UnsupportedEncodingException;
24 import java.util.Collections;
25 import java.util.Set;
26 import java.util.regex.Matcher;
27 import java.util.regex.Pattern;
28
29 import org.apache.log4j.Logger;
30 import org.apache.tika.exception.TikaException;
31 import org.apache.tika.metadata.Metadata;
32 import org.apache.tika.mime.MediaType;
33 import org.apache.tika.parser.ParseContext;
34 import org.apache.tika.parser.Parser;
35 import org.apache.tika.sax.XHTMLContentHandler;
36 import org.xml.sax.ContentHandler;
37 import org.xml.sax.SAXException;
38
39
40
41
42
43 public class MboxParser implements Parser {
44
45 private static final Logger LOGGER = Logger.getLogger(MboxParser.class);
46
47 private static final Set<MediaType> SUPPORTED_TYPES =
48 Collections.singleton(MediaType.application("mbox"));
49
50 public static final String MBOX_MIME_TYPE = "application/mbox";
51 public static final String MBOX_RECORD_DIVIDER = "From ";
52 private static final Pattern EMAIL_HEADER_PATTERN = Pattern.compile("([^ ]+):[ \t]*(.*)");
53
54 private static final String EMAIL_HEADER_METADATA_PREFIX = MboxParser.class.getSimpleName() + "-";
55 private static final String EMAIL_FROMLINE_METADATA = EMAIL_HEADER_METADATA_PREFIX + "from";
56
57 private enum ParseStates {
58 START, IN_HEADER, IN_CONTENT
59 }
60
61 public Set<MediaType> getSupportedTypes(ParseContext context) {
62 return SUPPORTED_TYPES;
63 }
64
65 public void parse(
66 InputStream stream, ContentHandler handler,
67 Metadata metadata, ParseContext context)
68 throws IOException, TikaException, SAXException {
69
70 InputStreamReader isr;
71 try {
72
73 isr = new InputStreamReader(stream, "us-ascii");
74 } catch (UnsupportedEncodingException e) {
75 LOGGER.error("Unexpected exception setting up MboxParser", e);
76 isr = new InputStreamReader(stream);
77 }
78
79 BufferedReader reader = new BufferedReader(isr);
80
81 metadata.set(Metadata.CONTENT_TYPE, MBOX_MIME_TYPE);
82 metadata.set(Metadata.CONTENT_ENCODING, "us-ascii");
83
84 XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
85 xhtml.startDocument();
86
87 ParseStates parseState = ParseStates.START;
88 String multiLine = null;
89 boolean inQuote = false;
90 int numEmails = 0;
91
92
93
94 for (String curLine = reader.readLine(); curLine != null; curLine = reader.readLine()) {
95 boolean newMessage = curLine.startsWith(MBOX_RECORD_DIVIDER);
96 if (newMessage) {
97 numEmails += 1;
98 }
99
100 switch (parseState) {
101 case START:
102 if (newMessage) {
103 parseState = ParseStates.IN_HEADER;
104 newMessage = false;
105
106 } else {
107 break;
108 }
109
110 case IN_HEADER:
111 if (newMessage) {
112 saveHeaderInMetadata(numEmails, metadata, multiLine);
113 multiLine = curLine;
114 } else if (curLine.length() == 0) {
115
116 saveHeaderInMetadata(numEmails, metadata, multiLine);
117 parseState = ParseStates.IN_CONTENT;
118
119
120 xhtml.startElement("div", "class", "email-entry");
121 xhtml.startElement("p");
122 inQuote = false;
123 } else if (curLine.startsWith(" ") || curLine.startsWith("\t")) {
124 multiLine += " " + curLine.trim();
125 } else {
126 saveHeaderInMetadata(numEmails, metadata, multiLine);
127 multiLine = curLine;
128 }
129
130 break;
131
132
133
134
135
136 case IN_CONTENT:
137 if (newMessage) {
138 endMessage(xhtml, inQuote);
139 parseState = ParseStates.IN_HEADER;
140 multiLine = curLine;
141 } else {
142 boolean quoted = curLine.startsWith(">");
143 if (inQuote) {
144 if (!quoted) {
145 xhtml.endElement("q");
146 inQuote = false;
147 }
148 } else if (quoted) {
149 xhtml.startElement("q");
150 inQuote = true;
151 }
152
153 xhtml.characters(curLine);
154
155
156 xhtml.element("br", "");
157 }
158 }
159 }
160
161 if (parseState == ParseStates.IN_HEADER) {
162 saveHeaderInMetadata(numEmails, metadata, multiLine);
163 } else if (parseState == ParseStates.IN_CONTENT) {
164 endMessage(xhtml, inQuote);
165 }
166
167 xhtml.endDocument();
168 }
169
170 private void endMessage(XHTMLContentHandler xhtml, boolean inQuote) throws SAXException {
171 if (inQuote) {
172 xhtml.endElement("q");
173 }
174
175 xhtml.endElement("p");
176 xhtml.endElement("div");
177 }
178
179 private void saveHeaderInMetadata(int numEmails, Metadata metadata, String curLine) {
180 if ((curLine == null) || (numEmails > 1)) {
181 return;
182 } else if (curLine.startsWith(MBOX_RECORD_DIVIDER)) {
183 metadata.add(EMAIL_FROMLINE_METADATA, curLine.substring(MBOX_RECORD_DIVIDER.length()));
184 return;
185 }
186
187 Matcher headerMatcher = EMAIL_HEADER_PATTERN.matcher(curLine);
188 if (!headerMatcher.matches()) {
189 LOGGER.warn("Malformed email header in mbox file: " + curLine);
190 return;
191 }
192
193 String headerTag = headerMatcher.group(1).toLowerCase();
194 String headerContent = headerMatcher.group(2);
195
196 if (headerTag.equalsIgnoreCase("From")) {
197 metadata.add(Metadata.AUTHOR, headerContent);
198 metadata.add(Metadata.CREATOR, headerContent);
199 } else if (headerTag.equalsIgnoreCase("Subject")) {
200 metadata.add(Metadata.SUBJECT, headerContent);
201 metadata.add(Metadata.TITLE, headerContent);
202 } else if (headerTag.equalsIgnoreCase("Date")) {
203
204 metadata.add(Metadata.DATE, headerContent);
205 } else if (headerTag.equalsIgnoreCase("Message-Id")) {
206 metadata.add(Metadata.IDENTIFIER, headerContent);
207 } else if (headerTag.equalsIgnoreCase("In-Reply-To")) {
208 metadata.add(Metadata.RELATION, headerContent);
209 } else if (headerTag.equalsIgnoreCase("Content-Type")) {
210
211
212
213 metadata.add(Metadata.CONTENT_TYPE, headerContent);
214 metadata.add(Metadata.FORMAT, headerContent);
215 } else {
216 metadata.add(EMAIL_HEADER_METADATA_PREFIX + headerTag, headerContent);
217 }
218 }
219
220 public void parse(
221 InputStream stream, ContentHandler handler, Metadata metadata)
222 throws IOException, SAXException, TikaException {
223 parse(stream, handler, metadata, new ParseContext());
224 }
225
226 }