1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.parser.microsoft;
18  
19  import java.io.InputStream;
20  import java.util.regex.Matcher;
21  import java.util.regex.Pattern;
22  
23  import junit.framework.TestCase;
24  
25  import org.apache.tika.metadata.Metadata;
26  import org.apache.tika.parser.AutoDetectParser;
27  import org.apache.tika.parser.Parser;
28  import org.apache.tika.sax.BodyContentHandler;
29  import org.xml.sax.ContentHandler;
30  
31  /**
32   * Test case for parsing Outlook files.
33   */
34  public class OutlookParserTest extends TestCase {
35  
36      public void testOutlookParsing() throws Exception {
37          Parser parser = new AutoDetectParser(); // Should auto-detect!
38          ContentHandler handler = new BodyContentHandler();
39          Metadata metadata = new Metadata();
40  
41          InputStream stream = OutlookParserTest.class.getResourceAsStream(
42                  "/test-documents/test-outlook.msg");
43          try {
44              parser.parse(stream, handler, metadata);
45          } finally {
46              stream.close();
47          }
48  
49          assertEquals(
50                  "application/vnd.ms-outlook",
51                  metadata.get(Metadata.CONTENT_TYPE));
52          assertEquals(
53                  "Microsoft Outlook Express 6",
54                  metadata.get(Metadata.TITLE));
55          // TODO: There's apparently some encoding issue in POI
56          //assertEquals(
57          //        "L'\u00C9quipe Microsoft Outlook Express",
58          //        metadata.get(Metadata.AUTHOR));
59  
60          String content = handler.toString();
61          assertTrue(content.contains("Microsoft Outlook Express 6"));
62          //assertTrue(content.contains("L'\u00C9quipe Microsoft Outlook Express"));
63          assertTrue(content.contains("Nouvel utilisateur de Outlook Express"));
64          assertTrue(content.contains("Messagerie et groupes de discussion"));
65      }
66  
67      /**
68       * Test case for TIKA-197
69       *
70       * @see <a href="https://issues.apache.org/jira/browse/TIKA-197">TIKA-197</a>
71       */
72      public void testMultipleCopies() throws Exception {
73          Parser parser = new AutoDetectParser();
74          ContentHandler handler = new BodyContentHandler();
75          Metadata metadata = new Metadata();
76  
77          InputStream stream = OutlookParserTest.class.getResourceAsStream(
78                  "/test-documents/testMSG.msg");
79          try {
80              parser.parse(stream, handler, metadata);
81          } finally {
82              stream.close();
83          }
84  
85          assertEquals(
86                  "application/vnd.ms-outlook",
87                  metadata.get(Metadata.CONTENT_TYPE));
88  
89          String content = handler.toString();
90          Pattern pattern = Pattern.compile("From");
91          Matcher matcher = pattern.matcher(content);
92          assertTrue(matcher.find());
93          assertFalse(matcher.find());
94      }
95  
96      /**
97       * Test case for TIKA-395, to ensure parser works for new Outlook formats. 
98       *
99       * @see <a href="https://issues.apache.org/jira/browse/TIKA-395">TIKA-395</a>
100      */
101     public void testOutlookNew() throws Exception {
102         Parser parser = new AutoDetectParser();
103         ContentHandler handler = new BodyContentHandler();
104         Metadata metadata = new Metadata();
105 
106         InputStream stream = OutlookParserTest.class.getResourceAsStream(
107                 "/test-documents/test-outlook2003.msg");
108         try {
109             parser.parse(stream, handler, metadata);
110         } finally {
111             stream.close();
112         }
113 
114         assertEquals(
115                 "application/vnd.ms-outlook",
116                 metadata.get(Metadata.CONTENT_TYPE));
117         assertEquals(
118                 "Welcome to Microsoft Office Outlook 2003",
119                 metadata.get(Metadata.TITLE));
120 
121         String content = handler.toString();
122         assertTrue(content.contains("Outlook 2003"));
123         assertTrue(content.contains("Streamlined Mail Experience"));
124         assertTrue(content.contains("Navigation Pane"));
125     }
126 
127 }