1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.parser.opendocument;
18  
19  import java.io.InputStream;
20  
21  import junit.framework.TestCase;
22  
23  import org.apache.tika.metadata.Metadata;
24  import org.apache.tika.sax.BodyContentHandler;
25  import org.xml.sax.ContentHandler;
26  
27  public class OpenOfficeParserTest extends TestCase {
28  
29      public void testXMLParser() throws Exception {
30          InputStream input = OpenOfficeParserTest.class.getResourceAsStream(
31                  "/test-documents/testOpenOffice2.odt");
32          try {
33              Metadata metadata = new Metadata();
34              ContentHandler handler = new BodyContentHandler();
35              new OpenOfficeParser().parse(input, handler, metadata);
36  
37              assertEquals(
38                      "application/vnd.oasis.opendocument.text",
39                      metadata.get(Metadata.CONTENT_TYPE));
40              assertEquals("2007-09-14T11:07:10", metadata.get(Metadata.DATE));
41              assertEquals("2007-09-14T11:06:08", metadata.get(Metadata.CREATION_DATE));
42              assertEquals("en-US", metadata.get(Metadata.LANGUAGE));
43              assertEquals("PT1M7S", metadata.get(Metadata.EDIT_TIME));
44              assertEquals(
45                      "NeoOffice/2.2$Unix OpenOffice.org_project/680m18$Build-9161",
46                      metadata.get("generator"));
47              assertEquals("0", metadata.get("nbTab"));
48              assertEquals("0", metadata.get("nbObject"));
49              assertEquals("0", metadata.get("nbImg"));
50              assertEquals("1", metadata.get("nbPage"));
51              assertEquals("1", metadata.get("nbPara"));
52              assertEquals("14", metadata.get("nbWord"));
53              assertEquals("78", metadata.get("nbCharacter"));
54              
55              // Custom metadata tags present but without values
56              assertEquals(null, metadata.get("custom:Info 1"));
57              assertEquals(null, metadata.get("custom:Info 2"));
58              assertEquals(null, metadata.get("custom:Info 3"));
59              assertEquals(null, metadata.get("custom:Info 4"));
60  
61              String content = handler.toString();
62              assertTrue(content.contains(
63                      "This is a sample Open Office document,"
64                      + " written in NeoOffice 2.2.1 for the Mac."));
65          } finally {
66              input.close();
67          }
68      }
69  
70      /**
71       * Similar to {@link #testXMLParser()}, but using a different
72       *  OO2 file with different metadata in it
73       */
74      public void testOO2Metadata() throws Exception {
75         InputStream input = OpenOfficeParserTest.class.getResourceAsStream(
76               "/test-documents/testOpenOffice2.odf");
77         try {
78              Metadata metadata = new Metadata();
79              ContentHandler handler = new BodyContentHandler();
80              new OpenOfficeParser().parse(input, handler, metadata);
81     
82              assertEquals(
83                      "application/vnd.oasis.opendocument.formula",
84                      metadata.get(Metadata.CONTENT_TYPE));
85              assertEquals(null, metadata.get(Metadata.DATE));
86              assertEquals("2006-01-27T11:55:22", metadata.get(Metadata.CREATION_DATE));
87              assertEquals("The quick brown fox jumps over the lazy dog", metadata.get(Metadata.TITLE));
88              assertEquals("Gym class featuring a brown fox and lazy dog", metadata.get(Metadata.SUBJECT));
89              assertEquals("PT0S", metadata.get(Metadata.EDIT_TIME));
90              assertEquals("1", metadata.get("editing-cycles"));
91              assertEquals(
92                      "OpenOffice.org/2.2$Win32 OpenOffice.org_project/680m14$Build-9134",
93                      metadata.get("generator"));
94              assertEquals("Pangram, fox, dog", metadata.get(Metadata.KEYWORDS));
95              
96              // User defined metadata
97              assertEquals("Text 1", metadata.get("custom:Info 1"));
98              assertEquals("2", metadata.get("custom:Info 2"));
99              assertEquals("false", metadata.get("custom:Info 3"));
100             assertEquals("true", metadata.get("custom:Info 4"));
101             
102             // No statistics present
103             assertEquals(null, metadata.get("nbTab"));
104             assertEquals(null, metadata.get("nbObject"));
105             assertEquals(null, metadata.get("nbImg"));
106             assertEquals(null, metadata.get("nbPage"));
107             assertEquals(null, metadata.get("nbPara"));
108             assertEquals(null, metadata.get("nbWord"));
109             assertEquals(null, metadata.get("nbCharacter"));
110    
111             // Note - contents of maths files not currently supported
112             String content = handler.toString();
113             assertEquals("", content);
114        } finally {
115            input.close();
116        }
117     }
118 
119     /**
120      * Similar to {@link #testXMLParser()}, but using an OO3 file
121      */
122     public void testOO3Metadata() throws Exception {
123        InputStream input = OpenOfficeParserTest.class.getResourceAsStream(
124              "/test-documents/testODFwithOOo3.odt");
125        try {
126             Metadata metadata = new Metadata();
127             ContentHandler handler = new BodyContentHandler();
128             new OpenOfficeParser().parse(input, handler, metadata);
129    
130             assertEquals(
131                     "application/vnd.oasis.opendocument.text",
132                     metadata.get(Metadata.CONTENT_TYPE));
133             assertEquals("2009-10-05T21:22:38", metadata.get(Metadata.DATE));
134             assertEquals("2009-10-05T19:04:01", metadata.get(Metadata.CREATION_DATE));
135             assertEquals("Apache Tika", metadata.get(Metadata.TITLE));
136             assertEquals("Test document", metadata.get(Metadata.SUBJECT));
137             assertEquals("A rather complex document", metadata.get(Metadata.DESCRIPTION));
138             assertEquals("Bart Hanssens", metadata.get(Metadata.CREATOR));
139             assertEquals("Bart Hanssens", metadata.get("initial-creator"));
140             assertEquals("2", metadata.get("editing-cycles"));
141             assertEquals("PT02H03M24S", metadata.get(Metadata.EDIT_TIME));
142             assertEquals(
143                     "OpenOffice.org/3.1$Unix OpenOffice.org_project/310m19$Build-9420",
144                     metadata.get("generator"));
145             assertEquals("Apache, Lucene, Tika", metadata.get(Metadata.KEYWORDS));
146             
147             // User defined metadata
148             assertEquals("Bart Hanssens", metadata.get("custom:Editor"));
149             assertEquals(null, metadata.get("custom:Info 2"));
150             assertEquals(null, metadata.get("custom:Info 3"));
151             assertEquals(null, metadata.get("custom:Info 4"));
152             
153             // No statistics present
154             assertEquals("0", metadata.get("nbTab"));
155             assertEquals("2", metadata.get("nbObject"));
156             assertEquals("0", metadata.get("nbImg"));
157             assertEquals("2", metadata.get("nbPage"));
158             assertEquals("13", metadata.get("nbPara"));
159             assertEquals("54", metadata.get("nbWord"));
160             assertEquals("351", metadata.get("nbCharacter"));
161    
162             String content = handler.toString();
163             assertTrue(content.contains(
164                   "Apache Tika Tika is part of the Lucene project."
165             ));
166        } finally {
167            input.close();
168        }
169     }
170 }