1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.tika.parser.opendocument;
18
19 import java.io.InputStream;
20
21 import junit.framework.TestCase;
22
23 import org.apache.tika.metadata.Metadata;
24 import org.apache.tika.sax.BodyContentHandler;
25 import org.xml.sax.ContentHandler;
26
27 public class OpenOfficeParserTest extends TestCase {
28
29 public void testXMLParser() throws Exception {
30 InputStream input = OpenOfficeParserTest.class.getResourceAsStream(
31 "/test-documents/testOpenOffice2.odt");
32 try {
33 Metadata metadata = new Metadata();
34 ContentHandler handler = new BodyContentHandler();
35 new OpenOfficeParser().parse(input, handler, metadata);
36
37 assertEquals(
38 "application/vnd.oasis.opendocument.text",
39 metadata.get(Metadata.CONTENT_TYPE));
40 assertEquals("2007-09-14T11:07:10", metadata.get(Metadata.DATE));
41 assertEquals("2007-09-14T11:06:08", metadata.get(Metadata.CREATION_DATE));
42 assertEquals("en-US", metadata.get(Metadata.LANGUAGE));
43 assertEquals("PT1M7S", metadata.get(Metadata.EDIT_TIME));
44 assertEquals(
45 "NeoOffice/2.2$Unix OpenOffice.org_project/680m18$Build-9161",
46 metadata.get("generator"));
47 assertEquals("0", metadata.get("nbTab"));
48 assertEquals("0", metadata.get("nbObject"));
49 assertEquals("0", metadata.get("nbImg"));
50 assertEquals("1", metadata.get("nbPage"));
51 assertEquals("1", metadata.get("nbPara"));
52 assertEquals("14", metadata.get("nbWord"));
53 assertEquals("78", metadata.get("nbCharacter"));
54
55
56 assertEquals(null, metadata.get("custom:Info 1"));
57 assertEquals(null, metadata.get("custom:Info 2"));
58 assertEquals(null, metadata.get("custom:Info 3"));
59 assertEquals(null, metadata.get("custom:Info 4"));
60
61 String content = handler.toString();
62 assertTrue(content.contains(
63 "This is a sample Open Office document,"
64 + " written in NeoOffice 2.2.1 for the Mac."));
65 } finally {
66 input.close();
67 }
68 }
69
70
71
72
73
74 public void testOO2Metadata() throws Exception {
75 InputStream input = OpenOfficeParserTest.class.getResourceAsStream(
76 "/test-documents/testOpenOffice2.odf");
77 try {
78 Metadata metadata = new Metadata();
79 ContentHandler handler = new BodyContentHandler();
80 new OpenOfficeParser().parse(input, handler, metadata);
81
82 assertEquals(
83 "application/vnd.oasis.opendocument.formula",
84 metadata.get(Metadata.CONTENT_TYPE));
85 assertEquals(null, metadata.get(Metadata.DATE));
86 assertEquals("2006-01-27T11:55:22", metadata.get(Metadata.CREATION_DATE));
87 assertEquals("The quick brown fox jumps over the lazy dog", metadata.get(Metadata.TITLE));
88 assertEquals("Gym class featuring a brown fox and lazy dog", metadata.get(Metadata.SUBJECT));
89 assertEquals("PT0S", metadata.get(Metadata.EDIT_TIME));
90 assertEquals("1", metadata.get("editing-cycles"));
91 assertEquals(
92 "OpenOffice.org/2.2$Win32 OpenOffice.org_project/680m14$Build-9134",
93 metadata.get("generator"));
94 assertEquals("Pangram, fox, dog", metadata.get(Metadata.KEYWORDS));
95
96
97 assertEquals("Text 1", metadata.get("custom:Info 1"));
98 assertEquals("2", metadata.get("custom:Info 2"));
99 assertEquals("false", metadata.get("custom:Info 3"));
100 assertEquals("true", metadata.get("custom:Info 4"));
101
102
103 assertEquals(null, metadata.get("nbTab"));
104 assertEquals(null, metadata.get("nbObject"));
105 assertEquals(null, metadata.get("nbImg"));
106 assertEquals(null, metadata.get("nbPage"));
107 assertEquals(null, metadata.get("nbPara"));
108 assertEquals(null, metadata.get("nbWord"));
109 assertEquals(null, metadata.get("nbCharacter"));
110
111
112 String content = handler.toString();
113 assertEquals("", content);
114 } finally {
115 input.close();
116 }
117 }
118
119
120
121
122 public void testOO3Metadata() throws Exception {
123 InputStream input = OpenOfficeParserTest.class.getResourceAsStream(
124 "/test-documents/testODFwithOOo3.odt");
125 try {
126 Metadata metadata = new Metadata();
127 ContentHandler handler = new BodyContentHandler();
128 new OpenOfficeParser().parse(input, handler, metadata);
129
130 assertEquals(
131 "application/vnd.oasis.opendocument.text",
132 metadata.get(Metadata.CONTENT_TYPE));
133 assertEquals("2009-10-05T21:22:38", metadata.get(Metadata.DATE));
134 assertEquals("2009-10-05T19:04:01", metadata.get(Metadata.CREATION_DATE));
135 assertEquals("Apache Tika", metadata.get(Metadata.TITLE));
136 assertEquals("Test document", metadata.get(Metadata.SUBJECT));
137 assertEquals("A rather complex document", metadata.get(Metadata.DESCRIPTION));
138 assertEquals("Bart Hanssens", metadata.get(Metadata.CREATOR));
139 assertEquals("Bart Hanssens", metadata.get("initial-creator"));
140 assertEquals("2", metadata.get("editing-cycles"));
141 assertEquals("PT02H03M24S", metadata.get(Metadata.EDIT_TIME));
142 assertEquals(
143 "OpenOffice.org/3.1$Unix OpenOffice.org_project/310m19$Build-9420",
144 metadata.get("generator"));
145 assertEquals("Apache, Lucene, Tika", metadata.get(Metadata.KEYWORDS));
146
147
148 assertEquals("Bart Hanssens", metadata.get("custom:Editor"));
149 assertEquals(null, metadata.get("custom:Info 2"));
150 assertEquals(null, metadata.get("custom:Info 3"));
151 assertEquals(null, metadata.get("custom:Info 4"));
152
153
154 assertEquals("0", metadata.get("nbTab"));
155 assertEquals("2", metadata.get("nbObject"));
156 assertEquals("0", metadata.get("nbImg"));
157 assertEquals("2", metadata.get("nbPage"));
158 assertEquals("13", metadata.get("nbPara"));
159 assertEquals("54", metadata.get("nbWord"));
160 assertEquals("351", metadata.get("nbCharacter"));
161
162 String content = handler.toString();
163 assertTrue(content.contains(
164 "Apache Tika Tika is part of the Lucene project."
165 ));
166 } finally {
167 input.close();
168 }
169 }
170 }