View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.parser.odf;
18  
19  import java.io.IOException;
20  import java.io.InputStream;
21  import java.util.Arrays;
22  import java.util.Collections;
23  import java.util.HashSet;
24  import java.util.Set;
25  import java.util.zip.ZipEntry;
26  import java.util.zip.ZipInputStream;
27  
28  import org.apache.tika.exception.TikaException;
29  import org.apache.tika.io.IOUtils;
30  import org.apache.tika.metadata.Metadata;
31  import org.apache.tika.mime.MediaType;
32  import org.apache.tika.parser.ParseContext;
33  import org.apache.tika.parser.Parser;
34  import org.xml.sax.ContentHandler;
35  import org.xml.sax.SAXException;
36  import org.xml.sax.helpers.DefaultHandler;
37  
38  /**
39   * OpenOffice parser
40   */
41  public class OpenDocumentParser implements Parser {
42  
43      private static final Set<MediaType> SUPPORTED_TYPES =
44          Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
45                  MediaType.application("vnd.sun.xml.writer"),
46                  MediaType.application("vnd.oasis.opendocument.text"),
47                  MediaType.application("vnd.oasis.opendocument.graphics"),
48                  MediaType.application("vnd.oasis.opendocument.presentation"),
49                  MediaType.application("vnd.oasis.opendocument.spreadsheet"),
50                  MediaType.application("vnd.oasis.opendocument.chart"),
51                  MediaType.application("vnd.oasis.opendocument.image"),
52                  MediaType.application("vnd.oasis.opendocument.formula"),
53                  MediaType.application("vnd.oasis.opendocument.text-master"),
54                  MediaType.application("vnd.oasis.opendocument.text-web"),
55                  MediaType.application("vnd.oasis.opendocument.text-template"),
56                  MediaType.application("vnd.oasis.opendocument.graphics-template"),
57                  MediaType.application("vnd.oasis.opendocument.presentation-template"),
58                  MediaType.application("vnd.oasis.opendocument.spreadsheet-template"),
59                  MediaType.application("vnd.oasis.opendocument.chart-template"),
60                  MediaType.application("vnd.oasis.opendocument.image-template"),
61                  MediaType.application("vnd.oasis.opendocument.formula-template"),
62                  MediaType.application("x-vnd.oasis.opendocument.text"),
63                  MediaType.application("x-vnd.oasis.opendocument.graphics"),
64                  MediaType.application("x-vnd.oasis.opendocument.presentation"),
65                  MediaType.application("x-vnd.oasis.opendocument.spreadsheet"),
66                  MediaType.application("x-vnd.oasis.opendocument.chart"),
67                  MediaType.application("x-vnd.oasis.opendocument.image"),
68                  MediaType.application("x-vnd.oasis.opendocument.formula"),
69                  MediaType.application("x-vnd.oasis.opendocument.text-master"),
70                  MediaType.application("x-vnd.oasis.opendocument.text-web"),
71                  MediaType.application("x-vnd.oasis.opendocument.text-template"),
72                  MediaType.application("x-vnd.oasis.opendocument.graphics-template"),
73                  MediaType.application("x-vnd.oasis.opendocument.presentation-template"),
74                  MediaType.application("x-vnd.oasis.opendocument.spreadsheet-template"),
75                  MediaType.application("x-vnd.oasis.opendocument.chart-template"),
76                  MediaType.application("x-vnd.oasis.opendocument.image-template"),
77                  MediaType.application("x-vnd.oasis.opendocument.formula-template"))));
78  
79      private Parser meta = new OpenDocumentMetaParser();
80  
81      private Parser content = new OpenDocumentContentParser();
82  
83      public Parser getMetaParser() {
84          return meta;
85      }
86  
87      public void setMetaParser(Parser meta) {
88          this.meta = meta;
89      }
90  
91      public Parser getContentParser() {
92          return content;
93      }
94  
95      public void setContentParser(Parser content) {
96          this.content = content;
97      }
98  
99      public Set<MediaType> getSupportedTypes(ParseContext context) {
100         return SUPPORTED_TYPES;
101     }
102 
103     public void parse(
104             InputStream stream, ContentHandler handler,
105             Metadata metadata, ParseContext context)
106             throws IOException, SAXException, TikaException {
107         ZipInputStream zip = new ZipInputStream(stream);
108         ZipEntry entry = zip.getNextEntry();
109         while (entry != null) {
110             if (entry.getName().equals("mimetype")) {
111                 String type = IOUtils.toString(zip, "UTF-8");
112                 metadata.set(Metadata.CONTENT_TYPE, type);
113             } else if (entry.getName().equals("meta.xml")) {
114                 meta.parse(zip, new DefaultHandler(), metadata, context);
115             } else if (entry.getName().endsWith("content.xml")) {
116                 content.parse(zip, handler, metadata, context);
117             }
118             entry = zip.getNextEntry();
119         }
120     }
121 
122     /**
123      * @deprecated This method will be removed in Apache Tika 1.0.
124      */
125     public void parse(
126             InputStream stream, ContentHandler handler, Metadata metadata)
127             throws IOException, SAXException, TikaException {
128         parse(stream, handler, metadata, new ParseContext());
129     }
130 
131 }