View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.gui;
18  
19  import java.awt.Dimension;
20  import java.io.IOException;
21  import java.io.InputStream;
22  import java.io.PrintWriter;
23  import java.io.StringWriter;
24  import java.io.Writer;
25  import java.util.Arrays;
26  
27  import javax.swing.JEditorPane;
28  import javax.swing.JFrame;
29  import javax.swing.JOptionPane;
30  import javax.swing.JScrollPane;
31  import javax.swing.JTabbedPane;
32  import javax.swing.ProgressMonitorInputStream;
33  import javax.swing.SwingUtilities;
34  import javax.swing.UIManager;
35  import javax.xml.transform.OutputKeys;
36  import javax.xml.transform.TransformerConfigurationException;
37  import javax.xml.transform.sax.SAXTransformerFactory;
38  import javax.xml.transform.sax.TransformerHandler;
39  import javax.xml.transform.stream.StreamResult;
40  
41  import org.apache.tika.metadata.Metadata;
42  import org.apache.tika.parser.AutoDetectParser;
43  import org.apache.tika.parser.ParseContext;
44  import org.apache.tika.parser.Parser;
45  import org.apache.tika.sax.BodyContentHandler;
46  import org.apache.tika.sax.ContentHandlerDecorator;
47  import org.apache.tika.sax.TeeContentHandler;
48  import org.apache.tika.sax.XHTMLContentHandler;
49  import org.xml.sax.Attributes;
50  import org.xml.sax.ContentHandler;
51  import org.xml.sax.SAXException;
52  
53  /**
54   * Simple Swing GUI for Apache Tika. You can drag and drop files on top
55   * of the window to have them parsed.
56   */
57  public class TikaGUI extends JFrame {
58  
59      /**
60       * Serial version UID.
61       */
62      private static final long serialVersionUID = 5883906936187059495L;
63  
64      /**
65       * Main method. Sets the Swing look and feel to the operating system
66       * settings, and starts the Tika GUI with an {@link AutoDetectParser}
67       * instance as the default parser.
68       *
69       * @param args ignored
70       * @throws Exception if an error occurs
71       */
72      public static void main(String[] args) throws Exception {
73          UIManager.setLookAndFeel(UIManager.getSystemLookAndFeelClassName());
74          SwingUtilities.invokeLater(new Runnable() {
75              public void run() {
76                  new TikaGUI(new AutoDetectParser()).setVisible(true);
77              }
78          });
79      }
80  
81      /**
82       * Parsing context.
83       */
84      private final ParseContext context;
85  
86      /**
87       * Configured parser instance.
88       */
89      private final Parser parser;
90  
91      /**
92       * Tabs in the Tika GUI window.
93       */
94      private final JTabbedPane tabs;
95  
96      /**
97       * Formatted XHTML output.
98       */
99      private final JEditorPane html;
100 
101     /**
102      * Plain text output.
103      */
104     private final JEditorPane text;
105 
106     /**
107      * Raw XHTML source.
108      */
109     private final JEditorPane xml;
110 
111     /**
112      * Document metadata.
113      */
114     private final JEditorPane metadata;
115 
116     /**
117      * Parsing errors.
118      */
119     private final JEditorPane errors;
120 
121     public TikaGUI(Parser parser) {
122         super("Apache Tika");
123         setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
124 
125         tabs = new JTabbedPane();
126         add(tabs);
127 
128         html = createEditor("Formatted text", "text/html");
129         text = createEditor("Plain text", "text/plain");
130         xml = createEditor("Structured text", "text/plain");
131         metadata = createEditor("Metadata", "text/plain");
132         errors = createEditor("Errors", "text/plain");
133 
134         setPreferredSize(new Dimension(500, 400));
135         pack();
136 
137         this.context = new ParseContext();
138         this.parser = parser;
139         this.context.set(Parser.class, parser);
140     }
141 
142    public void importStream(InputStream input, Metadata md)
143            throws IOException {
144         try {
145             StringWriter htmlBuffer = new StringWriter();
146             StringWriter textBuffer = new StringWriter();
147             StringWriter xmlBuffer = new StringWriter();
148             StringBuilder metadataBuffer = new StringBuilder();
149 
150             ContentHandler handler = new TeeContentHandler(
151                     getHtmlHandler(htmlBuffer),
152                     getTextContentHandler(textBuffer),
153                     getXmlContentHandler(xmlBuffer));
154 
155             input = new ProgressMonitorInputStream(
156                     this, "Parsing stream", input);
157             parser.parse(input, handler, md, context);
158 
159             String[] names = md.names();
160             Arrays.sort(names);
161             for (String name : names) {
162                 metadataBuffer.append(name);
163                 metadataBuffer.append(": ");
164                 metadataBuffer.append(md.get(name));
165                 metadataBuffer.append("\n");
166             }
167 
168             setText(errors, "");
169             setText(metadata, metadataBuffer.toString());
170             setText(xml, xmlBuffer.toString());
171             setText(text, textBuffer.toString());
172             setText(html, htmlBuffer.toString());
173             tabs.setSelectedIndex(0);
174         } catch (Exception e) {
175             StringWriter writer = new StringWriter();
176             e.printStackTrace(new PrintWriter(writer));
177             setText(errors, writer.toString());
178             setText(metadata, "");
179             setText(xml, "");
180             setText(text, "");
181             setText(html, "");
182             tabs.setSelectedIndex(tabs.getTabCount() - 1);
183             JOptionPane.showMessageDialog(
184                     this,
185                     "Apache Tika was unable to parse the file or url.\n "
186                     + " See the errors tab for"
187                     + " the detailed stack trace of this error.",
188                     "Parse error",
189                     JOptionPane.ERROR_MESSAGE);
190         } finally {
191             input.close();
192         }
193     }
194 
195     private JEditorPane createEditor(String title, String type) {
196         JEditorPane editor = new JEditorPane();
197         editor.setContentType(type);
198         editor.setTransferHandler(new ParsingTransferHandler(
199                 editor.getTransferHandler(), this));
200         tabs.add(title, new JScrollPane(editor));
201         return editor;
202     }
203 
204     private void setText(JEditorPane editor, String text) {
205         editor.setText(text);
206         editor.setCaretPosition(0);
207     }
208 
209     /**
210      * Creates and returns a content handler that turns XHTML input to
211      * simplified HTML output that can be correctly parsed and displayed
212      * by {@link JEditorPane}.
213      * <p>
214      * The returned content handler is set to output <code>html</code>
215      * to the given writer. The XHTML namespace is removed from the output
216      * to prevent the serializer from using the &lt;tag/&gt; empty element
217      * syntax that causes extra "&gt;" characters to be displayed.
218      * The &lt;head&gt; tags are dropped to prevent the serializer from
219      * generating a &lt;META&gt; content type tag that makes
220      * {@link JEditorPane} fail thinking that the document character set
221      * is inconsistent.
222      *
223      * @param writer output writer
224      * @return HTML content handler
225      * @throws TransformerConfigurationException if an error occurs
226      */
227     private ContentHandler getHtmlHandler(Writer writer)
228             throws TransformerConfigurationException {
229         SAXTransformerFactory factory = (SAXTransformerFactory)
230             SAXTransformerFactory.newInstance();
231         TransformerHandler handler = factory.newTransformerHandler();
232         handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
233         handler.setResult(new StreamResult(writer));
234         return new ContentHandlerDecorator(handler) {
235             @Override
236             public void startElement(
237                     String uri, String localName, String name, Attributes atts)
238                     throws SAXException {
239                 if (XHTMLContentHandler.XHTML.equals(uri)) {
240                     uri = null;
241                 }
242                 if (!"head".equals(localName)) {
243                     super.startElement(uri, localName, name, atts);
244                 }
245             }
246             @Override
247             public void endElement(String uri, String localName, String name)
248                     throws SAXException {
249                 if (XHTMLContentHandler.XHTML.equals(uri)) {
250                     uri = null;
251                 }
252                 if (!"head".equals(localName)) {
253                     super.endElement(uri, localName, name);
254                 }
255             }
256             @Override
257             public void startPrefixMapping(String prefix, String uri) {
258             }
259             @Override
260             public void endPrefixMapping(String prefix) {
261             }
262         };
263     }
264 
265     private ContentHandler getTextContentHandler(Writer writer) {
266         return new BodyContentHandler(writer);
267     }
268 
269     private ContentHandler getXmlContentHandler(Writer writer)
270             throws TransformerConfigurationException {
271         SAXTransformerFactory factory = (SAXTransformerFactory)
272             SAXTransformerFactory.newInstance();
273         TransformerHandler handler = factory.newTransformerHandler();
274         handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
275         handler.setResult(new StreamResult(writer));
276         return handler;
277     }
278 
279 }