View Javadoc

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17  package org.apache.tika.cli;
18  
19  import java.io.File;
20  import java.io.InputStream;
21  import java.io.OutputStreamWriter;
22  import java.io.PrintStream;
23  import java.io.PrintWriter;
24  import java.io.UnsupportedEncodingException;
25  import java.io.Writer;
26  import java.net.URL;
27  import java.util.Arrays;
28  
29  import javax.xml.transform.OutputKeys;
30  import javax.xml.transform.TransformerConfigurationException;
31  import javax.xml.transform.sax.SAXTransformerFactory;
32  import javax.xml.transform.sax.TransformerHandler;
33  import javax.xml.transform.stream.StreamResult;
34  
35  import org.apache.log4j.BasicConfigurator;
36  import org.apache.log4j.Level;
37  import org.apache.log4j.Logger;
38  import org.apache.log4j.SimpleLayout;
39  import org.apache.log4j.WriterAppender;
40  import org.apache.tika.gui.TikaGUI;
41  import org.apache.tika.metadata.Metadata;
42  import org.apache.tika.metadata.MetadataHelper;
43  import org.apache.tika.parser.AutoDetectParser;
44  import org.apache.tika.parser.ParseContext;
45  import org.apache.tika.parser.Parser;
46  import org.apache.tika.sax.BodyContentHandler;
47  import org.xml.sax.ContentHandler;
48  import org.xml.sax.helpers.DefaultHandler;
49  
50  /**
51   * Simple command line interface for Apache Tika.
52   */
53  public class TikaCLI {
54  
55      public static void main(String[] args) throws Exception {
56          BasicConfigurator.configure(
57                  new WriterAppender(new SimpleLayout(), System.err));
58          Logger.getRootLogger().setLevel(Level.INFO);
59  
60          TikaCLI cli = new TikaCLI();
61          for (int i = 0; i < args.length; i++) {
62              cli.process(args[i]);
63          }
64          if (cli.pipeMode) {
65              cli.process("-");
66          }
67      }
68  
69      private interface OutputType {
70          ContentHandler getContentHandler() throws Exception;
71      }
72  
73      private final OutputType XML = new OutputType() {
74          public ContentHandler getContentHandler() throws Exception {
75              return getTransformerHandler("xml", encoding);
76          }
77      };
78  
79      private final OutputType HTML = new OutputType() {
80          public ContentHandler getContentHandler() throws Exception {
81              return getTransformerHandler("html", encoding);
82          }
83      };
84  
85      private final OutputType TEXT = new OutputType() {
86          public ContentHandler getContentHandler() throws Exception {
87              return new BodyContentHandler(getSystemOutWriter(encoding));
88          }
89      };
90  
91      private final OutputType METADATA = new OutputType() {
92          public ContentHandler getContentHandler() throws Exception {
93              final PrintWriter writer =
94                  new PrintWriter(getSystemOutWriter(encoding));
95              return new DefaultHandler() {
96                  public void endDocument() {
97                      String[] names = metadata.names();
98                      Arrays.sort(names);
99                      for (String name : names) {
100                         writer.println(name + ": " + metadata.get(name));
101                     }
102 
103                     writer.flush();
104                 }
105             };
106         }
107     };
108 
109     private ParseContext context;
110 
111     private Parser parser;
112 
113     private Metadata metadata;
114 
115     private OutputType type = XML;
116 
117     /**
118      * Output character encoding, or <code>null</code> for platform default
119      */
120     private String encoding = null;
121 
122     private boolean pipeMode = true;
123 
124     public TikaCLI() throws TransformerConfigurationException {
125         context = new ParseContext();
126         parser = new AutoDetectParser();
127         context.set(Parser.class, parser);
128     }
129 
130     public void process(String arg) throws Exception {
131         if (arg.equals("-?") || arg.equals("--help")) {
132             pipeMode = false;
133             usage();
134         } else if (arg.equals("-v") || arg.equals("--verbose")) {
135             Logger.getRootLogger().setLevel(Level.DEBUG);
136         } else if (arg.equals("-g") || arg.equals("--gui")) {
137             pipeMode = false;
138             TikaGUI.main(new String[0]);
139         } else if (arg.startsWith("-e")) {
140             encoding = arg.substring("-e".length());
141         } else if (arg.startsWith("--encoding=")) {
142             encoding = arg.substring("--encoding=".length());
143         } else if (arg.equals("-x") || arg.equals("--xml")) {
144             type = XML;
145         } else if (arg.equals("-h") || arg.equals("--html")) {
146             type = HTML;
147         } else if (arg.equals("-t") || arg.equals("--text")) {
148             type = TEXT;
149         } else if (arg.equals("-m") || arg.equals("--metadata")) {
150             type = METADATA;
151         } else {
152             pipeMode = false;
153             metadata = new Metadata();
154             if (arg.equals("-")) {
155                 parser.parse(
156                         System.in, type.getContentHandler(),
157                         metadata, context);
158             } else {
159                 URL url;
160                 File file = new File(arg);
161                 if (file.isFile()) {
162                     url = file.toURI().toURL();
163                 } else {
164                     url = new URL(arg);
165                 }
166                 InputStream input =
167                     MetadataHelper.getInputStream(url, metadata);
168                 try {
169                     parser.parse(
170                             input, type.getContentHandler(),
171                             metadata, context);
172                 } finally {
173                     input.close();
174                 }
175             }
176         }
177     }
178 
179     private void usage() {
180         PrintStream out = System.out;
181         out.println("usage: tika [option] [file]");
182         out.println();
183         out.println("Options:");
184         out.println("    -?  or --help        Print this usage message");
185         out.println("    -v  or --verbose     Print debug level messages");
186         out.println("    -g  or --gui         Start the Apache Tika GUI");
187         out.println("    -eX or --encoding=X  Use output encoding X");
188         out.println("    -x  or --xml         Output XHTML content (default)");
189         out.println("    -h  or --html        Output HTML content");
190         out.println("    -t  or --text        Output plain text content");
191         out.println("    -m  or --metadata    Output only metadata");
192         out.println();
193         out.println("Description:");
194         out.println("    Apache Tika will parse the file(s) specified on the");
195         out.println("    command line and output the extracted text content");
196         out.println("    or metadata to standard output.");
197         out.println();
198         out.println("    Instead of a file name you can also specify the URL");
199         out.println("    of a document to be parsed.");
200         out.println();
201         out.println("    If no file name or URL is specified (or the special");
202         out.println("    name \"-\" is used), then the standard input stream");
203         out.println("    is parsed.");
204         out.println();
205         out.println("    Use the \"--gui\" (or \"-g\") option to start");
206         out.println("    the Apache Tika GUI. You can drag and drop files");
207         out.println("    from a normal file explorer to the GUI window to");
208         out.println("    extract text content and metadata from the files.");
209     }
210 
211     /**
212      * Returns a {@link System#out} writer with the given output encoding.
213      *
214      * @see <a href="https://issues.apache.org/jira/browse/TIKA-277">TIKA-277</a>
215      * @param encoding output encoding,
216      *                 or <code>null</code> for the platform default
217      * @return {@link System#out} writer
218      * @throws UnsupportedEncodingException
219      *         if the configured encoding is not supported
220      */
221     private static Writer getSystemOutWriter(String encoding)
222             throws UnsupportedEncodingException {
223         if (encoding != null) {
224             return new OutputStreamWriter(System.out, encoding);
225         } else if (System.getProperty("os.name")
226                 .toLowerCase().startsWith("mac os x")) {
227             // TIKA-324: Override the default encoding on Mac OS X
228             return new OutputStreamWriter(System.out, "UTF-8");
229         } else {
230             return new OutputStreamWriter(System.out);
231         }
232     }
233 
234     /**
235      * Returns a transformer handler that serializes incoming SAX events
236      * to XHTML or HTML (depending the given method) using the given output
237      * encoding.
238      *
239      * @see <a href="https://issues.apache.org/jira/browse/TIKA-277">TIKA-277</a>
240      * @param method "xml" or "html"
241      * @param encoding output encoding,
242      *                 or <code>null</code> for the platform default
243      * @return {@link System#out} transformer handler
244      * @throws TransformerConfigurationException
245      *         if the transformer can not be created
246      */
247     private static TransformerHandler getTransformerHandler(
248             String method, String encoding)
249             throws TransformerConfigurationException {
250         SAXTransformerFactory factory = (SAXTransformerFactory)
251                 SAXTransformerFactory.newInstance();
252         TransformerHandler handler = factory.newTransformerHandler();
253         handler.getTransformer().setOutputProperty(OutputKeys.METHOD, method);
254         handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
255         if (encoding != null) {
256             handler.getTransformer().setOutputProperty(
257                     OutputKeys.ENCODING, encoding);
258         }
259         handler.setResult(new StreamResult(System.out));
260         return handler;
261     }
262 
263 }