1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.tika.cli;
18
19 import java.io.File;
20 import java.io.InputStream;
21 import java.io.OutputStreamWriter;
22 import java.io.PrintStream;
23 import java.io.PrintWriter;
24 import java.io.UnsupportedEncodingException;
25 import java.io.Writer;
26 import java.net.URL;
27 import java.util.Arrays;
28
29 import javax.xml.transform.OutputKeys;
30 import javax.xml.transform.TransformerConfigurationException;
31 import javax.xml.transform.sax.SAXTransformerFactory;
32 import javax.xml.transform.sax.TransformerHandler;
33 import javax.xml.transform.stream.StreamResult;
34
35 import org.apache.log4j.BasicConfigurator;
36 import org.apache.log4j.Level;
37 import org.apache.log4j.Logger;
38 import org.apache.log4j.SimpleLayout;
39 import org.apache.log4j.WriterAppender;
40 import org.apache.tika.gui.TikaGUI;
41 import org.apache.tika.metadata.Metadata;
42 import org.apache.tika.metadata.MetadataHelper;
43 import org.apache.tika.parser.AutoDetectParser;
44 import org.apache.tika.parser.ParseContext;
45 import org.apache.tika.parser.Parser;
46 import org.apache.tika.sax.BodyContentHandler;
47 import org.xml.sax.ContentHandler;
48 import org.xml.sax.helpers.DefaultHandler;
49
50
51
52
53 public class TikaCLI {
54
55 public static void main(String[] args) throws Exception {
56 BasicConfigurator.configure(
57 new WriterAppender(new SimpleLayout(), System.err));
58 Logger.getRootLogger().setLevel(Level.INFO);
59
60 TikaCLI cli = new TikaCLI();
61 for (int i = 0; i < args.length; i++) {
62 cli.process(args[i]);
63 }
64 if (cli.pipeMode) {
65 cli.process("-");
66 }
67 }
68
69 private interface OutputType {
70 ContentHandler getContentHandler() throws Exception;
71 }
72
73 private final OutputType XML = new OutputType() {
74 public ContentHandler getContentHandler() throws Exception {
75 return getTransformerHandler("xml", encoding);
76 }
77 };
78
79 private final OutputType HTML = new OutputType() {
80 public ContentHandler getContentHandler() throws Exception {
81 return getTransformerHandler("html", encoding);
82 }
83 };
84
85 private final OutputType TEXT = new OutputType() {
86 public ContentHandler getContentHandler() throws Exception {
87 return new BodyContentHandler(getSystemOutWriter(encoding));
88 }
89 };
90
91 private final OutputType METADATA = new OutputType() {
92 public ContentHandler getContentHandler() throws Exception {
93 final PrintWriter writer =
94 new PrintWriter(getSystemOutWriter(encoding));
95 return new DefaultHandler() {
96 public void endDocument() {
97 String[] names = metadata.names();
98 Arrays.sort(names);
99 for (String name : names) {
100 writer.println(name + ": " + metadata.get(name));
101 }
102
103 writer.flush();
104 }
105 };
106 }
107 };
108
109 private ParseContext context;
110
111 private Parser parser;
112
113 private Metadata metadata;
114
115 private OutputType type = XML;
116
117
118
119
120 private String encoding = null;
121
122 private boolean pipeMode = true;
123
124 public TikaCLI() throws TransformerConfigurationException {
125 context = new ParseContext();
126 parser = new AutoDetectParser();
127 context.set(Parser.class, parser);
128 }
129
130 public void process(String arg) throws Exception {
131 if (arg.equals("-?") || arg.equals("--help")) {
132 pipeMode = false;
133 usage();
134 } else if (arg.equals("-v") || arg.equals("--verbose")) {
135 Logger.getRootLogger().setLevel(Level.DEBUG);
136 } else if (arg.equals("-g") || arg.equals("--gui")) {
137 pipeMode = false;
138 TikaGUI.main(new String[0]);
139 } else if (arg.startsWith("-e")) {
140 encoding = arg.substring("-e".length());
141 } else if (arg.startsWith("--encoding=")) {
142 encoding = arg.substring("--encoding=".length());
143 } else if (arg.equals("-x") || arg.equals("--xml")) {
144 type = XML;
145 } else if (arg.equals("-h") || arg.equals("--html")) {
146 type = HTML;
147 } else if (arg.equals("-t") || arg.equals("--text")) {
148 type = TEXT;
149 } else if (arg.equals("-m") || arg.equals("--metadata")) {
150 type = METADATA;
151 } else {
152 pipeMode = false;
153 metadata = new Metadata();
154 if (arg.equals("-")) {
155 parser.parse(
156 System.in, type.getContentHandler(),
157 metadata, context);
158 } else {
159 URL url;
160 File file = new File(arg);
161 if (file.isFile()) {
162 url = file.toURI().toURL();
163 } else {
164 url = new URL(arg);
165 }
166 InputStream input =
167 MetadataHelper.getInputStream(url, metadata);
168 try {
169 parser.parse(
170 input, type.getContentHandler(),
171 metadata, context);
172 } finally {
173 input.close();
174 }
175 }
176 }
177 }
178
179 private void usage() {
180 PrintStream out = System.out;
181 out.println("usage: tika [option] [file]");
182 out.println();
183 out.println("Options:");
184 out.println(" -? or --help Print this usage message");
185 out.println(" -v or --verbose Print debug level messages");
186 out.println(" -g or --gui Start the Apache Tika GUI");
187 out.println(" -eX or --encoding=X Use output encoding X");
188 out.println(" -x or --xml Output XHTML content (default)");
189 out.println(" -h or --html Output HTML content");
190 out.println(" -t or --text Output plain text content");
191 out.println(" -m or --metadata Output only metadata");
192 out.println();
193 out.println("Description:");
194 out.println(" Apache Tika will parse the file(s) specified on the");
195 out.println(" command line and output the extracted text content");
196 out.println(" or metadata to standard output.");
197 out.println();
198 out.println(" Instead of a file name you can also specify the URL");
199 out.println(" of a document to be parsed.");
200 out.println();
201 out.println(" If no file name or URL is specified (or the special");
202 out.println(" name \"-\" is used), then the standard input stream");
203 out.println(" is parsed.");
204 out.println();
205 out.println(" Use the \"--gui\" (or \"-g\") option to start");
206 out.println(" the Apache Tika GUI. You can drag and drop files");
207 out.println(" from a normal file explorer to the GUI window to");
208 out.println(" extract text content and metadata from the files.");
209 }
210
211
212
213
214
215
216
217
218
219
220
221 private static Writer getSystemOutWriter(String encoding)
222 throws UnsupportedEncodingException {
223 if (encoding != null) {
224 return new OutputStreamWriter(System.out, encoding);
225 } else if (System.getProperty("os.name")
226 .toLowerCase().startsWith("mac os x")) {
227
228 return new OutputStreamWriter(System.out, "UTF-8");
229 } else {
230 return new OutputStreamWriter(System.out);
231 }
232 }
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247 private static TransformerHandler getTransformerHandler(
248 String method, String encoding)
249 throws TransformerConfigurationException {
250 SAXTransformerFactory factory = (SAXTransformerFactory)
251 SAXTransformerFactory.newInstance();
252 TransformerHandler handler = factory.newTransformerHandler();
253 handler.getTransformer().setOutputProperty(OutputKeys.METHOD, method);
254 handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
255 if (encoding != null) {
256 handler.getTransformer().setOutputProperty(
257 OutputKeys.ENCODING, encoding);
258 }
259 handler.setResult(new StreamResult(System.out));
260 return handler;
261 }
262
263 }