View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.mime;
18  
19  // JDK imports
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.SortedSet;
import java.util.TreeSet;

import javax.xml.namespace.QName;

import org.apache.tika.detect.Detector;
import org.apache.tika.detect.XmlRootExtractor;
import org.apache.tika.metadata.Metadata;
38  
39  /**
40   * This class is a MimeType repository. It gathers a set of MimeTypes and
41   * enables to retrieves a content-type from its name, from a file name, or from
42   * a magic character sequence.
43   * <p>
44   * The MIME type detection methods that take an {@link InputStream} as
45   * an argument will never reads more than {@link #getMinLength()} bytes
46   * from the stream. Also the given stream is never
47   * {@link InputStream#close() closed}, {@link InputStream#mark(int) marked},
48   * or {@link InputStream#reset() reset} by the methods. Thus a client can
49   * use the {@link InputStream#markSupported() mark feature} of the stream
50   * (if available) to restore the stream back to the state it was before type
51   * detection if it wants to process the stream based on the detected type.
52   */
53  public final class MimeTypes implements Detector {
54  
55      /**
56       * Name of the {@link #rootMimeType root} type, application/octet-stream.
57       */
58      public static final String OCTET_STREAM = "application/octet-stream";
59  
60      /**
61       * Name of the {@link #textMimeType text} type, text/plain.
62       */
63      public static final String PLAIN_TEXT = "text/plain";
64      
65      /**
66       * Name of the {@link #xml xml} type, application/xml.
67       */
68      public static final String XML = "application/xml";
69  
70  
71      
72      /**
73       * Lookup table for all the ASCII/ISO-Latin/UTF-8/etc. control bytes
74       * in the range below 0x20 (the space character). If an entry in this
75       * table is <code>true</code> then that byte is very unlikely to occur
76       * in a plain text document.
77       * <p>
78       * The contents of this lookup table are based on the following definition
79       * from section 4 of the "Content-Type Processing Model" Internet-draft
80       * (<a href="http://webblaze.cs.berkeley.edu/2009/mime-sniff/mime-sniff.txt"
81       * >draft-abarth-mime-sniff-01</a>).
82       * <pre>
83       * +-------------------------+
84       * | Binary data byte ranges |
85       * +-------------------------+
86       * | 0x00 -- 0x08            |
87       * | 0x0B                    |
88       * | 0x0E -- 0x1A            |
89       * | 0x1C -- 0x1F            |
90       * +-------------------------+
91       * </pre>
92       *
93       * @see <a href="https://issues.apache.org/jira/browse/TIKA-154">TIKA-154</a>
94       */
95      private static final boolean[] IS_CONTROL_BYTE = new boolean[0x20];
96      static {
97          Arrays.fill(IS_CONTROL_BYTE, true);
98          IS_CONTROL_BYTE[0x09] = false; // tabulator
99          IS_CONTROL_BYTE[0x0A] = false; // new line
100         IS_CONTROL_BYTE[0x0C] = false; // new page
101         IS_CONTROL_BYTE[0x0D] = false; // carriage return
102         IS_CONTROL_BYTE[0x1B] = false; // escape
103     }
104 
105     /**
106      * Root type, application/octet-stream.
107      */
108     private final MimeType rootMimeType;
109 
110     /**
111      * Text type, text/plain.
112      */
113     private final MimeType textMimeType;
114 
115     /*
116      * xml type, application/xml
117      */
118     private final MimeType xmlMimeType;
119     
120     /** All the registered MimeTypes indexed on their name */
121     private final Map<String, MimeType> types = new HashMap<String, MimeType>();
122 
123     /** The patterns matcher */
124     private Patterns patterns = new Patterns();
125 
126     /** List of all registered magics */
127     private SortedSet<Magic> magics = new TreeSet<Magic>();
128 
129     /** List of all registered rootXML */
130     private SortedSet<MimeType> xmls = new TreeSet<MimeType>();
131 
132     private final XmlRootExtractor xmlRootExtractor;
133 
134     public MimeTypes() {
135         rootMimeType = new MimeType(this, OCTET_STREAM);
136         textMimeType = new MimeType(this, PLAIN_TEXT);
137         xmlMimeType = new MimeType(this, XML);
138         
139         try {
140             textMimeType.setSuperType(rootMimeType);
141             xmlMimeType.setSuperType(rootMimeType);
142         } catch (MimeTypeException e) {
143             throw new IllegalStateException("Error in MimeType logic", e);
144         }
145 
146         types.put(rootMimeType.getName(), rootMimeType);
147         types.put(textMimeType.getName(), textMimeType);
148         types.put(xmlMimeType.getName(), xmlMimeType);
149 
150         try {
151             xmlRootExtractor = new XmlRootExtractor();
152         } catch (Exception e) {
153             throw new IllegalStateException(
154                     "Unable to create a XmlRootExtractor", e);
155         }
156     }
157 
158     /**
159      * Find the Mime Content Type of a file.
160      *
161      * @param file
162      *            to analyze.
163      * @return the Mime Content Type of the specified file, or <code>null</code>
164      *         if none is found.
165      */
166     public MimeType getMimeType(File file) {
167         return getMimeType(file.getName());
168     }
169 
170     /**
171      * Find the Mime Content Type of a document from its URL.
172      *
173      * @param url
174      *            of the document to analyze.
175      * @return the Mime Content Type of the specified document URL, or
176      *         <code>null</code> if none is found.
177      */
178     public MimeType getMimeType(URL url) {
179         return getMimeType(url.getPath());
180     }
181 
182     /**
183      * Find the Mime Content Type of a document from its name.
184      * Returns application/octet-stream if no better match is found.
185      *
186      * @param name of the document to analyze.
187      * @return the Mime Content Type of the specified document name
188      */
189     public MimeType getMimeType(String name) {
190         MimeType type = patterns.matches(name);
191         if (type != null) {
192             return type;
193         }
194         type = patterns.matches(name.toLowerCase());
195         if (type != null) {
196             return type;
197         } else {
198             return rootMimeType;
199         }
200     }
201 
202     /**
203      * Returns the MIME type that best matches the given first few bytes
204      * of a document stream. Returns application/octet-stream if no better
205      * match is found.
206      * <p>
207      * The given byte array is expected to be at least {@link #getMinLength()}
208      * long, or shorter only if the document stream itself is shorter.
209      *
210      * @param data first few bytes of a document stream
211      * @return matching MIME type
212      */
213     public MimeType getMimeType(byte[] data) {
214         if (data == null) {
215             throw new IllegalArgumentException("Data is missing");
216         }
217 
218         // Then, check for magic bytes
219         MimeType result = null;
220         for (Magic magic : magics) {
221             if (magic.eval(data)) {
222                 result = magic.getType();
223                 break;
224             }
225         }
226         
227         if (result != null) {
228             // When detecting generic XML (or possibly XHTML),
229             // extract the root element and match it against known types
230             if ("application/xml".equals(result.getName())
231                     || "text/html".equals(result.getName())) {
232                 QName rootElement = xmlRootExtractor.extractRootElement(data);
233                 if (rootElement != null) {
234                     for (MimeType type : xmls) {
235                         if (type.matchesXML(
236                                 rootElement.getNamespaceURI(),
237                                 rootElement.getLocalPart())) {
238                             result = type;
239                             break;
240                         }
241                     }
242                 }
243             }
244             return result;
245         }
246 
247 
248         // Finally, assume plain text if no control bytes are found
249         for (int i = 0; i < data.length; i++) {
250             int b = data[i] & 0xFF; // prevent sign extension
251             if (b < IS_CONTROL_BYTE.length && IS_CONTROL_BYTE[b]) {
252                 return rootMimeType;
253             }
254         }
255         return textMimeType;
256     }
257 
258     /**
259      * Returns the MIME type that best matches the first few bytes of the
260      * given document stream.
261      *
262      * @see #getMimeType(byte[])
263      * @param stream document stream
264      * @return matching MIME type, or <code>null</code> if no match is found
265      * @throws IOException if the stream can be read
266      */
267     public MimeType getMimeType(InputStream stream) throws IOException {
268         return getMimeType(readMagicHeader(stream));
269     }
270 
271     /**
272      * Reads the first {@link #getMinLength()} bytes from the given stream.
273      * If the stream is shorter, then the entire content of the stream is
274      * returned.
275      * <p>
276      * The given stream is never {@link InputStream#close() closed},
277      * {@link InputStream#mark(int) marked}, or
278      * {@link InputStream#reset() reset} by this method.
279      *
280      * @param stream stream to be read
281      * @return first {@link #getMinLength()} (or fewer) bytes of the stream
282      * @throws IOException if the stream can not be read
283      */
284     private byte[] readMagicHeader(InputStream stream) throws IOException {
285         if (stream == null) {
286             throw new IllegalArgumentException("InputStream is missing");
287         }
288 
289         byte[] bytes = new byte[getMinLength()];
290         int totalRead = 0;
291 
292         int lastRead = stream.read(bytes);
293         while (lastRead != -1) {
294             totalRead += lastRead;
295             if (totalRead == bytes.length) {
296                 return bytes;
297             }
298             lastRead = stream.read(bytes, totalRead, bytes.length - totalRead);
299         }
300 
301         byte[] shorter = new byte[totalRead];
302         System.arraycopy(bytes, 0, shorter, 0, totalRead);
303         return shorter;
304     }
305 
306     public String getType(String typeName, String url, byte[] data) {
307         try {
308             Metadata metadata = new Metadata();
309             if (url != null) {
310                 metadata.set(Metadata.RESOURCE_NAME_KEY, url);
311             }
312             if (typeName != null) {
313                 metadata.set(Metadata.CONTENT_TYPE, typeName);
314             }
315             return detect(new ByteArrayInputStream(data), metadata).toString();
316         } catch (IOException e) {
317             throw new IllegalStateException(
318                     "ByteArrayInputStream throws an IOException!", e);
319         }
320     }
321 
322     /**
323      * Determines the MIME type of the resource pointed to by the specified URL.
324      * Examines the file's header, and if it cannot determine the MIME type
325      * from the header, guesses the MIME type from the URL extension
326      * (e.g. "pdf).
327      *
328      * @param url URL of the document
329      * @return type of the document
330      * @throws IOException if the document can not be accessed
331      */
332     public String getType(URL url) throws IOException {
333         InputStream stream = url.openStream();
334         try {
335             Metadata metadata = new Metadata();
336             metadata.set(Metadata.RESOURCE_NAME_KEY, url.toString());
337             return detect(stream, metadata).toString();
338         } finally {
339             stream.close();
340         }
341     }
342 
343     /**
344      * Find the Mime Content Type of a document from its name and its content.
345      * The policy used to guess the Mime Content Type is:
346      * <ol>
347      * <li>Try to find the type based on the provided data.</li>
348      * <li>If a type is found, then return it, otherwise try to find the type
349      * based on the file name</li>
350      * </ol>
351      *
352      * @param name
353      *            of the document to analyze.
354      * @param data
355      *            are the first bytes of the document's content.
356      * @return the Mime Content Type of the specified document, or
357      *         <code>null</code> if none is found.
358      * @see #getMinLength()
359      */
360     public MimeType getMimeType(String name, byte[] data) {
361         // First, try to get the mime-type from the content
362         MimeType mimeType = getMimeType(data);
363 
364         // If no mime-type found, then try to get the mime-type from
365         // the document name
366         if (mimeType == null) {
367             mimeType = getMimeType(name);
368         }
369 
370         return mimeType;
371     }
372 
373     /**
374      * Returns the MIME type that best matches the given document name and
375      * the first few bytes of the given document stream.
376      *
377      * @see #getMimeType(String, byte[])
378      * @param name document name
379      * @param stream document stream
380      * @return matching MIME type, or <code>null</code> if no match is found
381      * @throws IOException if the stream can not be read
382      */
383     public MimeType getMimeType(String name, InputStream stream)
384             throws IOException {
385         return getMimeType(name, readMagicHeader(stream));
386     }
387 
388     /**
389      * Returns the registered media type with the given name (or alias).
390      * The named media type is automatically registered (and returned) if
391      * it doesn't already exist.
392      *
393      * @param name media type name (case-insensitive)
394      * @return the registered media type with the given name or alias
395      * @throws MimeTypeException if the given media type name is invalid
396      */
397     public synchronized MimeType forName(String name)
398             throws MimeTypeException {
399         if (MimeType.isValid(name)) {
400             name = name.toLowerCase();
401             MimeType type = types.get(name);
402             if (type == null) {
403                 type = new MimeType(this, name);
404                 if (name.startsWith("text/")) {
405                     type.setSuperType(textMimeType);
406                 } else if (name.endsWith("+xml")) {
407                 	type.setSuperType(xmlMimeType);
408                 } else {
409                     type.setSuperType(rootMimeType);
410                 }
411                 types.put(name, type);
412             }
413             return type;
414         } else {
415             throw new MimeTypeException("Invalid media type name: " + name);
416         }
417     }
418 
419     /**
420      * Adds an alias for the given media type. This method should only
421      * be called from {@link MimeType#addAlias(String)}.
422      *
423      * @param type media type
424      * @param alias media type alias (normalized to lower case)
425      * @throws MimeTypeException if the alias already exists
426      */
427     synchronized void addAlias(MimeType type, String alias)
428             throws MimeTypeException {
429         if (!types.containsKey(alias)) {
430             types.put(alias, type);
431         } else {
432             throw new MimeTypeException(
433                     "Media type alias already exists: " + alias);
434         }
435     }
436 
437     /**
438      * Adds a file name pattern for the given media type. Assumes that the
439      * pattern being added is <b>not</b> a JDK standard regular expression.
440      *
441      * @param type
442      *            media type
443      * @param pattern
444      *            file name pattern
445      * @throws MimeTypeException
446      *             if the pattern conflicts with existing ones
447      */
448     public void addPattern(MimeType type, String pattern)
449             throws MimeTypeException {
450         this.addPattern(type, pattern, false);
451     }
452 
453     /**
454      * Adds a file name pattern for the given media type. The caller can specify
455      * whether the pattern being added <b>is</b> or <b>is not</b> a JDK standard
456      * regular expression via the <code>isRegex</code> parameter. If the value
457      * is set to true, then a JDK standard regex is assumed, otherwise the
458      * freedesktop glob type is assumed.
459      *
460      * @param type
461      *            media type
462      * @param pattern
463      *            file name pattern
464      * @param isRegex
465      *            set to true if JDK std regexs are desired, otherwise set to
466      *            false.
467      * @throws MimeTypeException
468      *             if the pattern conflicts with existing ones.
469      *
470      */
471     public void addPattern(MimeType type, String pattern, boolean isRegex)
472             throws MimeTypeException {
473         patterns.add(pattern, isRegex, type);
474     }
475 
476     /**
477      * Return the minimum length of data to provide to analyzing methods based
478      * on the document's content in order to check all the known MimeTypes.
479      *
480      * @return the minimum length of data to provide.
481      * @see #getMimeType(byte[])
482      * @see #getMimeType(String, byte[])
483      */
484     public int getMinLength() {
485         // This needs to be reasonably large to be able to correctly detect
486         // things like XML root elements after initial comment and DTDs
487         return 8 * 1024;
488     }
489 
490     /**
491      * Add the specified mime-type in the repository.
492      *
493      * @param type
494      *            is the mime-type to add.
495      */
496     void add(MimeType type) {
497         // Update the magics index...
498         if (type.hasMagic()) {
499             magics.addAll(Arrays.asList(type.getMagics()));
500         }
501 
502         // Update the xml (xmlRoot) index...
503         if (type.hasRootXML()) {
504             xmls.add(type);
505         }
506     }
507 
508     /**
509      * Automatically detects the MIME type of a document based on magic
510      * markers in the stream prefix and any given metadata hints.
511      * <p>
512      * The given stream is expected to support marks, so that this method
513      * can reset the stream to the position it was in before this method
514      * was called.
515      *
516      * @param input document stream, or <code>null</code>
517      * @param metadata metadata hints
518      * @return MIME type of the document
519      * @throws IOException if the document stream could not be read
520      */
521     public MediaType detect(InputStream input, Metadata metadata)
522             throws IOException {
523         MimeType type = rootMimeType;
524 
525         // Get type based on magic prefix
526         if (input != null) {
527             input.mark(getMinLength());
528             try {
529                 byte[] prefix = readMagicHeader(input);
530                 type = getMimeType(prefix);
531             } finally {
532                 input.reset();
533             }
534         }
535 
536         // Get type based on resourceName hint (if available)
537         String resourceName = metadata.get(Metadata.RESOURCE_NAME_KEY);
538         if (resourceName != null) {
539             String name = null;
540 
541             // Deal with a URI or a path name in as the resource  name
542             try {
543                 URI uri = new URI(resourceName);
544                 String path = uri.getPath();
545                 if (path != null) {
546                     int slash = path.lastIndexOf('/');
547                     if (slash + 1 < path.length()) {
548                         name = path.substring(slash + 1);
549                     }
550                 }
551             } catch (URISyntaxException e) {
552                 name = resourceName;
553             }
554 
555             if (name != null) {
556                 MimeType hint = getMimeType(name);
557                 if (hint.isDescendantOf(type)) {
558                     type = hint;
559                 }
560             }
561         }
562 
563         // Get type based on metadata hint (if available)
564         String typeName = metadata.get(Metadata.CONTENT_TYPE);
565         if (typeName != null) {
566             try {
567                 MimeType hint = forName(typeName);
568                 if (hint.isDescendantOf(type)) {
569                     type = hint;
570                 }
571             } catch (MimeTypeException e) {
572                 // Malformed type name, ignore
573             }
574         }
575 
576         return MediaType.parse(type.getName());
577     }
578 
579 }