View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.detect;
18  
19  import java.io.InputStream;
20  import java.io.UnsupportedEncodingException;
21  import java.net.URLDecoder;
22  import java.util.Map;
23  import java.util.regex.Pattern;
24  
25  import org.apache.tika.metadata.Metadata;
26  import org.apache.tika.mime.MediaType;
27  
28  /**
29   * Content type detection based on the resource name. An instance of this
30   * class contains a set of regular expression patterns that are matched
31   * against the resource name potentially given as a part of the input metadata.
32   * <p>
33   * If a pattern matches the given name, then the media type associated with
34   * that pattern is returned as the likely content type of the input document.
35   * Otherwise the returned type is <code>application/octet-stream</code>.
36   * <p>
37   * See the {@link #detect(InputStream, Metadata)} method for more details
38   * of the matching algorithm.
39   *
40   * @since Apache Tika 0.3
41   */
42  public class NameDetector implements Detector {
43  
44      /**
45       * The regular expression patterns used for type detection.
46       */
47      private final Map<Pattern, MediaType> patterns;
48  
49      /**
50       * Creates a new content type detector based on the given name patterns.
51       * The given pattern map is not copied, so the caller may update the
52       * mappings even after this detector instance has been created. However,
53       * the map <em>must not be concurrently modified</em> while this instance
54       * is used for type detection.
55       *
56       * @param patterns map from name patterns to corresponding media types
57       */
58      public NameDetector(Map<Pattern, MediaType> patterns) {
59          this.patterns = patterns;
60      }
61  
62      /**
63       * Detects the content type of an input document based on the document
64       * name given in the input metadata. The RESOURCE_NAME_KEY attribute of
65       * the given input metadata is expected to contain the name (normally
66       * a file name or a URL) of the input document.
67       * <p>
68       * If a resource name is given, then it is first processed as follows.
69       * <ol>
70       *   <li>
71       *     Potential URL query (?...) and fragment identifier (#...)
72       *     parts are removed from the end of the resource name.
73       *   </li>
74       *   <li>
75       *     Potential leading path elements (up to the last slash or backslash)
76       *     are removed from the beginning of the resource name.
77       *   </li>
78       *   <li>
79       *     Potential URL encodings (%nn, in UTF-8) are decoded.
80       *   </li>
81       *   <li>
82       *     Any leading and trailing whitespace is removed.
83       *   </li>
84       * </ol>
85       * <p>
86       * The resulting name string (if any) is then matched in sequence against
87       * all the configured name patterns. If a match is found, then the (first)
88       * matching media type is returned.
89       *
90       * @param input ignored
91       * @param metadata input metadata, possibly with a RESOURCE_NAME_KEY value
92       * @return detected media type, or <code>application/octet-stream</code>
93       */
94      public MediaType detect(InputStream input, Metadata metadata) {
95          // Look for a resource name in the input metadata
96          String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
97          if (name != null) {
98              // If the name is a URL, skip the trailing query and fragment parts
99              int question = name.indexOf('?');
100             if (question != -1) {
101                 name = name.substring(0, question);
102             }
103             int hash = name.indexOf('#');
104             if (hash != -1) {
105                 name = name.substring(0, hash);
106             }
107 
108             // If the name is a URL or a path, skip all but the last component
109             int slash = name.lastIndexOf('/');
110             if (slash != -1) {
111                 name = name.substring(slash + 1);
112             }
113             int backslash = name.lastIndexOf('\\');
114             if (backslash != -1) {
115                 name = name.substring(backslash + 1);
116             }
117 
118             // Decode any potential URL encoding
119             int percent = name.indexOf('%');
120             if (percent != -1) {
121                 try {
122                     name = URLDecoder.decode(name, "UTF-8");
123                 } catch (UnsupportedEncodingException e) {
124                     throw new IllegalStateException("UTF-8 not supported", e);
125                 }
126             }
127 
128             // Skip any leading or trailing whitespace
129             name = name.trim();
130             if (name.length() > 0) {
131                 // Match the name against the registered patterns
132                 for (Pattern pattern : patterns.keySet()) {
133                     if (pattern.matcher(name).matches()) {
134                         return patterns.get(pattern);
135                     }
136                 }
137             }
138         }
139 
140         return MediaType.OCTET_STREAM;
141     }
142 
143 }