/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.detect;

import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.util.Map;
import java.util.regex.Pattern;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;

/**
 * Content type detection based on the resource name. An instance of this
 * class contains a set of regular expression patterns that are matched
 * against the resource name potentially given as a part of the input metadata.
 * <p>
 * If a pattern matches the given name, then the media type associated with
 * that pattern is returned as the likely content type of the input document.
 * Otherwise the returned type is <code>application/octet-stream</code>.
 * <p>
 * See the {@link #detect(InputStream, Metadata)} method for more details
 * of the matching algorithm.
 *
 * @since Apache Tika 0.3
 */
public class NameDetector implements Detector {

    /**
     * The regular expression patterns used for type detection.
     */
    private final Map<Pattern, MediaType> patterns;

    /**
     * Creates a new content type detector based on the given name patterns.
     * The given pattern map is not copied, so the caller may update the
     * mappings even after this detector instance has been created. However,
     * the map <em>must not be concurrently modified</em> while this instance
     * is used for type detection.
     *
     * @param patterns map from name patterns to corresponding media types
     */
    public NameDetector(Map<Pattern, MediaType> patterns) {
        this.patterns = patterns;
    }

    /**
     * Detects the content type of an input document based on the document
     * name given in the input metadata. The RESOURCE_NAME_KEY attribute of
     * the given input metadata is expected to contain the name (normally
     * a file name or a URL) of the input document.
     * <p>
     * If a resource name is given, then it is first processed as follows.
     * <ol>
     *   <li>
     *     Potential URL query (?...) and fragment identifier (#...)
     *     parts are removed from the end of the resource name.
     *   </li>
     *   <li>
     *     Potential leading path elements (up to the last slash or backslash)
     *     are removed from the beginning of the resource name.
     *   </li>
     *   <li>
     *     Potential URL encodings (%nn, in UTF-8) are decoded. If the name
     *     contains a percent sign that is not part of a well-formed escape
     *     sequence, the name is used as-is without any decoding.
     *   </li>
     *   <li>
     *     Any leading and trailing whitespace is removed.
     *   </li>
     * </ol>
     * <p>
     * The resulting name string (if any) is then matched in sequence against
     * all the configured name patterns. If a match is found, then the (first)
     * matching media type is returned.
     *
     * @param input ignored
     * @param metadata input metadata, possibly with a RESOURCE_NAME_KEY value
     * @return detected media type, or <code>application/octet-stream</code>
     */
    public MediaType detect(InputStream input, Metadata metadata) {
        // Look for a resource name in the input metadata
        String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
        if (name == null) {
            return MediaType.OCTET_STREAM;
        }

        // If the name is a URL, skip the trailing query and fragment parts
        int question = name.indexOf('?');
        if (question != -1) {
            name = name.substring(0, question);
        }
        int hash = name.indexOf('#');
        if (hash != -1) {
            name = name.substring(0, hash);
        }

        // If the name is a URL or a path, skip all but the last component
        int slash = name.lastIndexOf('/');
        if (slash != -1) {
            name = name.substring(slash + 1);
        }
        int backslash = name.lastIndexOf('\\');
        if (backslash != -1) {
            name = name.substring(backslash + 1);
        }

        // Decode any potential URL encoding. The decoder is only invoked
        // when a percent sign is present, so plain names are left untouched.
        if (name.indexOf('%') != -1) {
            try {
                name = URLDecoder.decode(name, "UTF-8");
            } catch (UnsupportedEncodingException e) {
                throw new IllegalStateException("UTF-8 not supported", e);
            } catch (IllegalArgumentException e) {
                // The percent sign was not part of a valid %nn escape
                // (for example a plain file name like "50%.txt"). Such a
                // name is not URL-encoded, so keep it unchanged instead
                // of failing the whole detection.
            }
        }

        // Skip any leading or trailing whitespace
        name = name.trim();
        if (name.length() > 0) {
            // Match the name against the registered patterns; iterating
            // over the entries avoids a second map lookup per match.
            for (Map.Entry<Pattern, MediaType> entry : patterns.entrySet()) {
                if (entry.getKey().matcher(name).matches()) {
                    return entry.getValue();
                }
            }
        }

        return MediaType.OCTET_STREAM;
    }

}