View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.detect;
18  
19  import java.io.IOException;
20  import java.io.InputStream;
21  
22  import org.apache.tika.metadata.Metadata;
23  import org.apache.tika.mime.MediaType;
24  
25  /**
26   * Content type detection based on magic bytes, i.e. type-specific patterns
27   * near the beginning of the document input stream.
28   *
29   * @since Apache Tika 0.3
30   */
31  public class MagicDetector implements Detector {
32  
33      /**
34       * The matching media type. Returned by the
35       * {@link #detect(InputStream, Metadata)} method if a match is found.
36       */
37      private final MediaType type;
38  
39      /**
40       * Length of the comparison window. All the byte arrays here are this long.
41       */
42      private final int length;
43  
44      /**
45       * The magic match pattern. If this byte pattern is equal to the
46       * possibly bit-masked bytes from the input stream, then the type
47       * detection succeeds and the configured {@link #type} is returned.
48       */
49      private final byte[] pattern;
50  
51      /**
52       * Bit mask that is applied to the source bytes before pattern matching.
53       */
54      private final byte[] mask;
55  
56      /**
57       * First offset (inclusive) of the comparison window within the
58       * document input stream. Greater than or equal to zero.
59       */
60      private final int offsetRangeBegin;
61  
62      /**
63       * Last offset (inclusive) of the comparison window within the document
64       * input stream. Greater than or equal to the
65       * {@link #offsetRangeBegin first offset}.
66       * <p>
67       * Note that this is <em>not</em> the offset of the last byte read from
68       * the document stream. Instead, the last window of bytes to be compared
69       * starts at this offset.
70       */
71      private final int offsetRangeEnd;
72  
73      /**
74       * Creates a detector for input documents that have the exact given byte
75       * pattern at the beginning of the document stream.
76       *
77       * @param type matching media type
78       * @param pattern magic match pattern
79       */
80      public MagicDetector(MediaType type, byte[] pattern) {
81          this(type, pattern, 0);
82      }
83  
84      /**
85       * Creates a detector for input documents that have the exact given byte
86       * pattern at the given offset of the document stream.
87       *
88       * @param type matching media type
89       * @param pattern magic match pattern
90       * @param offset offset of the pattern match
91       */
92      public MagicDetector(MediaType type, byte[] pattern, int offset) {
93          this(type, pattern, null, offset, offset);
94      }
95  
96      /**
97       * Creates a detector for input documents that meet the specified
98       * magic match.
99       */
100     public MagicDetector(
101             MediaType type, byte[] pattern, byte[] mask,
102             int offsetRangeBegin, int offsetRangeEnd) {
103         if (type == null) {
104             throw new IllegalArgumentException("Matching media type is null");
105         } else if (pattern == null) {
106             throw new IllegalArgumentException("Magic match pattern is null");
107         } else if (offsetRangeBegin < 0
108                 || offsetRangeEnd < offsetRangeBegin) {
109             throw new IllegalArgumentException(
110                     "Invalid offset range: ["
111                     + offsetRangeBegin + "," + offsetRangeEnd + "]");
112         }
113 
114         this.type = type;
115 
116         this.length = Math.max(pattern.length, mask != null ? mask.length : 0);
117 
118         this.mask = new byte[length];
119         this.pattern = new byte[length];
120 
121         for (int i = 0; i < length; i++) {
122             if (mask != null && i < mask.length) {
123                 this.mask[i] = mask[i];
124             } else {
125                 this.mask[i] = -1;
126             }
127 
128             if (i < pattern.length) {
129                 this.pattern[i] = (byte) (pattern[i] & this.mask[i]);
130             } else {
131                 this.pattern[i] = 0;
132             }
133         }
134 
135         this.offsetRangeBegin = offsetRangeBegin;
136         this.offsetRangeEnd = offsetRangeEnd;
137     }
138 
139     /**
140      * 
141      * @param input document input stream, or <code>null</code>
142      * @param metadata ignored
143      */
144     public MediaType detect(InputStream input, Metadata metadata)
145             throws IOException {
146         if (input == null) {
147             return MediaType.OCTET_STREAM;
148         }
149 
150         input.mark(offsetRangeEnd + length);
151         try {
152             int offset = 0;
153 
154             // Skip bytes at the beginning, using skip() or read()
155             while (offset < offsetRangeBegin) {
156                 long n = input.skip(offsetRangeBegin - offset);
157                 if (n > 0) {
158                     offset += n;
159                 } else if (input.read() != -1) {
160                     offset += 1;
161                 } else {
162                     return MediaType.OCTET_STREAM;
163                 }
164             }
165 
166             // Fill in the comparison window
167             byte[] buffer =
168                 new byte[length + (offsetRangeEnd - offsetRangeBegin)];
169             int n = input.read(buffer);
170             if (n > 0) {
171                 offset += n;
172             }
173             while (n != -1 && offset < offsetRangeEnd + length) {
174                 int bufferOffset = offset - offsetRangeBegin;
175                 n = input.read(
176                         buffer, bufferOffset, buffer.length - bufferOffset);
177             }
178             if (offset < offsetRangeBegin + length) {
179                 return MediaType.OCTET_STREAM;
180             }
181 
182             // Loop until we've covered the entire offset range
183             for (int i = 0; i <= offsetRangeEnd - offsetRangeBegin; i++) {
184                 boolean match = true;
185                 for (int j = 0; match && j < length; j++) {
186                     match = (buffer[i + j] & mask[j]) == pattern[j];
187                 }
188                 if (match) {
189                     return type;
190                 }
191             }
192 
193             return MediaType.OCTET_STREAM;
194         } finally {
195             input.reset();
196         }
197     }
198 
199 }