View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.detect;
18  
19  import java.io.IOException;
20  import java.io.InputStream;
21  import java.util.Arrays;
22  
23  import org.apache.tika.metadata.Metadata;
24  import org.apache.tika.mime.MediaType;
25  
26  /**
27   * Content type detection of plain text documents. This detector looks at the
28   * beginning of the document input stream and considers the document to be
29   * a text document if no ASCII (ISO-Latin-1, UTF-8, etc.) control bytes are
30   * found.
31   * <p>
32   * Note that text documents with a character encoding like UTF-16 are better
33   * detected with {@link MagicDetector} and an appropriate magic byte pattern.
34   *
35   * @since Apache Tika 0.3
36   */
37  public class TextDetector implements Detector {
38  
39      /**
40       * The number of bytes from the beginning of the document stream
41       * to test for control bytes.
42       */
43      private static final int NUMBER_OF_BYTES_TO_TEST = 512;
44  
45      /**
46       * Lookup table for all the ASCII/ISO-Latin/UTF-8/etc. control bytes
47       * in the range below 0x20 (the space character). If an entry in this
48       * table is <code>true</code> then that byte is very unlikely to occur
49       * in a plain text document.
50       * <p>
51       * The contents of this lookup table are based on the following definition
52       * from section 4 of the "Content-Type Processing Model" Internet-draft
53       * (<a href="http://webblaze.cs.berkeley.edu/2009/mime-sniff/mime-sniff.txt"
54       * >draft-abarth-mime-sniff-01</a>).
55       * <pre>
56       * +-------------------------+
57       * | Binary data byte ranges |
58       * +-------------------------+
59       * | 0x00 -- 0x08            |
60       * | 0x0B                    |
61       * | 0x0E -- 0x1A            |
62       * | 0x1C -- 0x1F            |
63       * +-------------------------+
64       * </pre>
65       *
66       * @see <a href="https://issues.apache.org/jira/browse/TIKA-154">TIKA-154</a>
67       */
68      private static final boolean[] IS_CONTROL_BYTE = new boolean[0x20];
69  
70      static {
71          Arrays.fill(IS_CONTROL_BYTE, true);
72          IS_CONTROL_BYTE[0x09] = false; // tabulator
73          IS_CONTROL_BYTE[0x0A] = false; // new line
74          IS_CONTROL_BYTE[0x0C] = false; // new page
75          IS_CONTROL_BYTE[0x0D] = false; // carriage return
76          IS_CONTROL_BYTE[0x1B] = false; // escape
77      }
78  
79      /**
80       * Looks at the beginning of the document input stream to determine
81       * whether the document is text or not.
82       *
83       * @param input document input stream, or <code>null</code>
84       * @param metadata ignored
85       * @return "text/plain" if the input stream suggest a text document,
86       *         "application/octet-stream" otherwise
87       */
88      public MediaType detect(InputStream input, Metadata metadata)
89              throws IOException {
90          if (input == null) {
91              return MediaType.OCTET_STREAM;
92          }
93  
94          input.mark(NUMBER_OF_BYTES_TO_TEST);
95          try {
96              for (int i = 0; i < NUMBER_OF_BYTES_TO_TEST; i++) {
97                  int ch = input.read();
98                  if (ch == -1) {
99                      return MediaType.TEXT_PLAIN;
100                 } else if (ch < IS_CONTROL_BYTE.length && IS_CONTROL_BYTE[ch]) {
101                     return MediaType.OCTET_STREAM;
102                 }
103             }
104             return MediaType.TEXT_PLAIN;
105         } finally {
106             input.reset();
107         }
108     }
109 
110 }