1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 package org.apache.tika.detect; 18 19 import java.io.IOException; 20 import java.io.InputStream; 21 import java.util.Arrays; 22 23 import org.apache.tika.metadata.Metadata; 24 import org.apache.tika.mime.MediaType; 25 26 /** 27 * Content type detection of plain text documents. This detector looks at the 28 * beginning of the document input stream and considers the document to be 29 * a text document if no ASCII (ISO-Latin-1, UTF-8, etc.) control bytes are 30 * found. 31 * <p> 32 * Note that text documents with a character encoding like UTF-16 are better 33 * detected with {@link MagicDetector} and an appropriate magic byte pattern. 34 * 35 * @since Apache Tika 0.3 36 */ 37 public class TextDetector implements Detector { 38 39 /** 40 * The number of bytes from the beginning of the document stream 41 * to test for control bytes. 42 */ 43 private static final int NUMBER_OF_BYTES_TO_TEST = 512; 44 45 /** 46 * Lookup table for all the ASCII/ISO-Latin/UTF-8/etc. control bytes 47 * in the range below 0x20 (the space character). If an entry in this 48 * table is <code>true</code> then that byte is very unlikely to occur 49 * in a plain text document. 50 * <p> 51 * The contents of this lookup table are based on the following definition 52 * from section 4 of the "Content-Type Processing Model" Internet-draft 53 * (<a href="http://webblaze.cs.berkeley.edu/2009/mime-sniff/mime-sniff.txt" 54 * >draft-abarth-mime-sniff-01</a>). 55 * <pre> 56 * +-------------------------+ 57 * | Binary data byte ranges | 58 * +-------------------------+ 59 * | 0x00 -- 0x08 | 60 * | 0x0B | 61 * | 0x0E -- 0x1A | 62 * | 0x1C -- 0x1F | 63 * +-------------------------+ 64 * </pre> 65 * 66 * @see <a href="https://issues.apache.org/jira/browse/TIKA-154">TIKA-154</a> 67 */ 68 private static final boolean[] IS_CONTROL_BYTE = new boolean[0x20]; 69 70 static { 71 Arrays.fill(IS_CONTROL_BYTE, true); 72 IS_CONTROL_BYTE[0x09] = false; // tabulator 73 IS_CONTROL_BYTE[0x0A] = false; // new line 74 IS_CONTROL_BYTE[0x0C] = false; // new page 75 IS_CONTROL_BYTE[0x0D] = false; // carriage return 76 IS_CONTROL_BYTE[0x1B] = false; // escape 77 } 78 79 /** 80 * Looks at the beginning of the document input stream to determine 81 * whether the document is text or not. 82 * 83 * @param input document input stream, or <code>null</code> 84 * @param metadata ignored 85 * @return "text/plain" if the input stream suggest a text document, 86 * "application/octet-stream" otherwise 87 */ 88 public MediaType detect(InputStream input, Metadata metadata) 89 throws IOException { 90 if (input == null) { 91 return MediaType.OCTET_STREAM; 92 } 93 94 input.mark(NUMBER_OF_BYTES_TO_TEST); 95 try { 96 for (int i = 0; i < NUMBER_OF_BYTES_TO_TEST; i++) { 97 int ch = input.read(); 98 if (ch == -1) { 99 return MediaType.TEXT_PLAIN; 100 } else if (ch < IS_CONTROL_BYTE.length && IS_CONTROL_BYTE[ch]) { 101 return MediaType.OCTET_STREAM; 102 } 103 } 104 return MediaType.TEXT_PLAIN; 105 } finally { 106 input.reset(); 107 } 108 } 109 110 }