1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.detect;
18  
19  import java.io.ByteArrayInputStream;
20  import java.io.IOException;
21  import java.io.InputStream;
22  
23  import junit.framework.TestCase;
24  
25  import org.apache.tika.metadata.Metadata;
26  import org.apache.tika.mime.MediaType;
27  
28  /**
29   * Test cases for the {@link MagicDetector} class.
30   */
31  public class MagicDetectorTest extends TestCase {
32  
33      public void testDetectNull() throws Exception {
34          MediaType html = new MediaType("text", "html");
35          Detector detector = new MagicDetector(html, "<html".getBytes("ASCII"));
36          assertEquals(
37                  MediaType.OCTET_STREAM,
38                  detector.detect(null, new Metadata()));
39      }
40  
41      public void testDetectSimple() throws Exception {
42          MediaType html = new MediaType("text", "html");
43          Detector detector = new MagicDetector(html, "<html".getBytes("ASCII"));
44  
45          assertDetect(detector, html, "<html");
46          assertDetect(detector, html, "<html><head/><body/></html>");
47          assertDetect(detector, MediaType.OCTET_STREAM, "<HTML");
48          assertDetect(detector, MediaType.OCTET_STREAM, "<?xml?><html");
49          assertDetect(detector, MediaType.OCTET_STREAM, " <html");
50          assertDetect(detector, MediaType.OCTET_STREAM, "");
51      }
52  
53      public void testDetectOffsetRange() throws Exception {
54          MediaType html = new MediaType("text", "html");
55          Detector detector = new MagicDetector(
56                  html, "<html".getBytes("ASCII"), null, 0, 64);
57  
58          assertDetect(detector, html, "<html");
59          assertDetect(detector, html, "<html><head/><body/></html>");
60          assertDetect(detector, html, "<?xml?><html/>");
61          assertDetect(detector, html, "\n    <html");
62          assertDetect(detector, html, "\u0000<html");
63          assertDetect(detector, MediaType.OCTET_STREAM, "<htm");
64          assertDetect(detector, MediaType.OCTET_STREAM, " html");
65          assertDetect(detector, MediaType.OCTET_STREAM, "<HTML");
66  
67          assertDetect(detector, html,
68                  "0........1.........2.........3.........4.........5.........6"
69                  + "1234<html");
70          assertDetect(detector, MediaType.OCTET_STREAM,
71                  "0........1.........2.........3.........4.........5.........6"
72                  + "12345<html");
73  
74          assertDetect(detector, MediaType.OCTET_STREAM, "");
75  }
76  
77      public void testDetectMask() throws Exception {
78          MediaType html = new MediaType("text", "html");
79          byte up = (byte) 0xdf;
80          Detector detector = new MagicDetector(
81                  html,
82                  new byte[] { '<',  'H',  'T',  'M',  'L' },
83                  new byte[] { (byte) 0xff, up, up, up, up },
84                  0, 64);
85  
86          assertDetect(detector, html, "<html");
87          assertDetect(detector, html, "<HTML><head/><body/></html>");
88          assertDetect(detector, html, "<?xml?><HtMl/>");
89          assertDetect(detector, html, "\n    <html");
90          assertDetect(detector, html, "\u0000<HTML");
91          assertDetect(detector, MediaType.OCTET_STREAM, "<htm");
92          assertDetect(detector, MediaType.OCTET_STREAM, " html");
93  
94          assertDetect(detector, html,
95                  "0        1         2         3         4         5         6"
96                  + "1234<html");
97          assertDetect(detector, MediaType.OCTET_STREAM,
98                  "0        1         2         3         4         5         6"
99                  + "12345<html");
100 
101         assertDetect(detector, MediaType.OCTET_STREAM, "");
102     }
103 
104     private void assertDetect(Detector detector, MediaType type, String data) {
105         try {
106             byte[] bytes = data.getBytes("ASCII");
107             InputStream stream = new ByteArrayInputStream(bytes);
108             assertEquals(type, detector.detect(stream, new Metadata()));
109 
110             // Test that the stream has been reset
111             for (int i = 0; i < bytes.length; i++) {
112                 assertEquals(bytes[i], (byte) stream.read());
113             }
114             assertEquals(-1, stream.read());
115         } catch (IOException e) {
116             fail("Unexpected exception from MagicDetector");
117         }
118     }
119 
120 }