View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.parser.txt;
18  
19  import java.io.BufferedInputStream;
20  import java.io.BufferedReader;
21  import java.io.IOException;
22  import java.io.InputStream;
23  import java.io.InputStreamReader;
24  import java.io.Reader;
25  import java.io.UnsupportedEncodingException;
26  import java.nio.charset.Charset;
27  import java.util.Collections;
28  import java.util.Set;
29  
30  import org.apache.tika.exception.TikaException;
31  import org.apache.tika.metadata.Metadata;
32  import org.apache.tika.mime.MediaType;
33  import org.apache.tika.parser.ParseContext;
34  import org.apache.tika.parser.Parser;
35  import org.apache.tika.sax.XHTMLContentHandler;
36  import org.xml.sax.ContentHandler;
37  import org.xml.sax.SAXException;
38  
39  /**
40   * Plain text parser. The text encoding of the document stream is
41   * automatically detected based on the byte patterns found at the
42   * beginning of the stream. The input metadata key
43   * {@link org.apache.tika.metadata.HttpHeaders#CONTENT_ENCODING} is used
44   * as an encoding hint if the automatic encoding detection fails.
45   * <p>
46   * This parser sets the following output metadata entries:
47   * <dl>
48   *   <dt>{@link org.apache.tika.metadata.HttpHeaders#CONTENT_TYPE}</dt>
49   *   <dd><code>text/plain</code></dd>
50   *   <dt>{@link org.apache.tika.metadata.HttpHeaders#CONTENT_ENCODING}</dt>
51   *   <dd>The detected text encoding of the document.</dd>
52   *   <dt>
53   *     {@link org.apache.tika.metadata.HttpHeaders#CONTENT_LANGUAGE} and
54   *     {@link org.apache.tika.metadata.DublinCore#LANGUAGE}
55   *   </dt>
56   *   <dd>
57   *     The default language of the detected encoding. Only set if the
58   *     detected encoding is associated with some specific language
59   *     (for example KOI8-R with Russian or SJIS with Japanese).
60   *   </dd>
61   * </dl>
62   */
63  public class TXTParser implements Parser {
64  
65      private static final Set<MediaType> SUPPORTED_TYPES =
66          Collections.singleton(MediaType.TEXT_PLAIN);
67  
68      public Set<MediaType> getSupportedTypes(ParseContext context) {
69          return SUPPORTED_TYPES;
70      }
71  
72      public void parse(
73              InputStream stream, ContentHandler handler,
74              Metadata metadata, ParseContext context)
75      throws IOException, SAXException, TikaException {
76  
77          // CharsetDetector expects a stream to support marks
78          if (!stream.markSupported()) {
79              stream = new BufferedInputStream(stream);
80          }
81  
82          // Detect the content encoding (the stream is reset to the beginning)
83          CharsetDetector detector = new CharsetDetector();
84          String incomingCharset = metadata.get(Metadata.CONTENT_ENCODING);
85          String incomingType = metadata.get(Metadata.CONTENT_TYPE);
86          if (incomingCharset == null && incomingType != null) {
87              // TIKA-341: Use charset in content-type
88              MediaType mt = MediaType.parse(incomingType);
89              if (mt != null) {
90                  incomingCharset = mt.getParameters().get("charset");
91              }
92          }
93  
94          if (incomingCharset != null) {
95              detector.setDeclaredEncoding(incomingCharset);
96          }
97  
98          detector.setText(stream);
99          for (CharsetMatch match : detector.detectAll()) {
100             if (Charset.isSupported(match.getName())) {
101                 metadata.set(Metadata.CONTENT_ENCODING, match.getName());
102 
103                 // Is the encoding language-specific (KOI8-R, SJIS, etc.)?
104                 String language = match.getLanguage();
105                 if (language != null) {
106                     metadata.add(Metadata.CONTENT_LANGUAGE, language);
107                     metadata.add(Metadata.LANGUAGE, language);
108                 }
109 
110                 break;
111             }
112         }
113 
114         String encoding = metadata.get(Metadata.CONTENT_ENCODING);
115         if (encoding == null) {
116             throw new TikaException(
117                     "Text encoding could not be detected and no encoding"
118                     + " hint is available in document metadata");
119         }
120 
121         // TIKA-341: Only stomp on content-type after we're done trying to
122         // use it to guess at the charset.
123         metadata.set(Metadata.CONTENT_TYPE, "text/plain");
124 
125         try {
126             Reader reader =
127                 new BufferedReader(new InputStreamReader(stream, encoding));
128 
129             // TIKA-240: Drop the BOM when extracting plain text
130             reader.mark(1);
131             int bom = reader.read();
132             if (bom != '\ufeff') { // zero-width no-break space
133                 reader.reset();
134             }
135 
136             XHTMLContentHandler xhtml =
137                 new XHTMLContentHandler(handler, metadata);
138             xhtml.startDocument();
139 
140             xhtml.startElement("p");
141             char[] buffer = new char[4096];
142             int n = reader.read(buffer);
143             while (n != -1) {
144                 xhtml.characters(buffer, 0, n);
145                 n = reader.read(buffer);
146             }
147             xhtml.endElement("p");
148 
149             xhtml.endDocument();
150         } catch (UnsupportedEncodingException e) {
151             throw new TikaException(
152                     "Unsupported text encoding: " + encoding, e);
153         }
154     }
155 
156     /**
157      * @deprecated This method will be removed in Apache Tika 1.0.
158      */
159     public void parse(
160             InputStream stream, ContentHandler handler, Metadata metadata)
161     throws IOException, SAXException, TikaException {
162         parse(stream, handler, metadata, new ParseContext());
163     }
164 
165 }