1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.tika.parser.txt;
18
19 import java.io.BufferedInputStream;
20 import java.io.BufferedReader;
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.io.InputStreamReader;
24 import java.io.Reader;
25 import java.io.UnsupportedEncodingException;
26 import java.nio.charset.Charset;
27 import java.util.Collections;
28 import java.util.Set;
29
30 import org.apache.tika.exception.TikaException;
31 import org.apache.tika.metadata.Metadata;
32 import org.apache.tika.mime.MediaType;
33 import org.apache.tika.parser.ParseContext;
34 import org.apache.tika.parser.Parser;
35 import org.apache.tika.sax.XHTMLContentHandler;
36 import org.xml.sax.ContentHandler;
37 import org.xml.sax.SAXException;
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63 public class TXTParser implements Parser {
64
65 private static final Set<MediaType> SUPPORTED_TYPES =
66 Collections.singleton(MediaType.TEXT_PLAIN);
67
68 public Set<MediaType> getSupportedTypes(ParseContext context) {
69 return SUPPORTED_TYPES;
70 }
71
72 public void parse(
73 InputStream stream, ContentHandler handler,
74 Metadata metadata, ParseContext context)
75 throws IOException, SAXException, TikaException {
76
77
78 if (!stream.markSupported()) {
79 stream = new BufferedInputStream(stream);
80 }
81
82
83 CharsetDetector detector = new CharsetDetector();
84 String incomingCharset = metadata.get(Metadata.CONTENT_ENCODING);
85 String incomingType = metadata.get(Metadata.CONTENT_TYPE);
86 if (incomingCharset == null && incomingType != null) {
87
88 MediaType mt = MediaType.parse(incomingType);
89 if (mt != null) {
90 incomingCharset = mt.getParameters().get("charset");
91 }
92 }
93
94 if (incomingCharset != null) {
95 detector.setDeclaredEncoding(incomingCharset);
96 }
97
98 detector.setText(stream);
99 for (CharsetMatch match : detector.detectAll()) {
100 if (Charset.isSupported(match.getName())) {
101 metadata.set(Metadata.CONTENT_ENCODING, match.getName());
102
103
104 String language = match.getLanguage();
105 if (language != null) {
106 metadata.add(Metadata.CONTENT_LANGUAGE, language);
107 metadata.add(Metadata.LANGUAGE, language);
108 }
109
110 break;
111 }
112 }
113
114 String encoding = metadata.get(Metadata.CONTENT_ENCODING);
115 if (encoding == null) {
116 throw new TikaException(
117 "Text encoding could not be detected and no encoding"
118 + " hint is available in document metadata");
119 }
120
121
122
123 metadata.set(Metadata.CONTENT_TYPE, "text/plain");
124
125 try {
126 Reader reader =
127 new BufferedReader(new InputStreamReader(stream, encoding));
128
129
130 reader.mark(1);
131 int bom = reader.read();
132 if (bom != '\ufeff') {
133 reader.reset();
134 }
135
136 XHTMLContentHandler xhtml =
137 new XHTMLContentHandler(handler, metadata);
138 xhtml.startDocument();
139
140 xhtml.startElement("p");
141 char[] buffer = new char[4096];
142 int n = reader.read(buffer);
143 while (n != -1) {
144 xhtml.characters(buffer, 0, n);
145 n = reader.read(buffer);
146 }
147 xhtml.endElement("p");
148
149 xhtml.endDocument();
150 } catch (UnsupportedEncodingException e) {
151 throw new TikaException(
152 "Unsupported text encoding: " + encoding, e);
153 }
154 }
155
156
157
158
159 public void parse(
160 InputStream stream, ContentHandler handler, Metadata metadata)
161 throws IOException, SAXException, TikaException {
162 parse(stream, handler, metadata, new ParseContext());
163 }
164
165 }