1
2
3
4
5
6
7 package org.apache.tika.parser.txt;
8
9 import java.io.InputStream;
10 import java.io.Reader;
11 import java.io.IOException;
12 import java.nio.charset.Charset;
13 import java.util.ArrayList;
14 import java.util.Collections;
15 import java.util.Arrays;
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38 public class CharsetDetector {
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55 public CharsetDetector() {
56 }
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74 public CharsetDetector setDeclaredEncoding(String encoding) {
75 setCanonicalDeclaredEncoding(encoding);
76 return this;
77 }
78
79
80
81
82
83
84
85
86
87
88 public CharsetDetector setText(byte [] in) {
89 fRawInput = in;
90 fRawLength = in.length;
91
92 MungeInput();
93
94 return this;
95 }
96
97 private static final int kBufSize = 8000;
98
99 private static final int MAX_CONFIDENCE = 100;
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117 public CharsetDetector setText(InputStream in) throws IOException {
118 fInputStream = in;
119 fInputStream.mark(kBufSize);
120 fRawInput = new byte[kBufSize];
121
122
123 fRawLength = 0;
124 int remainingLength = kBufSize;
125 while (remainingLength > 0 ) {
126
127 int bytesRead = fInputStream.read(fRawInput, fRawLength, remainingLength);
128 if (bytesRead <= 0) {
129 break;
130 }
131 fRawLength += bytesRead;
132 remainingLength -= bytesRead;
133 }
134 fInputStream.reset();
135
136 MungeInput();
137 return this;
138 }
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160 public CharsetMatch detect() {
161
162
163
164
165 CharsetMatch matches[] = detectAll();
166
167 if (matches == null || matches.length == 0) {
168 return null;
169 }
170
171 return matches[0];
172 }
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189 public CharsetMatch[] detectAll() {
190 CharsetRecognizer csr;
191 int i;
192 int detectResults;
193 int confidence;
194 ArrayList<CharsetMatch> matches = new ArrayList<CharsetMatch>();
195
196
197
198 for (i=0; i<fCSRecognizers.size(); i++) {
199 csr = fCSRecognizers.get(i);
200 detectResults = csr.match(this);
201 confidence = detectResults & 0x000000ff;
202 if (confidence > 0) {
203
204 confidence = Math.min(confidence, MAX_CONFIDENCE);
205
206
207 if ((fDeclaredEncoding != null) && (fDeclaredEncoding.equalsIgnoreCase(csr.getName()))) {
208
209 confidence += (MAX_CONFIDENCE - confidence)/2;
210 }
211
212 CharsetMatch m = new CharsetMatch(this, csr, confidence);
213 matches.add(m);
214 }
215 }
216
217 Collections.sort(matches);
218 Collections.reverse(matches);
219 CharsetMatch [] resultArray = new CharsetMatch[matches.size()];
220 resultArray = (CharsetMatch[]) matches.toArray(resultArray);
221 return resultArray;
222 }
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247 public Reader getReader(InputStream in, String declaredEncoding) {
248 setCanonicalDeclaredEncoding(declaredEncoding);
249
250 try {
251 setText(in);
252
253 CharsetMatch match = detect();
254
255 if (match == null) {
256 return null;
257 }
258
259 return match.getReader();
260 } catch (IOException e) {
261 return null;
262 }
263 }
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281 public String getString(byte[] in, String declaredEncoding) {
282 setCanonicalDeclaredEncoding(declaredEncoding);
283
284 try {
285 setText(in);
286
287 CharsetMatch match = detect();
288
289 if (match == null) {
290 return null;
291 }
292
293 return match.getString(-1);
294 } catch (IOException e) {
295 return null;
296 }
297 }
298
299
300
301
302
303
304
305
306
307
308 public static String[] getAllDetectableCharsets() {
309 return fCharsetNames;
310 }
311
312
313
314
315
316
317
318
319
320
321 public boolean inputFilterEnabled()
322 {
323 return fStripTags;
324 }
325
326
327
328
329
330
331
332
333
334
335
336
337 public boolean enableInputFilter(boolean filter)
338 {
339 boolean previous = fStripTags;
340
341 fStripTags = filter;
342
343 return previous;
344 }
345
346
347
348
349
350
351 private void setCanonicalDeclaredEncoding(String encoding) {
352 Charset cs = Charset.forName(encoding);
353 if (cs != null) {
354 fDeclaredEncoding = cs.name();
355 }
356 }
357
358
359
360
361
362 private void MungeInput() {
363 int srci = 0;
364 int dsti = 0;
365 byte b;
366 boolean inMarkup = false;
367 int openTags = 0;
368 int badTags = 0;
369
370
371
372
373
374
375
376 if (fStripTags) {
377 for (srci = 0; srci < fRawLength && dsti < fInputBytes.length; srci++) {
378 b = fRawInput[srci];
379 if (b == (byte)'<') {
380 if (inMarkup) {
381 badTags++;
382 }
383 inMarkup = true;
384 openTags++;
385 }
386
387 if (! inMarkup) {
388 fInputBytes[dsti++] = b;
389 }
390
391 if (b == (byte)'>') {
392 inMarkup = false;
393 }
394 }
395
396 fInputLen = dsti;
397 }
398
399
400
401
402
403
404 if (openTags<5 || openTags/5 < badTags ||
405 (fInputLen < 100 && fRawLength>600)) {
406 int limit = fRawLength;
407
408 if (limit > kBufSize) {
409 limit = kBufSize;
410 }
411
412 for (srci=0; srci<limit; srci++) {
413 fInputBytes[srci] = fRawInput[srci];
414 }
415 fInputLen = srci;
416 }
417
418
419
420
421
422 Arrays.fill(fByteStats, (short)0);
423 for (srci=0; srci<fInputLen; srci++) {
424 int val = fInputBytes[srci] & 0x00ff;
425 fByteStats[val]++;
426 }
427
428 fC1Bytes = false;
429 for (int i = 0x80; i <= 0x9F; i += 1) {
430 if (fByteStats[i] != 0) {
431 fC1Bytes = true;
432 break;
433 }
434 }
435 }
436
437
438
439
440
441
442 byte[] fInputBytes =
443 new byte[kBufSize];
444
445 int fInputLen;
446
447 short fByteStats[] =
448 new short[256];
449
450
451 boolean fC1Bytes =
452 false;
453
454 String fDeclaredEncoding;
455
456
457
458
459
460
461 byte[] fRawInput;
462
463
464
465 int fRawLength;
466
467 InputStream fInputStream;
468
469
470 boolean fStripTags =
471 false;
472
473
474
475
476
477 private static ArrayList<CharsetRecognizer> fCSRecognizers = createRecognizers();
478 private static String [] fCharsetNames;
479
480
481
482
483 private static ArrayList<CharsetRecognizer> createRecognizers() {
484 ArrayList<CharsetRecognizer> recognizers = new ArrayList<CharsetRecognizer>();
485
486 recognizers.add(new CharsetRecog_UTF8());
487
488 recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_BE());
489 recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_LE());
490 recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_BE());
491 recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_LE());
492
493 recognizers.add(new CharsetRecog_mbcs.CharsetRecog_sjis());
494 recognizers.add(new CharsetRecog_2022.CharsetRecog_2022JP());
495 recognizers.add(new CharsetRecog_2022.CharsetRecog_2022CN());
496 recognizers.add(new CharsetRecog_2022.CharsetRecog_2022KR());
497 recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_gb_18030());
498 recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_jp());
499 recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_kr());
500 recognizers.add(new CharsetRecog_mbcs.CharsetRecog_big5());
501
502 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_da());
503 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_de());
504 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_en());
505 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_es());
506 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_fr());
507 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_it());
508 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_nl());
509 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_no());
510 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_pt());
511 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_sv());
512 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_cs());
513 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_hu());
514 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_pl());
515 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_ro());
516 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_5_ru());
517 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_6_ar());
518 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_7_el());
519 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_I_he());
520 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_he());
521 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1251());
522 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1256());
523 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_KOI8_R());
524 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_9_tr());
525
526 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_rtl());
527 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_ltr());
528 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_rtl());
529 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_ltr());
530
531
532
533 String[] charsetNames = new String [recognizers.size()];
534 int out = 0;
535
536 for (int i = 0; i < recognizers.size(); i++) {
537 String name = ((CharsetRecognizer)recognizers.get(i)).getName();
538
539 if (out == 0 || ! name.equals(charsetNames[out - 1])) {
540 charsetNames[out++] = name;
541 }
542 }
543
544 fCharsetNames = new String[out];
545 System.arraycopy(charsetNames, 0, fCharsetNames, 0, out);
546
547 return recognizers;
548 }
549 }