1
2
3
4
5
6
7
8 package org.apache.tika.parser.txt;
9
10 import java.util.Arrays;
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26 abstract class CharsetRecog_mbcs extends CharsetRecognizer {
27
28
29
30
31
32 abstract String getName() ;
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47 int match(CharsetDetector det, int [] commonChars) {
48 int singleByteCharCount = 0;
49 int doubleByteCharCount = 0;
50 int commonCharCount = 0;
51 int badCharCount = 0;
52 int totalCharCount = 0;
53 int confidence = 0;
54 iteratedChar iter = new iteratedChar();
55
56 detectBlock: {
57 for (iter.reset(); nextChar(iter, det);) {
58 totalCharCount++;
59 if (iter.error) {
60 badCharCount++;
61 } else {
62 long cv = iter.charValue & 0xFFFFFFFFL;
63
64 if (cv <= 0xff) {
65 singleByteCharCount++;
66 } else {
67 doubleByteCharCount++;
68 if (commonChars != null) {
69
70 if (Arrays.binarySearch(commonChars, (int) cv) >= 0) {
71 commonCharCount++;
72 }
73 }
74 }
75 }
76 if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
77
78 break detectBlock;
79 }
80 }
81
82 if (doubleByteCharCount <= 10 && badCharCount== 0) {
83
84 if (doubleByteCharCount == 0 && totalCharCount < 10) {
85
86
87
88 confidence = 0;
89 }
90 else {
91
92
93 confidence = 10;
94 }
95
96 break detectBlock;
97 }
98
99
100
101
102
103 if (doubleByteCharCount < 20*badCharCount) {
104 confidence = 0;
105 break detectBlock;
106 }
107
108 if (commonChars == null) {
109
110
111
112 confidence = 30 + doubleByteCharCount - 20*badCharCount;
113 if (confidence > 100) {
114 confidence = 100;
115 }
116 }else {
117
118
119
120 double maxVal = Math.log((float)doubleByteCharCount / 4);
121 double scaleFactor = 90.0 / maxVal;
122 confidence = (int)(Math.log(commonCharCount+1) * scaleFactor + 10);
123 confidence = Math.min(confidence, 100);
124 }
125 }
126
127 return confidence;
128 }
129
130
131
132
133
134
135
136
137
138
139
140
141 static class iteratedChar {
142 int charValue = 0;
143 int index = 0;
144 int nextIndex = 0;
145 boolean error = false;
146 boolean done = false;
147
148 void reset() {
149 charValue = 0;
150 index = -1;
151 nextIndex = 0;
152 error = false;
153 done = false;
154 }
155
156 int nextByte(CharsetDetector det) {
157 if (nextIndex >= det.fRawLength) {
158 done = true;
159 return -1;
160 }
161 int byteValue = (int)det.fRawInput[nextIndex++] & 0x00ff;
162 return byteValue;
163 }
164 }
165
166
167
168
169
170
171
172
173
174
175
176
177
178 abstract boolean nextChar(iteratedChar it, CharsetDetector det);
179
180
181
182
183
184
185
186
187
188 static class CharsetRecog_sjis extends CharsetRecog_mbcs {
189 static int [] commonChars =
190
191
192
193 {0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
194 0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
195 0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
196 0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
197 0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
198 0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
199
200 boolean nextChar(iteratedChar it, CharsetDetector det) {
201 it.index = it.nextIndex;
202 it.error = false;
203 int firstByte;
204 firstByte = it.charValue = it.nextByte(det);
205 if (firstByte < 0) {
206 return false;
207 }
208
209 if (firstByte <= 0x7f || (firstByte>0xa0 && firstByte<=0xdf)) {
210 return true;
211 }
212
213 int secondByte = it.nextByte(det);
214 if (secondByte < 0) {
215 return false;
216 }
217 it.charValue = (firstByte << 8) | secondByte;
218 if (! ((secondByte>=0x40 && secondByte<=0x7f) || (secondByte>=0x80 && secondByte<=0xff))) {
219
220 it.error = true;
221 }
222 return true;
223 }
224
225 int match(CharsetDetector det) {
226 return match(det, commonChars);
227 }
228
229 String getName() {
230 return "Shift_JIS";
231 }
232
233 public String getLanguage()
234 {
235 return "ja";
236 }
237
238
239 }
240
241
242
243
244
245
246 static class CharsetRecog_big5 extends CharsetRecog_mbcs {
247 static int [] commonChars =
248
249
250
251 {0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
252 0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
253 0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
254 0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
255 0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
256 0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
257 0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
258 0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
259 0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
260 0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
261
262 boolean nextChar(iteratedChar it, CharsetDetector det) {
263 it.index = it.nextIndex;
264 it.error = false;
265 int firstByte;
266 firstByte = it.charValue = it.nextByte(det);
267 if (firstByte < 0) {
268 return false;
269 }
270
271 if (firstByte <= 0x7f || firstByte==0xff) {
272
273 return true;
274 }
275
276 int secondByte = it.nextByte(det);
277 if (secondByte < 0) {
278 return false;
279 }
280 it.charValue = (it.charValue << 8) | secondByte;
281
282 if (secondByte < 0x40 ||
283 secondByte ==0x7f ||
284 secondByte == 0xff) {
285 it.error = true;
286 }
287 return true;
288 }
289
290 int match(CharsetDetector det) {
291 return match(det, commonChars);
292 }
293
294 String getName() {
295 return "Big5";
296 }
297
298
299 public String getLanguage()
300 {
301 return "zh";
302 }
303 }
304
305
306
307
308
309
310
311
312 abstract static class CharsetRecog_euc extends CharsetRecog_mbcs {
313
314
315
316
317
318
319
320 boolean nextChar(iteratedChar it, CharsetDetector det) {
321 it.index = it.nextIndex;
322 it.error = false;
323 int firstByte = 0;
324 int secondByte = 0;
325 int thirdByte = 0;
326
327
328 buildChar: {
329 firstByte = it.charValue = it.nextByte(det);
330 if (firstByte < 0) {
331
332 it.done = true;
333 break buildChar;
334 }
335 if (firstByte <= 0x8d) {
336
337 break buildChar;
338 }
339
340 secondByte = it.nextByte(det);
341 it.charValue = (it.charValue << 8) | secondByte;
342
343 if (firstByte >= 0xA1 && firstByte <= 0xfe) {
344
345 if (secondByte < 0xa1) {
346 it.error = true;
347 }
348 break buildChar;
349 }
350 if (firstByte == 0x8e) {
351
352
353
354
355
356
357 if (secondByte < 0xa1) {
358 it.error = true;
359 }
360 break buildChar;
361 }
362
363 if (firstByte == 0x8f) {
364
365
366 thirdByte = it.nextByte(det);
367 it.charValue = (it.charValue << 8) | thirdByte;
368 if (thirdByte < 0xa1) {
369 it.error = true;
370 }
371 }
372 }
373
374 return (it.done == false);
375 }
376
377
378
379
380
381 static class CharsetRecog_euc_jp extends CharsetRecog_euc {
382 static int [] commonChars =
383
384
385
386 {0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
387 0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
388 0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
389 0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
390 0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
391 0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
392 0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
393 0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
394 0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
395 0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
396 String getName() {
397 return "EUC-JP";
398 }
399
400 int match(CharsetDetector det) {
401 return match(det, commonChars);
402 }
403
404 public String getLanguage()
405 {
406 return "ja";
407 }
408 }
409
410
411
412
413
414 static class CharsetRecog_euc_kr extends CharsetRecog_euc {
415 static int [] commonChars =
416
417
418
419 {0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
420 0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
421 0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
422 0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
423 0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
424 0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
425 0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
426 0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
427 0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
428 0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
429
430 String getName() {
431 return "EUC-KR";
432 }
433
434 int match(CharsetDetector det) {
435 return match(det, commonChars);
436 }
437
438 public String getLanguage()
439 {
440 return "ko";
441 }
442 }
443 }
444
445
446
447
448
449
450 static class CharsetRecog_gb_18030 extends CharsetRecog_mbcs {
451
452
453
454
455
456
457
458 boolean nextChar(iteratedChar it, CharsetDetector det) {
459 it.index = it.nextIndex;
460 it.error = false;
461 int firstByte = 0;
462 int secondByte = 0;
463 int thirdByte = 0;
464 int fourthByte = 0;
465
466 buildChar: {
467 firstByte = it.charValue = it.nextByte(det);
468
469 if (firstByte < 0) {
470
471 it.done = true;
472 break buildChar;
473 }
474
475 if (firstByte <= 0x80) {
476
477 break buildChar;
478 }
479
480 secondByte = it.nextByte(det);
481 it.charValue = (it.charValue << 8) | secondByte;
482
483 if (firstByte >= 0x81 && firstByte <= 0xFE) {
484
485 if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <=0xFE)) {
486 break buildChar;
487 }
488
489
490 if (secondByte >= 0x30 && secondByte <= 0x39) {
491 thirdByte = it.nextByte(det);
492
493 if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
494 fourthByte = it.nextByte(det);
495
496 if (fourthByte >= 0x30 && fourthByte <= 0x39) {
497 it.charValue = (it.charValue << 16) | (thirdByte << 8) | fourthByte;
498 break buildChar;
499 }
500 }
501 }
502
503 it.error = true;
504 break buildChar;
505 }
506 }
507
508 return (it.done == false);
509 }
510
511 static int [] commonChars =
512
513
514
515 {0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
516 0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
517 0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
518 0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
519 0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
520 0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
521 0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
522 0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
523 0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
524 0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
525
526
527 String getName() {
528 return "GB18030";
529 }
530
531 int match(CharsetDetector det) {
532 return match(det, commonChars);
533 }
534
535 public String getLanguage()
536 {
537 return "zh";
538 }
539 }
540
541
542 }