1
2
3
4
5
6
7 package org.apache.tika.parser.txt;
8
9
10
11
12
13
14
15
16
17
18
19 abstract class CharsetRecog_2022 extends CharsetRecognizer {
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34 int match(byte [] text, int textLen, byte [][] escapeSequences) {
35 int i, j;
36 int escN;
37 int hits = 0;
38 int misses = 0;
39 int shifts = 0;
40 int quality;
41 scanInput:
42 for (i=0; i<textLen; i++) {
43 if (text[i] == 0x1b) {
44 checkEscapes:
45 for (escN=0; escN<escapeSequences.length; escN++) {
46 byte [] seq = escapeSequences[escN];
47
48 if ((textLen - i) < seq.length) {
49 continue checkEscapes;
50 }
51
52 for (j=1; j<seq.length; j++) {
53 if (seq[j] != text[i+j]) {
54 continue checkEscapes;
55 }
56 }
57
58 hits++;
59 i += seq.length-1;
60 continue scanInput;
61 }
62
63 misses++;
64 }
65
66 if (text[i] == 0x0e || text[i] == 0x0f) {
67
68 shifts++;
69 }
70 }
71
72 if (hits == 0) {
73 return 0;
74 }
75
76
77
78
79
80
81
82 quality = (100*hits - 100*misses) / (hits + misses);
83
84
85
86
87 if (hits+shifts < 5) {
88 quality -= (5-(hits+shifts))*10;
89 }
90
91 if (quality < 0) {
92 quality = 0;
93 }
94 return quality;
95 }
96
97
98
99
100 static class CharsetRecog_2022JP extends CharsetRecog_2022 {
101 private byte [] [] escapeSequences = {
102 {0x1b, 0x24, 0x28, 0x43},
103 {0x1b, 0x24, 0x28, 0x44},
104 {0x1b, 0x24, 0x40},
105 {0x1b, 0x24, 0x41},
106 {0x1b, 0x24, 0x42},
107 {0x1b, 0x26, 0x40},
108 {0x1b, 0x28, 0x42},
109 {0x1b, 0x28, 0x48},
110 {0x1b, 0x28, 0x49},
111 {0x1b, 0x28, 0x4a},
112 {0x1b, 0x2e, 0x41},
113 {0x1b, 0x2e, 0x46}
114 };
115
116 String getName() {
117 return "ISO-2022-JP";
118 }
119
120 int match(CharsetDetector det) {
121 return match(det.fInputBytes, det.fInputLen, escapeSequences);
122 }
123 }
124
125 static class CharsetRecog_2022KR extends CharsetRecog_2022 {
126 private byte [] [] escapeSequences = {
127 {0x1b, 0x24, 0x29, 0x43}
128 };
129
130 String getName() {
131 return "ISO-2022-KR";
132 }
133
134 int match(CharsetDetector det) {
135 return match(det.fInputBytes, det.fInputLen, escapeSequences);
136 }
137
138 }
139
140 static class CharsetRecog_2022CN extends CharsetRecog_2022 {
141 private byte [] [] escapeSequences = {
142 {0x1b, 0x24, 0x29, 0x41},
143 {0x1b, 0x24, 0x29, 0x47},
144 {0x1b, 0x24, 0x2A, 0x48},
145 {0x1b, 0x24, 0x29, 0x45},
146 {0x1b, 0x24, 0x2B, 0x49},
147 {0x1b, 0x24, 0x2B, 0x4A},
148 {0x1b, 0x24, 0x2B, 0x4B},
149 {0x1b, 0x24, 0x2B, 0x4C},
150 {0x1b, 0x24, 0x2B, 0x4D},
151 {0x1b, 0x4e},
152 {0x1b, 0x4f},
153 };
154
155 String getName() {
156 return "ISO-2022-CN";
157 }
158
159
160 int match(CharsetDetector det) {
161 return match(det.fInputBytes, det.fInputLen, escapeSequences);
162 }
163 }
164
165 }
166