1
2
3
4
5
6
7
8 package org.apache.tika.parser.txt;
9
10
11
12
13
14
15
16 abstract class CharsetRecog_Unicode extends CharsetRecognizer {
17
18
19
20
/**
 * Returns the name of the charset this recognizer identifies.
 * The concrete recognizers nested in this class return "UTF-16BE",
 * "UTF-16LE", "UTF-32BE" and "UTF-32LE".
 *
 * @return the charset name as a String
 */
21 abstract String getName();
22
23
24
25
/**
 * Tests the detector's input against this recognizer's charset.
 * The implementations in this class read the detector's raw input bytes
 * and return an int score between 0 (no match) and 100.
 *
 * @param det the detector whose input is examined
 * @return the match score; 0 means the input was not recognized
 */
26 abstract int match(CharsetDetector det);
27
28 static class CharsetRecog_UTF_16_BE extends CharsetRecog_Unicode
29 {
30 String getName()
31 {
32 return "UTF-16BE";
33 }
34
35 int match(CharsetDetector det)
36 {
37 byte[] input = det.fRawInput;
38
39 if (input.length>=2 && ((input[0] & 0xFF) == 0xFE && (input[1] & 0xFF) == 0xFF)) {
40 return 100;
41 }
42
43
44 return 0;
45 }
46 }
47
48 static class CharsetRecog_UTF_16_LE extends CharsetRecog_Unicode
49 {
50 String getName()
51 {
52 return "UTF-16LE";
53 }
54
55 int match(CharsetDetector det)
56 {
57 byte[] input = det.fRawInput;
58
59 if (input.length >= 2 && ((input[0] & 0xFF) == 0xFF && (input[1] & 0xFF) == 0xFE))
60 {
61
62 if (input.length>=4 && input[2] == 0x00 && input[3] == 0x00) {
63
64 return 0;
65 }
66 return 100;
67 }
68
69
70 return 0;
71 }
72 }
73
74 static abstract class CharsetRecog_UTF_32 extends CharsetRecog_Unicode
75 {
76 abstract int getChar(byte[] input, int index);
77
78 abstract String getName();
79
80 int match(CharsetDetector det)
81 {
82 byte[] input = det.fRawInput;
83 int limit = (det.fRawLength / 4) * 4;
84 int numValid = 0;
85 int numInvalid = 0;
86 boolean hasBOM = false;
87 int confidence = 0;
88
89 if (limit==0) {
90 return 0;
91 }
92 if (getChar(input, 0) == 0x0000FEFF) {
93 hasBOM = true;
94 }
95
96 for(int i = 0; i < limit; i += 4) {
97 int ch = getChar(input, i);
98
99 if (ch < 0 || ch >= 0x10FFFF || (ch >= 0xD800 && ch <= 0xDFFF)) {
100 numInvalid += 1;
101 } else {
102 numValid += 1;
103 }
104 }
105
106
107
108
109 if (hasBOM && numInvalid==0) {
110 confidence = 100;
111 } else if (hasBOM && numValid > numInvalid*10) {
112 confidence = 80;
113 } else if (numValid > 3 && numInvalid == 0) {
114 confidence = 100;
115 } else if (numValid > 0 && numInvalid == 0) {
116 confidence = 80;
117 } else if (numValid > numInvalid*10) {
118
119 confidence = 25;
120 }
121
122 return confidence;
123 }
124 }
125
126 static class CharsetRecog_UTF_32_BE extends CharsetRecog_UTF_32
127 {
128 int getChar(byte[] input, int index)
129 {
130 return (input[index + 0] & 0xFF) << 24 | (input[index + 1] & 0xFF) << 16 |
131 (input[index + 2] & 0xFF) << 8 | (input[index + 3] & 0xFF);
132 }
133
134 String getName()
135 {
136 return "UTF-32BE";
137 }
138 }
139
140
141 static class CharsetRecog_UTF_32_LE extends CharsetRecog_UTF_32
142 {
143 int getChar(byte[] input, int index)
144 {
145 return (input[index + 3] & 0xFF) << 24 | (input[index + 2] & 0xFF) << 16 |
146 (input[index + 1] & 0xFF) << 8 | (input[index + 0] & 0xFF);
147 }
148
149 String getName()
150 {
151 return "UTF-32LE";
152 }
153 }
154 }