1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package com.sun.syndication.io;
18
19 import java.io.*;
20 import java.net.URL;
21 import java.net.URLConnection;
22 import java.net.HttpURLConnection;
23 import java.util.regex.Pattern;
24 import java.util.regex.Matcher;
25 import java.text.MessageFormat;
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51 public class XmlReader extends Reader {
52 private static final int PUSHBACK_MAX_SIZE = 4096;
53
54 private static final String UTF_8 = "UTF-8";
55 private static final String US_ASCII = "US-ASCII";
56 private static final String UTF_16BE = "UTF-16BE";
57 private static final String UTF_16LE = "UTF-16LE";
58 private static final String UTF_16 = "UTF-16";
59
60 private Reader _reader;
61 private String _encoding;
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76 public XmlReader(File file) throws IOException {
77 this(new FileInputStream(file));
78 }
79
80
81
82
83
84
85
86
87
88
89
90
91
92 public XmlReader(InputStream is) throws IOException {
93 this(is,true);
94 }
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120 public XmlReader(InputStream is,boolean lenient) throws IOException, XmlReaderException {
121 try {
122 doRawStream(is,lenient);
123 }
124 catch (XmlReaderException ex) {
125 if (!lenient) {
126 throw ex;
127 }
128 else {
129 doLenientDetection(null,ex);
130 }
131 }
132 }
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150 public XmlReader(URL url) throws IOException {
151 this(url.openConnection());
152 }
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170 public XmlReader(URLConnection conn) throws IOException {
171 boolean lenient = true;
172 if (conn instanceof HttpURLConnection) {
173 try {
174 doHttpStream(conn.getInputStream(),conn.getContentType(),lenient);
175 }
176 catch (XmlReaderException ex) {
177 doLenientDetection(conn.getContentType(),ex);
178 }
179 }
180 else
181 if (conn.getContentType()!=null) {
182 try {
183 doHttpStream(conn.getInputStream(),conn.getContentType(),lenient);
184 }
185 catch (XmlReaderException ex) {
186 doLenientDetection(conn.getContentType(),ex);
187 }
188 }
189 else {
190 try {
191 doRawStream(conn.getInputStream(),lenient);
192 }
193 catch (XmlReaderException ex) {
194 doLenientDetection(null,ex);
195 }
196 }
197 }
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214 public XmlReader(InputStream is,String httpContentType) throws IOException {
215 this(is,httpContentType,true);
216 }
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246 public XmlReader(InputStream is,String httpContentType,boolean lenient) throws IOException, XmlReaderException {
247 try {
248 doHttpStream(is,httpContentType,lenient);
249 }
250 catch (XmlReaderException ex) {
251 if (!lenient) {
252 throw ex;
253 }
254 else {
255 doLenientDetection(httpContentType,ex);
256 }
257 }
258 }
259
260 private void doLenientDetection(String httpContentType,XmlReaderException ex) throws IOException {
261 if (httpContentType!=null) {
262 if (httpContentType.startsWith("text/html")) {
263 httpContentType = httpContentType.substring("text/html".length());
264 httpContentType = "text/xml" + httpContentType;
265 try {
266 doHttpStream(ex.getInputStream(),httpContentType,true);
267 ex = null;
268 }
269 catch (XmlReaderException ex2) {
270 ex = ex2;
271 }
272 }
273 }
274 if (ex!=null) {
275 String encoding = ex.getXmlEncoding();
276 if (encoding==null) {
277 encoding = ex.getContentTypeEncoding();
278 }
279 if (encoding==null) {
280 encoding = UTF_8;
281 }
282 prepareReader(ex.getInputStream(),encoding);
283 }
284 }
285
286
287
288
289
290
291
292 public String getEncoding() {
293 return _encoding;
294 }
295
296 public int read(char[] buf,int offset,int len) throws IOException {
297 return _reader.read(buf,offset,len);
298 }
299
300
301
302
303
304
305
306 public void close() throws IOException {
307 _reader.close();
308 }
309
310 private void doRawStream(InputStream is,boolean lenient) throws IOException {
311 PushbackInputStream pis = new PushbackInputStream(is,PUSHBACK_MAX_SIZE);
312 String bomEnc = getBOMEncoding(pis);
313 String xmlGuessEnc = getXMLGuessEncoding(pis);
314 String xmlEnc = getXmlProlog(pis,xmlGuessEnc);
315 String encoding = calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc, pis);
316 prepareReader(pis,encoding);
317 }
318
319 private void doHttpStream(InputStream is,String httpContentType,boolean lenient) throws IOException {
320 PushbackInputStream pis = new PushbackInputStream(is,PUSHBACK_MAX_SIZE);
321 String cTMime = getContentTypeMime(httpContentType);
322 String cTEnc = getContentTypeEncoding(httpContentType);
323 String bomEnc = getBOMEncoding(pis);
324 String xmlGuessEnc = getXMLGuessEncoding(pis);
325 String xmlEnc = getXmlProlog(pis,xmlGuessEnc);
326 String encoding = calculateHttpEncoding(cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc, pis,lenient);
327 prepareReader(pis,encoding);
328 }
329
330 private void prepareReader(InputStream is,String encoding) throws IOException {
331 _reader = new InputStreamReader(is,encoding);
332 _encoding = encoding;
333 }
334
335
336 private static String calculateRawEncoding(String bomEnc, String xmlGuessEnc, String xmlEnc, InputStream is) throws IOException {
337 String encoding;
338 if (bomEnc==null) {
339 if (xmlGuessEnc==null || xmlEnc==null) {
340 encoding = UTF_8;
341 }
342 else
343 if (xmlEnc.equals(UTF_16) && (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) {
344 encoding = xmlGuessEnc;
345 }
346 else {
347 encoding = xmlEnc;
348 }
349 }
350 else
351 if (bomEnc.equals(UTF_8)) {
352 if (xmlGuessEnc!=null && !xmlGuessEnc.equals(UTF_8)) {
353 throw new XmlReaderException(RAW_EX_1.format(new Object[]{bomEnc,xmlGuessEnc,xmlEnc}),
354 bomEnc,xmlGuessEnc,xmlEnc,is);
355 }
356 if (xmlEnc!=null && !xmlEnc.equals(UTF_8)) {
357 throw new XmlReaderException(RAW_EX_1.format(new Object[]{bomEnc,xmlGuessEnc,xmlEnc}),
358 bomEnc,xmlGuessEnc,xmlEnc,is);
359 }
360 encoding = UTF_8;
361 }
362 else
363 if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) {
364 if (xmlGuessEnc!=null && !xmlGuessEnc.equals(bomEnc)) {
365 throw new IOException(RAW_EX_1.format(new Object[]{bomEnc,xmlGuessEnc,xmlEnc}));
366 }
367 if (xmlEnc!=null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) {
368 throw new XmlReaderException(RAW_EX_1.format(new Object[]{bomEnc,xmlGuessEnc,xmlEnc}),
369 bomEnc,xmlGuessEnc,xmlEnc,is);
370 }
371 encoding =bomEnc;
372 }
373 else {
374 throw new XmlReaderException(RAW_EX_2.format(new Object[]{bomEnc,xmlGuessEnc,xmlEnc}),
375 bomEnc,xmlGuessEnc,xmlEnc,is);
376 }
377 return encoding;
378 }
379
380
381 private static String calculateHttpEncoding(String cTMime, String cTEnc, String bomEnc, String xmlGuessEnc, String xmlEnc, InputStream is,boolean lenient) throws IOException {
382 String encoding;
383 if (lenient & xmlEnc!=null) {
384 encoding = xmlEnc;
385 }
386 else {
387 boolean appXml = isAppXml(cTMime);
388 boolean textXml = isTextXml(cTMime);
389 if (appXml || textXml) {
390 if (cTEnc==null) {
391 if (appXml) {
392 encoding = calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc, is);
393 }
394 else {
395 encoding = US_ASCII;
396 }
397 }
398 else
399 if (bomEnc!=null && (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE))) {
400 throw new XmlReaderException(HTTP_EX_1.format(new Object[]{cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc}),
401 cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc,is);
402 }
403 else
404 if (cTEnc.equals(UTF_16)) {
405 if (bomEnc!=null && bomEnc.startsWith(UTF_16)) {
406 encoding = bomEnc;
407 }
408 else {
409 throw new XmlReaderException(HTTP_EX_2.format(new Object[]{cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc}),
410 cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc,is);
411 }
412 }
413 else {
414 encoding = cTEnc;
415 }
416 }
417 else {
418 throw new XmlReaderException(HTTP_EX_3.format(new Object[]{cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc}),
419 cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc,is);
420 }
421 }
422 return encoding;
423 }
424
425
426 private static String getContentTypeMime(String httpContentType) {
427 String mime = null;
428 if (httpContentType!=null) {
429 int i = httpContentType.indexOf(";");
430 mime = ((i==-1) ? httpContentType : httpContentType.substring(0,i)).trim();
431 }
432 return mime;
433 }
434
435 private static final Pattern CHARSET_PATTERN = Pattern.compile("charset=([.[^; ]]*)");
436
437
438 private static String getContentTypeEncoding(String httpContentType) {
439 String encoding = null;
440 if (httpContentType!=null) {
441 int i = httpContentType.indexOf(";");
442 if (i>-1) {
443 String postMime = httpContentType.substring(i+1);
444 Matcher m = CHARSET_PATTERN.matcher(postMime);
445 encoding = (m.find()) ? m.group(1) : null;
446 encoding = (encoding!=null) ? encoding.toUpperCase() : null;
447 }
448 }
449 return encoding;
450 }
451
452
453
454 private static String getBOMEncoding(PushbackInputStream is) throws IOException {
455 String encoding = null;
456 int[] bytes = new int[3];
457 bytes[0] = is.read();
458 bytes[1] = is.read();
459 bytes[2] = is.read();
460
461 if (bytes[0] == 0xFE && bytes[1] == 0xFF) {
462 encoding = UTF_16BE;
463 is.unread(bytes[2]);
464 }
465 else
466 if (bytes[0] == 0xFF && bytes[1] == 0xFE) {
467 encoding = UTF_16LE;
468 is.unread(bytes[2]);
469 }
470 else
471 if (bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) {
472 encoding = UTF_8;
473 }
474 else {
475 for (int i=bytes.length-1;i>=0;i--) {
476 is.unread(bytes[i]);
477 }
478 }
479 return encoding;
480 }
481
482
483 private static String getXMLGuessEncoding(PushbackInputStream is) throws IOException {
484 String encoding = null;
485 int[] bytes = new int[4];
486 bytes[0] = is.read();
487 bytes[1] = is.read();
488 bytes[2] = is.read();
489 bytes[3] = is.read();
490 for (int i=bytes.length-1;i>=0;i--) {
491 is.unread(bytes[i]);
492 }
493
494 if (bytes[0] == 0x00 && bytes[1] == 0x3C && bytes[2] == 0x00 && bytes[3] == 0x3F) {
495 encoding = UTF_16BE;
496 }
497 else
498 if (bytes[0] == 0x3C && bytes[1] == 0x00 && bytes[2] == 0x3F && bytes[3] == 0x00) {
499 encoding = UTF_16LE;
500 }
501 else
502 if (bytes[0] == 0x3C && bytes[1] == 0x3F && bytes[2] == 0x78 && bytes[3] == 0x6D) {
503 encoding = UTF_8;
504 }
505 return encoding;
506 }
507
508
509 private static final Pattern ENCODING_PATTERN =
510 Pattern.compile("<\\?xml.*encoding[\\s]*=[\\s]*((?:\".[^\"]*\")|(?:'.[^']*')).*\\?>", Pattern.MULTILINE);
511
512
513 private static String getXmlProlog(PushbackInputStream is,String guessedEnc) throws IOException {
514 String encoding = null;
515 if (guessedEnc!=null) {
516 byte[] bytes = new byte[PUSHBACK_MAX_SIZE];
517 int offset = 0;
518 int max = PUSHBACK_MAX_SIZE;
519 int c = is.read(bytes,offset,max);
520 while (c!=-1 && offset<PUSHBACK_MAX_SIZE) {
521 offset += c;
522 max -= c;
523 c = is.read(bytes,offset,max);
524 }
525 int bytesRead = offset;
526 if (bytesRead>0) {
527 is.unread(bytes,0,bytesRead);
528 Reader reader = new InputStreamReader(new ByteArrayInputStream(bytes,0,bytesRead), guessedEnc);
529 BufferedReader br = new BufferedReader(reader);
530 StringBuffer prolog = new StringBuffer(PUSHBACK_MAX_SIZE);
531 String line = br.readLine();
532 while (line != null) {
533 prolog.append(line).append("\n");
534 line = br.readLine();
535 }
536 Matcher m = ENCODING_PATTERN.matcher(prolog);
537 if (m.find()) {
538 encoding = m.group(1).toUpperCase();
539 encoding = encoding.substring(1,encoding.length()-1);
540 }
541 }
542 }
543 return encoding;
544 }
545
546
547 private static boolean isAppXml(String mime) {
548 return mime!=null &&
549 (mime.equals("application/xml") ||
550 mime.equals("application/xml-dtd") ||
551 mime.equals("application/xml-external-parsed-entity") ||
552 (mime.startsWith("application/") && mime.endsWith("+xml")));
553 }
554
555
556 private static boolean isTextXml(String mime) {
557 return mime!=null &&
558 (mime.equals("text/xml") ||
559 mime.equals("text/xml-external-parsed-entity") ||
560 (mime.startsWith("text/") && mime.endsWith("+xml")));
561 }
562
563 private static final MessageFormat RAW_EX_1 = new MessageFormat(
564 "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch");
565
566 private static final MessageFormat RAW_EX_2 = new MessageFormat(
567 "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM");
568
569 private static final MessageFormat HTTP_EX_1 = new MessageFormat(
570 "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be NULL");
571
572 private static final MessageFormat HTTP_EX_2 = new MessageFormat(
573 "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch");
574
575 private static final MessageFormat HTTP_EX_3 = new MessageFormat(
576 "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Invalid MIME");
577
578 }