View Javadoc

1   /*
2    * Copyright 2004 Sun Microsystems, Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   *
16   */
17  package com.sun.syndication.io;
18  
19  import java.io.*;
20  import java.net.URL;
21  import java.net.URLConnection;
22  import java.net.HttpURLConnection;
23  import java.util.regex.Pattern;
24  import java.util.regex.Matcher;
25  import java.text.MessageFormat;
26  
27  /**
28   * Character stream that handles (or at least attempts to) all the necessary voodoo to figure out
29   * the charset encoding of the XML document within the stream.
30   * <p>
31   * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader. This one IS a
32   * character stream.
33   * <p>
34   * All this has to be done without consuming characters from the stream; otherwise the XML parser
35   * will not recognize the document as valid XML. This is not 100% true, but it's close enough
36   * (UTF-8 BOM is not handled by all parsers right now, XmlReader handles it and things work in all
37   * parsers).
38   * <p>
39   * The XmlReader class handles the charset encoding of XML documents in Files, raw streams and
40   * HTTP streams by offering a wide set of constructors.
41   * <P>
42   * By default the charset encoding detection is lenient; the constructor with the lenient flag
43   * can be used for a strict detection (following the HTTP MIME and XML specifications).
44   * All this is nicely explained by Mark Pilgrim in his blog,
45   * <a href="http://diveintomark.org/archives/2004/02/13/xml-media-types">
46   * Determining the character encoding of a feed</a>.
47   * <p>
48   * @author Alejandro Abdelnur
49   *
50   */
51  public class XmlReader extends Reader {
52      private static final int PUSHBACK_MAX_SIZE = 4096;
53  
54      private static final String UTF_8 = "UTF-8";
55      private static final String US_ASCII = "US-ASCII";
56      private static final String UTF_16BE = "UTF-16BE";
57      private static final String UTF_16LE = "UTF-16LE";
58      private static final String UTF_16 = "UTF-16";
59  
60      private Reader _reader;
61      private String _encoding;
62  
63      /**
64       * Creates a Reader for a File.
65       * <p>
66       * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also
67       * missing defaults to UTF-8.
68       * <p>
69       * It does a lenient charset encoding detection, check the constructor with the lenient parameter
70       * for details.
71       * <p>
72       * @param file File to create a Reader from.
73       * @throws IOException thrown if there is a problem reading the file.
74       *
75       */
76      public XmlReader(File file) throws IOException {
77          this(new FileInputStream(file));
78      }
79  
80      /**
81       * Creates a Reader for a raw InputStream.
82       * <p>
83       * It follows the same logic used for files.
84       * <p>
85       * It does a lenient charset encoding detection, check the constructor with the lenient parameter
86       * for details.
87       * <p>
88       * @param is InputStream to create a Reader from.
89       * @throws IOException thrown if there is a problem reading the stream.
90       *
91       */
92      public XmlReader(InputStream is) throws IOException {
93          this(is,true);
94      }
95  
96      /**
97       * Creates a Reader for a raw InputStream.
98       * <p>
99       * It follows the same logic used for files.
100      * <p>
101      * If lenient detection is indicated and the detection above fails as per specifications it then attempts
102      * the following:
103      * <p>
104      * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
105      * <p>
106      * Else if the XML prolog had a charset encoding that encoding is used.
107      * <p>
108      * Else if the content type had a charset encoding that encoding is used.
109      * <p>
110      * Else 'UTF-8' is used.
111      * <p>
112      * If lenient detection is indicated an XmlReaderException is never thrown.
113      * <p>
114      * @param is InputStream to create a Reader from.
115      * @param lenient indicates if the charset encoding detection should be relaxed.
116      * @throws IOException thrown if there is a problem reading the stream.
117      * @throws XmlReaderException thrown if the charset encoding could not be determined according to the specs.
118      *
119      */
120     public XmlReader(InputStream is,boolean lenient) throws IOException, XmlReaderException {
121         try {
122             doRawStream(is,lenient);
123         }
124         catch (XmlReaderException ex) {
125             if (!lenient) {
126                 throw ex;
127             }
128             else {
129                 doLenientDetection(null,ex);
130             }
131         }
132     }
133 
134     /**
135      * Creates a Reader using the InputStream of a URL.
136      * <p>
137      * If the URL is not of type HTTP and there is not 'content-type' header in the fetched
138      * data it uses the same logic used for Files.
139      * <p>
140      * If the URL is a HTTP Url or there is a 'content-type' header in the fetched
141      * data it uses the same logic used for an InputStream with content-type.
142      * <p>
143      * It does a lenient charset encoding detection, check the constructor with the lenient parameter
144      * for details.
145      * <p>
146      * @param url URL to create a Reader from.
147      * @throws IOException thrown if there is a problem reading the stream of the URL.
148      *
149      */
150     public XmlReader(URL url) throws IOException {
151         this(url.openConnection());
152     }
153 
154     /**
155      * Creates a Reader using the InputStream of a URLConnection.
156      * <p>
157      * If the URLConnection is not of type HttpURLConnection and there is not
158      * 'content-type' header in the fetched data it uses the same logic used for files.
159      * <p>
160      * If the URLConnection is a HTTP Url or there is a 'content-type' header in the fetched
161      * data it uses the same logic used for an InputStream with content-type.
162      * <p>
163      * It does a lenient charset encoding detection, check the constructor with the lenient parameter
164      * for details.
165      * <p>
166      * @param conn URLConnection to create a Reader from.
167      * @throws IOException thrown if there is a problem reading the stream of the URLConnection.
168      *
169      */
170     public XmlReader(URLConnection conn) throws IOException {
171         boolean lenient = true;
172         if (conn instanceof HttpURLConnection) {
173             try {
174                 doHttpStream(conn.getInputStream(),conn.getContentType(),lenient);
175             }
176             catch (XmlReaderException ex) {
177                 doLenientDetection(conn.getContentType(),ex);
178             }
179         }
180         else
181         if (conn.getContentType()!=null) {
182             try {
183                 doHttpStream(conn.getInputStream(),conn.getContentType(),lenient);
184             }
185             catch (XmlReaderException ex) {
186                 doLenientDetection(conn.getContentType(),ex);
187             }
188         }
189         else {
190             try {
191                 doRawStream(conn.getInputStream(),lenient);
192             }
193             catch (XmlReaderException ex) {
194                 doLenientDetection(null,ex);
195             }
196         }
197     }
198 
199     /**
200      * Creates a Reader using an InputStream an the associated content-type header.
201      * <p>
202      * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding.
203      * If there is not content-type encoding checks the XML prolog encoding. If there is not XML
204      * prolog encoding uses the default encoding mandated by the content-type MIME type.
205      * <p>
206      * It does a lenient charset encoding detection, check the constructor with the lenient parameter
207      * for details.
208      * <p>
209      * @param is InputStream to create the reader from.
210      * @param httpContentType content-type header to use for the resolution of the charset encoding.
211      * @throws IOException thrown if there is a problem reading the file.
212      *
213      */
214     public XmlReader(InputStream is,String httpContentType) throws IOException {
215         this(is,httpContentType,true);
216     }
217 
218     /**
219      * Creates a Reader using an InputStream an the associated content-type header. This constructor is
220      * lenient regarding the encoding detection.
221      * <p>
222      * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding.
223      * If there is not content-type encoding checks the XML prolog encoding. If there is not XML
224      * prolog encoding uses the default encoding mandated by the content-type MIME type.
225      * <p>
226      * If lenient detection is indicated and the detection above fails as per specifications it then attempts
227      * the following:
228      * <p>
229      * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
230      * <p>
231      * Else if the XML prolog had a charset encoding that encoding is used.
232      * <p>
233      * Else if the content type had a charset encoding that encoding is used.
234      * <p>
235      * Else 'UTF-8' is used.
236      * <p>
237      * If lenient detection is indicated an XmlReaderException is never thrown.
238      * <p>
239      * @param is InputStream to create the reader from.
240      * @param httpContentType content-type header to use for the resolution of the charset encoding.
241      * @param lenient indicates if the charset encoding detection should be relaxed.
242      * @throws IOException thrown if there is a problem reading the file.
243      * @throws XmlReaderException thrown if the charset encoding could not be determined according to the specs.
244      *
245      */
246     public XmlReader(InputStream is,String httpContentType,boolean lenient) throws IOException, XmlReaderException {
247         try {
248             doHttpStream(is,httpContentType,lenient);
249         }
250         catch (XmlReaderException ex) {
251             if (!lenient) {
252                 throw ex;
253             }
254             else {
255                 doLenientDetection(httpContentType,ex);
256             }
257         }
258     }
259 
260     private void doLenientDetection(String httpContentType,XmlReaderException ex) throws IOException {
261         if (httpContentType!=null) {
262             if (httpContentType.startsWith("text/html")) {
263                 httpContentType = httpContentType.substring("text/html".length());
264                 httpContentType = "text/xml" + httpContentType;
265                 try {
266                     doHttpStream(ex.getInputStream(),httpContentType,true);
267                     ex = null;
268                 }
269                 catch (XmlReaderException ex2) {
270                     ex = ex2;
271                 }
272             }
273         }
274         if (ex!=null) {
275             String encoding = ex.getXmlEncoding();
276             if (encoding==null) {
277                 encoding = ex.getContentTypeEncoding();
278             }
279             if (encoding==null) {
280                 encoding = UTF_8;
281             }
282             prepareReader(ex.getInputStream(),encoding);
283         }
284     }
285 
286     /**
287      * Returns the charset encoding of the XmlReader.
288      * <p>
289      * @return charset encoding.
290      *
291      */
292     public String getEncoding() {
293         return _encoding;
294     }
295 
296     public int read(char[] buf,int offset,int len) throws IOException {
297         return _reader.read(buf,offset,len);
298     }
299 
300     /**
301      * Closes the XmlReader stream.
302      * <p>
303      * @throws IOException thrown if there was a problem closing the stream.
304      *
305      */
306     public void close() throws IOException {
307         _reader.close();
308     }
309 
310     private void doRawStream(InputStream is,boolean lenient) throws IOException {
311         PushbackInputStream pis = new PushbackInputStream(is,PUSHBACK_MAX_SIZE);
312         String bomEnc = getBOMEncoding(pis);
313         String xmlGuessEnc =  getXMLGuessEncoding(pis);
314         String xmlEnc = getXmlProlog(pis,xmlGuessEnc);
315         String encoding = calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc, pis);
316         prepareReader(pis,encoding);
317     }
318 
319     private void doHttpStream(InputStream is,String httpContentType,boolean lenient) throws IOException {
320         PushbackInputStream pis = new PushbackInputStream(is,PUSHBACK_MAX_SIZE);
321         String cTMime = getContentTypeMime(httpContentType);
322         String cTEnc  = getContentTypeEncoding(httpContentType);
323         String bomEnc = getBOMEncoding(pis);
324         String xmlGuessEnc =  getXMLGuessEncoding(pis);
325         String xmlEnc = getXmlProlog(pis,xmlGuessEnc);
326         String encoding = calculateHttpEncoding(cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc, pis,lenient);
327         prepareReader(pis,encoding);
328     }
329 
330     private void prepareReader(InputStream is,String encoding) throws IOException {
331         _reader = new InputStreamReader(is,encoding);
332         _encoding = encoding;
333     }
334 
335     // InputStream is passed for XmlReaderException creation only
336     private static String calculateRawEncoding(String bomEnc, String xmlGuessEnc, String xmlEnc, InputStream is) throws IOException {
337         String encoding;
338         if (bomEnc==null) {
339             if (xmlGuessEnc==null || xmlEnc==null) {
340                 encoding = UTF_8;
341             }
342             else
343             if (xmlEnc.equals(UTF_16) && (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) {
344                 encoding = xmlGuessEnc;
345             }
346             else {
347                 encoding = xmlEnc;
348             }
349         }
350         else
351         if (bomEnc.equals(UTF_8)) {
352             if (xmlGuessEnc!=null && !xmlGuessEnc.equals(UTF_8)) {
353                 throw new XmlReaderException(RAW_EX_1.format(new Object[]{bomEnc,xmlGuessEnc,xmlEnc}),
354                                              bomEnc,xmlGuessEnc,xmlEnc,is);
355             }
356             if (xmlEnc!=null && !xmlEnc.equals(UTF_8)) {
357                 throw new XmlReaderException(RAW_EX_1.format(new Object[]{bomEnc,xmlGuessEnc,xmlEnc}),
358                                              bomEnc,xmlGuessEnc,xmlEnc,is);
359             }
360             encoding = UTF_8;
361         }
362         else
363         if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) {
364             if (xmlGuessEnc!=null && !xmlGuessEnc.equals(bomEnc)) {
365                 throw new IOException(RAW_EX_1.format(new Object[]{bomEnc,xmlGuessEnc,xmlEnc}));
366             }
367             if (xmlEnc!=null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) {
368                 throw new XmlReaderException(RAW_EX_1.format(new Object[]{bomEnc,xmlGuessEnc,xmlEnc}),
369                                              bomEnc,xmlGuessEnc,xmlEnc,is);
370             }
371             encoding =bomEnc;
372         }
373         else {
374             throw new XmlReaderException(RAW_EX_2.format(new Object[]{bomEnc,xmlGuessEnc,xmlEnc}),
375                                          bomEnc,xmlGuessEnc,xmlEnc,is);
376         }
377         return encoding;
378     }
379 
380     // InputStream is passed for XmlReaderException creation only
381     private static String calculateHttpEncoding(String cTMime, String cTEnc, String bomEnc, String xmlGuessEnc, String xmlEnc, InputStream is,boolean lenient) throws IOException {
382         String encoding;
383         if (lenient & xmlEnc!=null) {
384             encoding = xmlEnc;
385         }
386         else {
387             boolean appXml = isAppXml(cTMime);
388             boolean textXml = isTextXml(cTMime);
389             if (appXml || textXml) {
390                 if (cTEnc==null) {
391                     if (appXml) {
392                         encoding = calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc, is);
393                     }
394                     else {
395                         encoding = US_ASCII;
396                     }
397                 }
398                 else
399                 if (bomEnc!=null && (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE))) {
400                     throw new XmlReaderException(HTTP_EX_1.format(new Object[]{cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc}),
401                                                  cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc,is);
402                 }
403                 else
404                 if (cTEnc.equals(UTF_16)) {
405                     if (bomEnc!=null && bomEnc.startsWith(UTF_16)) {
406                         encoding = bomEnc;
407                     }
408                     else {
409                         throw new XmlReaderException(HTTP_EX_2.format(new Object[]{cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc}),
410                                                      cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc,is);
411                     }
412                 }
413                 else {
414                     encoding = cTEnc;
415                 }
416             }
417             else {
418                 throw new XmlReaderException(HTTP_EX_3.format(new Object[]{cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc}),
419                                              cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc,is);
420             }
421         }
422         return encoding;
423     }
424 
425     // returns MIME type or NULL if httpContentType is NULL
426     private static String getContentTypeMime(String httpContentType) {
427         String mime = null;
428         if (httpContentType!=null) {
429             int i = httpContentType.indexOf(";");
430             mime = ((i==-1) ? httpContentType : httpContentType.substring(0,i)).trim();
431         }
432         return mime;
433     }
434 
435     private static final Pattern CHARSET_PATTERN = Pattern.compile("charset=([.[^; ]]*)");
436 
437     // returns charset parameter value, NULL if not present, NULL if httpContentType is NULL
438     private static String getContentTypeEncoding(String httpContentType) {
439         String encoding = null;
440         if (httpContentType!=null) {
441             int i = httpContentType.indexOf(";");
442             if (i>-1) {
443                 String postMime = httpContentType.substring(i+1);
444                 Matcher m = CHARSET_PATTERN.matcher(postMime);
445                 encoding = (m.find()) ? m.group(1) : null;
446                 encoding = (encoding!=null) ? encoding.toUpperCase() : null;
447             }
448         }
449         return encoding;
450     }
451 
452     // returns the BOM in the stream, NULL if not present,
453     // if there was BOM the in the stream it is consumed
454     private static String getBOMEncoding(PushbackInputStream is) throws IOException {
455         String encoding = null;
456         int[] bytes = new int[3];
457         bytes[0] = is.read();
458         bytes[1] = is.read();
459         bytes[2] = is.read();
460 
461         if (bytes[0] == 0xFE && bytes[1] == 0xFF) {
462             encoding = UTF_16BE;
463             is.unread(bytes[2]);
464         }
465         else
466         if (bytes[0] == 0xFF && bytes[1] == 0xFE) {
467             encoding = UTF_16LE;
468             is.unread(bytes[2]);
469         }
470         else
471         if (bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) {
472             encoding = UTF_8;
473         }
474         else {
475             for (int i=bytes.length-1;i>=0;i--) {
476                 is.unread(bytes[i]);
477             }
478         }
479         return encoding;
480     }
481 
482     // returns the best guess for the encoding by looking the first bytes of the stream, '<?'
483     private static String getXMLGuessEncoding(PushbackInputStream is) throws IOException {
484         String encoding = null;
485         int[] bytes = new int[4];
486         bytes[0] = is.read();
487         bytes[1] = is.read();
488         bytes[2] = is.read();
489         bytes[3] = is.read();
490         for (int i=bytes.length-1;i>=0;i--) {
491             is.unread(bytes[i]);
492         }
493 
494         if (bytes[0] == 0x00 && bytes[1] == 0x3C && bytes[2] == 0x00 && bytes[3] == 0x3F) {
495                 encoding = UTF_16BE;
496         }
497         else
498         if (bytes[0] == 0x3C && bytes[1] == 0x00 && bytes[2] == 0x3F && bytes[3] == 0x00) {
499                 encoding = UTF_16LE;
500         }
501         else
502         if (bytes[0] == 0x3C && bytes[1] == 0x3F && bytes[2] == 0x78 && bytes[3] == 0x6D) {
503             encoding = UTF_8;
504         }
505         return encoding;
506     }
507 
508 
509     private static final Pattern ENCODING_PATTERN =
510         Pattern.compile("<\\?xml.*encoding[\\s]*=[\\s]*((?:\".[^\"]*\")|(?:'.[^']*')).*\\?>", Pattern.MULTILINE);
511 
512     // returns the encoding declared in the <?xml encoding=...?>,  NULL if none
513     private static String getXmlProlog(PushbackInputStream is,String guessedEnc) throws IOException {
514         String encoding = null;
515         if (guessedEnc!=null) {
516             byte[] bytes = new byte[PUSHBACK_MAX_SIZE];
517             int offset = 0;
518             int max = PUSHBACK_MAX_SIZE;
519             int c = is.read(bytes,offset,max);
520             while (c!=-1 && offset<PUSHBACK_MAX_SIZE) {
521                 offset += c;
522                 max -= c;
523                 c = is.read(bytes,offset,max);
524             }
525             int bytesRead = offset;
526             if (bytesRead>0) {
527                 is.unread(bytes,0,bytesRead);
528                 Reader reader = new InputStreamReader(new ByteArrayInputStream(bytes,0,bytesRead), guessedEnc);
529                 BufferedReader br = new BufferedReader(reader);
530                 StringBuffer prolog = new StringBuffer(PUSHBACK_MAX_SIZE);
531                 String line = br.readLine();
532                 while (line != null) {
533                   prolog.append(line).append("\n");
534                   line = br.readLine();
535                 }
536                 Matcher m = ENCODING_PATTERN.matcher(prolog);
537                 if (m.find()) {
538                     encoding = m.group(1).toUpperCase();
539                     encoding = encoding.substring(1,encoding.length()-1);
540                 }
541             }
542         }
543         return encoding;
544     }
545 
546     // indicates if the MIME type belongs to the APPLICATION XML family
547     private static boolean isAppXml(String mime) {
548         return mime!=null &&
549                (mime.equals("application/xml") ||
550                 mime.equals("application/xml-dtd") ||
551                 mime.equals("application/xml-external-parsed-entity") ||
552                 (mime.startsWith("application/") && mime.endsWith("+xml")));
553     }
554 
555     // indicates if the MIME type belongs to the TEXT XML family
556     private static boolean isTextXml(String mime) {
557         return mime!=null &&
558                (mime.equals("text/xml") ||
559                 mime.equals("text/xml-external-parsed-entity") ||
560                 (mime.startsWith("text/") && mime.endsWith("+xml")));
561     }
562 
563     private static final MessageFormat RAW_EX_1 = new MessageFormat(
564             "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch");
565 
566     private static final MessageFormat RAW_EX_2 = new MessageFormat(
567             "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM");
568 
569     private static final MessageFormat HTTP_EX_1 = new MessageFormat(
570             "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be NULL");
571 
572     private static final MessageFormat HTTP_EX_2 = new MessageFormat(
573             "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch");
574 
575     private static final MessageFormat HTTP_EX_3 = new MessageFormat(
576             "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Invalid MIME");
577 
578 }