View Javadoc

1   /*
2    * Copyright 2005 Sun Microsystems, Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   *
16   */
17  package com.sun.syndication.io.impl;
18  
19  import java.io.IOException;
20  import java.io.Reader;
21  import java.io.InputStreamReader;
22  import java.io.BufferedReader;
23  import java.util.HashMap;
24  import java.util.Map;
25  import java.util.regex.Pattern;
26  import java.util.regex.Matcher;
27  import java.net.URL;
28  
/**
 * Character stream wrapper that cleans up an XML stream while it is read.
 * <p>
 * Two fixes are applied to the wrapped reader:
 * <ul>
 * <li>Leading whitespace and leading XML comments (anything of the form
 *     {@code <!-- ... -->} that appears before the first other content) are
 *     discarded, so that the first character returned is the first character of
 *     real content.</li>
 * <li>Named HTML 4 character entities (for example {@code &nbsp;}) are rewritten
 *     into numeric character references (for example {@code &#160;}). Entities
 *     that are not in the table are passed through unchanged.</li>
 * </ul>
 * Mark/reset is not supported.
 *
 * @author Alejandro Abdelnur
 */
public class XmlFixerReader extends Reader {

    // States of the entity-translation machine used by read().
    private static final int READING_STREAM = 0;   // passing characters straight through
    private static final int READING_ENTITY = 1;   // collecting "&...;" into the buffer
    private static final int CONSUMING_BUFFER = 3; // handing out buffered characters

    // States of the leading whitespace/comment trimmer used by trimStream().
    private static final int TRIM_SKIPPING = 0;          // skipping whitespace
    private static final int TRIM_AFTER_LT = 1;          // seen "<"
    private static final int TRIM_AFTER_BANG = 2;        // seen "<!"
    private static final int TRIM_AFTER_BANG_DASH = 3;   // seen "<!-"
    private static final int TRIM_IN_COMMENT = 4;        // inside "<!-- ..."
    private static final int TRIM_COMMENT_DASH = 5;      // inside comment, seen "-"
    private static final int TRIM_COMMENT_DASH_DASH = 6; // inside comment, seen "--"

    /** Maps a named entity, including '&amp;' and ';', to its numeric character reference. */
    private static final Map CODED_ENTITIES = new HashMap();

    static {
        // note: refer to Character entity references in HTML 4
        // at http://www.w3.org/TR/REC-html40/sgml/entities.html

        // Character entity set.
        // HTMLlat1 "-//W3C//ENTITIES Latin 1//EN//HTML"

        CODED_ENTITIES.put("&nbsp;",  "&#160;");
        CODED_ENTITIES.put("&iexcl;", "&#161;");
        CODED_ENTITIES.put("&cent;",  "&#162;");
        CODED_ENTITIES.put("&pound;", "&#163;");
        CODED_ENTITIES.put("&curren;","&#164;");
        CODED_ENTITIES.put("&yen;",   "&#165;");
        CODED_ENTITIES.put("&brvbar;","&#166;");
        CODED_ENTITIES.put("&sect;",  "&#167;");
        CODED_ENTITIES.put("&uml;",   "&#168;");
        CODED_ENTITIES.put("&copy;",  "&#169;");
        CODED_ENTITIES.put("&ordf;",  "&#170;");
        CODED_ENTITIES.put("&laquo;", "&#171;");
        CODED_ENTITIES.put("&not;",   "&#172;");
        CODED_ENTITIES.put("&shy;",   "&#173;");
        CODED_ENTITIES.put("&reg;",   "&#174;");
        CODED_ENTITIES.put("&macr;",  "&#175;");
        CODED_ENTITIES.put("&deg;",   "&#176;");
        CODED_ENTITIES.put("&plusmn;","&#177;");
        CODED_ENTITIES.put("&sup2;",  "&#178;");
        CODED_ENTITIES.put("&sup3;",  "&#179;");
        CODED_ENTITIES.put("&acute;", "&#180;");
        CODED_ENTITIES.put("&micro;", "&#181;");
        CODED_ENTITIES.put("&para;",  "&#182;");
        CODED_ENTITIES.put("&middot;","&#183;");
        CODED_ENTITIES.put("&cedil;", "&#184;");
        CODED_ENTITIES.put("&sup1;",  "&#185;");
        CODED_ENTITIES.put("&ordm;",  "&#186;");
        CODED_ENTITIES.put("&raquo;", "&#187;");
        CODED_ENTITIES.put("&frac14;","&#188;");
        CODED_ENTITIES.put("&frac12;","&#189;");
        CODED_ENTITIES.put("&frac34;","&#190;");
        CODED_ENTITIES.put("&iquest;","&#191;");
        CODED_ENTITIES.put("&Agrave;","&#192;");
        CODED_ENTITIES.put("&Aacute;","&#193;");
        CODED_ENTITIES.put("&Acirc;", "&#194;");
        CODED_ENTITIES.put("&Atilde;","&#195;");
        CODED_ENTITIES.put("&Auml;",  "&#196;");
        CODED_ENTITIES.put("&Aring;", "&#197;");
        CODED_ENTITIES.put("&AElig;", "&#198;");
        CODED_ENTITIES.put("&Ccedil;","&#199;");
        CODED_ENTITIES.put("&Egrave;","&#200;");
        CODED_ENTITIES.put("&Eacute;","&#201;");
        CODED_ENTITIES.put("&Ecirc;", "&#202;");
        CODED_ENTITIES.put("&Euml;",  "&#203;");
        CODED_ENTITIES.put("&Igrave;","&#204;");
        CODED_ENTITIES.put("&Iacute;","&#205;");
        CODED_ENTITIES.put("&Icirc;", "&#206;");
        CODED_ENTITIES.put("&Iuml;",  "&#207;");
        CODED_ENTITIES.put("&ETH;",   "&#208;");
        CODED_ENTITIES.put("&Ntilde;","&#209;");
        CODED_ENTITIES.put("&Ograve;","&#210;");
        CODED_ENTITIES.put("&Oacute;","&#211;");
        CODED_ENTITIES.put("&Ocirc;", "&#212;");
        CODED_ENTITIES.put("&Otilde;","&#213;");
        CODED_ENTITIES.put("&Ouml;",  "&#214;");
        CODED_ENTITIES.put("&times;", "&#215;");
        CODED_ENTITIES.put("&Oslash;","&#216;");
        CODED_ENTITIES.put("&Ugrave;","&#217;");
        CODED_ENTITIES.put("&Uacute;","&#218;");
        CODED_ENTITIES.put("&Ucirc;", "&#219;");
        CODED_ENTITIES.put("&Uuml;",  "&#220;");
        CODED_ENTITIES.put("&Yacute;","&#221;");
        CODED_ENTITIES.put("&THORN;", "&#222;");
        CODED_ENTITIES.put("&szlig;", "&#223;");
        CODED_ENTITIES.put("&agrave;","&#224;");
        CODED_ENTITIES.put("&aacute;","&#225;");
        CODED_ENTITIES.put("&acirc;", "&#226;");
        CODED_ENTITIES.put("&atilde;","&#227;");
        CODED_ENTITIES.put("&auml;",  "&#228;");
        CODED_ENTITIES.put("&aring;", "&#229;");
        CODED_ENTITIES.put("&aelig;", "&#230;");
        CODED_ENTITIES.put("&ccedil;","&#231;");
        CODED_ENTITIES.put("&egrave;","&#232;");
        CODED_ENTITIES.put("&eacute;","&#233;");
        CODED_ENTITIES.put("&ecirc;", "&#234;");
        CODED_ENTITIES.put("&euml;",  "&#235;");
        CODED_ENTITIES.put("&igrave;","&#236;");
        CODED_ENTITIES.put("&iacute;","&#237;");
        CODED_ENTITIES.put("&icirc;", "&#238;");
        CODED_ENTITIES.put("&iuml;",  "&#239;");
        CODED_ENTITIES.put("&eth;",   "&#240;");
        CODED_ENTITIES.put("&ntilde;","&#241;");
        CODED_ENTITIES.put("&ograve;","&#242;");
        CODED_ENTITIES.put("&oacute;","&#243;");
        CODED_ENTITIES.put("&ocirc;", "&#244;");
        CODED_ENTITIES.put("&otilde;","&#245;");
        CODED_ENTITIES.put("&ouml;",  "&#246;");
        CODED_ENTITIES.put("&divide;","&#247;");
        CODED_ENTITIES.put("&oslash;","&#248;");
        CODED_ENTITIES.put("&ugrave;","&#249;");
        CODED_ENTITIES.put("&uacute;","&#250;");
        CODED_ENTITIES.put("&ucirc;", "&#251;");
        CODED_ENTITIES.put("&uuml;",  "&#252;");
        CODED_ENTITIES.put("&yacute;","&#253;");
        CODED_ENTITIES.put("&thorn;", "&#254;");
        CODED_ENTITIES.put("&yuml;",  "&#255;");

        // Mathematical, Greek and Symbolic characters for HTML.
        // HTMLsymbol "-//W3C//ENTITIES Symbols//EN//HTML"

        CODED_ENTITIES.put("&fnof;",     "&#402;");
        CODED_ENTITIES.put("&Alpha;",    "&#913;");
        CODED_ENTITIES.put("&Beta;",     "&#914;");
        CODED_ENTITIES.put("&Gamma;",    "&#915;");
        CODED_ENTITIES.put("&Delta;",    "&#916;");
        CODED_ENTITIES.put("&Epsilon;",  "&#917;");
        CODED_ENTITIES.put("&Zeta;",     "&#918;");
        CODED_ENTITIES.put("&Eta;",      "&#919;");
        CODED_ENTITIES.put("&Theta;",    "&#920;");
        CODED_ENTITIES.put("&Iota;",     "&#921;");
        CODED_ENTITIES.put("&Kappa;",    "&#922;");
        CODED_ENTITIES.put("&Lambda;",   "&#923;");
        CODED_ENTITIES.put("&Mu;",       "&#924;");
        CODED_ENTITIES.put("&Nu;",       "&#925;");
        CODED_ENTITIES.put("&Xi;",       "&#926;");
        CODED_ENTITIES.put("&Omicron;",  "&#927;");
        CODED_ENTITIES.put("&Pi;",       "&#928;");
        CODED_ENTITIES.put("&Rho;",      "&#929;");
        CODED_ENTITIES.put("&Sigma;",    "&#931;");
        CODED_ENTITIES.put("&Tau;",      "&#932;");
        CODED_ENTITIES.put("&Upsilon;",  "&#933;");
        CODED_ENTITIES.put("&Phi;",      "&#934;");
        CODED_ENTITIES.put("&Chi;",      "&#935;");
        CODED_ENTITIES.put("&Psi;",      "&#936;");
        CODED_ENTITIES.put("&Omega;",    "&#937;");
        CODED_ENTITIES.put("&alpha;",    "&#945;");
        CODED_ENTITIES.put("&beta;",     "&#946;");
        CODED_ENTITIES.put("&gamma;",    "&#947;");
        CODED_ENTITIES.put("&delta;",    "&#948;");
        CODED_ENTITIES.put("&epsilon;",  "&#949;");
        CODED_ENTITIES.put("&zeta;",     "&#950;");
        CODED_ENTITIES.put("&eta;",      "&#951;");
        CODED_ENTITIES.put("&theta;",    "&#952;");
        CODED_ENTITIES.put("&iota;",     "&#953;");
        CODED_ENTITIES.put("&kappa;",    "&#954;");
        CODED_ENTITIES.put("&lambda;",   "&#955;");
        CODED_ENTITIES.put("&mu;",       "&#956;");
        CODED_ENTITIES.put("&nu;",       "&#957;");
        CODED_ENTITIES.put("&xi;",       "&#958;");
        CODED_ENTITIES.put("&omicron;",  "&#959;");
        CODED_ENTITIES.put("&pi;",       "&#960;");
        CODED_ENTITIES.put("&rho;",      "&#961;");
        CODED_ENTITIES.put("&sigmaf;",   "&#962;");
        CODED_ENTITIES.put("&sigma;",    "&#963;");
        CODED_ENTITIES.put("&tau;",      "&#964;");
        CODED_ENTITIES.put("&upsilon;",  "&#965;");
        CODED_ENTITIES.put("&phi;",      "&#966;");
        CODED_ENTITIES.put("&chi;",      "&#967;");
        CODED_ENTITIES.put("&psi;",      "&#968;");
        CODED_ENTITIES.put("&omega;",    "&#969;");
        CODED_ENTITIES.put("&thetasym;", "&#977;");
        CODED_ENTITIES.put("&upsih;",    "&#978;");
        CODED_ENTITIES.put("&piv;",      "&#982;");
        CODED_ENTITIES.put("&bull;",     "&#8226;");
        CODED_ENTITIES.put("&hellip;",   "&#8230;");
        CODED_ENTITIES.put("&prime;",    "&#8242;");
        CODED_ENTITIES.put("&Prime;",    "&#8243;");
        CODED_ENTITIES.put("&oline;",    "&#8254;");
        CODED_ENTITIES.put("&frasl;",    "&#8260;");
        CODED_ENTITIES.put("&weierp;",   "&#8472;");
        CODED_ENTITIES.put("&image;",    "&#8465;");
        CODED_ENTITIES.put("&real;",     "&#8476;");
        CODED_ENTITIES.put("&trade;",    "&#8482;");
        CODED_ENTITIES.put("&alefsym;",  "&#8501;");
        CODED_ENTITIES.put("&larr;",     "&#8592;");
        CODED_ENTITIES.put("&uarr;",     "&#8593;");
        CODED_ENTITIES.put("&rarr;",     "&#8594;");
        CODED_ENTITIES.put("&darr;",     "&#8595;");
        CODED_ENTITIES.put("&harr;",     "&#8596;");
        CODED_ENTITIES.put("&crarr;",    "&#8629;");
        CODED_ENTITIES.put("&lArr;",     "&#8656;");
        CODED_ENTITIES.put("&uArr;",     "&#8657;");
        CODED_ENTITIES.put("&rArr;",     "&#8658;");
        CODED_ENTITIES.put("&dArr;",     "&#8659;");
        CODED_ENTITIES.put("&hArr;",     "&#8660;");
        CODED_ENTITIES.put("&forall;",   "&#8704;");
        CODED_ENTITIES.put("&part;",     "&#8706;");
        CODED_ENTITIES.put("&exist;",    "&#8707;");
        CODED_ENTITIES.put("&empty;",    "&#8709;");
        CODED_ENTITIES.put("&nabla;",    "&#8711;");
        CODED_ENTITIES.put("&isin;",     "&#8712;");
        CODED_ENTITIES.put("&notin;",    "&#8713;");
        CODED_ENTITIES.put("&ni;",       "&#8715;");
        CODED_ENTITIES.put("&prod;",     "&#8719;");
        CODED_ENTITIES.put("&sum;",      "&#8721;");
        CODED_ENTITIES.put("&minus;",    "&#8722;");
        CODED_ENTITIES.put("&lowast;",   "&#8727;");
        CODED_ENTITIES.put("&radic;",    "&#8730;");
        CODED_ENTITIES.put("&prop;",     "&#8733;");
        CODED_ENTITIES.put("&infin;",    "&#8734;");
        CODED_ENTITIES.put("&ang;",      "&#8736;");
        CODED_ENTITIES.put("&and;",      "&#8743;");
        CODED_ENTITIES.put("&or;",       "&#8744;");
        CODED_ENTITIES.put("&cap;",      "&#8745;");
        CODED_ENTITIES.put("&cup;",      "&#8746;");
        CODED_ENTITIES.put("&int;",      "&#8747;");
        CODED_ENTITIES.put("&there4;",   "&#8756;");
        CODED_ENTITIES.put("&sim;",      "&#8764;");
        CODED_ENTITIES.put("&cong;",     "&#8773;");
        CODED_ENTITIES.put("&asymp;",    "&#8776;");
        CODED_ENTITIES.put("&ne;",       "&#8800;");
        CODED_ENTITIES.put("&equiv;",    "&#8801;");
        CODED_ENTITIES.put("&le;",       "&#8804;");
        CODED_ENTITIES.put("&ge;",       "&#8805;");
        CODED_ENTITIES.put("&sub;",      "&#8834;");
        CODED_ENTITIES.put("&sup;",      "&#8835;");
        CODED_ENTITIES.put("&nsub;",     "&#8836;");
        CODED_ENTITIES.put("&sube;",     "&#8838;");
        CODED_ENTITIES.put("&supe;",     "&#8839;");
        CODED_ENTITIES.put("&oplus;",    "&#8853;");
        CODED_ENTITIES.put("&otimes;",   "&#8855;");
        CODED_ENTITIES.put("&perp;",     "&#8869;");
        CODED_ENTITIES.put("&sdot;",     "&#8901;");
        CODED_ENTITIES.put("&lceil;",    "&#8968;");
        CODED_ENTITIES.put("&rceil;",    "&#8969;");
        CODED_ENTITIES.put("&lfloor;",   "&#8970;");
        CODED_ENTITIES.put("&rfloor;",   "&#8971;");
        CODED_ENTITIES.put("&lang;",     "&#9001;");
        CODED_ENTITIES.put("&rang;",     "&#9002;");
        CODED_ENTITIES.put("&loz;",      "&#9674;");
        CODED_ENTITIES.put("&spades;",   "&#9824;");
        CODED_ENTITIES.put("&clubs;",    "&#9827;");
        CODED_ENTITIES.put("&hearts;",   "&#9829;");
        CODED_ENTITIES.put("&diams;",    "&#9830;");

        // Special characters for HTML.
        // HTMLspecial "-//W3C//ENTITIES Special//EN//HTML"

        CODED_ENTITIES.put("&quot;",      "&#34;");
        CODED_ENTITIES.put("&amp;",       "&#38;");
        CODED_ENTITIES.put("&lt;",        "&#60;");
        CODED_ENTITIES.put("&gt;",        "&#62;");
        CODED_ENTITIES.put("&OElig;",     "&#338;");
        CODED_ENTITIES.put("&oelig;",     "&#339;");
        CODED_ENTITIES.put("&Scaron;",    "&#352;");
        CODED_ENTITIES.put("&scaron;",    "&#353;");
        CODED_ENTITIES.put("&Yuml;",      "&#376;");
        CODED_ENTITIES.put("&circ;",      "&#710;");
        CODED_ENTITIES.put("&tilde;",     "&#732;");
        CODED_ENTITIES.put("&ensp;",      "&#8194;");
        CODED_ENTITIES.put("&emsp;",      "&#8195;");
        CODED_ENTITIES.put("&thinsp;",    "&#8201;");
        CODED_ENTITIES.put("&zwnj;",      "&#8204;");
        CODED_ENTITIES.put("&zwj;",       "&#8205;");
        CODED_ENTITIES.put("&lrm;",       "&#8206;");
        CODED_ENTITIES.put("&rlm;",       "&#8207;");
        CODED_ENTITIES.put("&ndash;",     "&#8211;");
        CODED_ENTITIES.put("&mdash;",     "&#8212;");
        CODED_ENTITIES.put("&lsquo;",     "&#8216;");
        CODED_ENTITIES.put("&rsquo;",     "&#8217;");
        CODED_ENTITIES.put("&sbquo;",     "&#8218;");
        CODED_ENTITIES.put("&ldquo;",     "&#8220;");
        CODED_ENTITIES.put("&rdquo;",     "&#8221;");
        CODED_ENTITIES.put("&bdquo;",     "&#8222;");
        CODED_ENTITIES.put("&dagger;",    "&#8224;");
        CODED_ENTITIES.put("&Dagger;",    "&#8225;");
        CODED_ENTITIES.put("&permil;",    "&#8240;");
        CODED_ENTITIES.put("&lsaquo;",    "&#8249;");
        CODED_ENTITIES.put("&rsaquo;",    "&#8250;");
        CODED_ENTITIES.put("&euro;",      "&#8364;");
    }

    /**
     * Matches one candidate entity in a string. The accepted characters are the
     * same ones read() accepts while collecting an entity (letters, digits and
     * '#'), so names such as "&amp;frac12;" or "&amp;sup2;" are found.
     */
    private static final Pattern ENTITIES_PATTERN = Pattern.compile("&[A-Za-z0-9#]+;");

    /** The wrapped reader that supplies the raw characters. */
    protected Reader in;

    /** True once the leading whitespace/comments have been trimmed. */
    private boolean trimmed;

    /** Characters that were read ahead and still have to be returned by read(). */
    private final StringBuffer _buffer = new StringBuffer();

    /** Index of the next character of the buffer to hand out. */
    private int _bufferPos;

    /** Current state of the entity-translation machine. */
    private int _state = READING_STREAM;

    /**
     * Command line helper: fetches the URL given as the first argument, runs it
     * through an XmlFixerReader and prints the result line by line.
     * The bytes are decoded with the platform default charset.
     *
     * @param args args[0] is the URL to read.
     * @throws Exception if the URL is malformed or reading fails.
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 1) {
            System.err.println("Usage: XmlFixerReader <url>");
            return;
        }
        Reader source = new InputStreamReader(new URL(args[0]).openStream());
        BufferedReader lines = new BufferedReader(new XmlFixerReader(source));
        try {
            String line = lines.readLine();
            while (line != null) {
                System.out.println(line);
                line = lines.readLine();
            }
        }
        finally {
            // closing the outermost reader closes the wrapped stream too
            lines.close();
        }
    }

    /**
     * Creates a fixer on top of the given reader.
     *
     * @param in reader supplying the raw XML characters; it is also used as the
     *        synchronization lock object passed to the Reader super class.
     */
    public XmlFixerReader(Reader in) {
        super(in);
        this.in = in;
    }

    /**
     * Consumes leading whitespace and leading XML comments from the wrapped reader.
     * <p>
     * Characters that were read while checking for a comment start, but turned out
     * not to belong to a comment, are left in the buffer and the translation state
     * is switched to "consuming buffer" so that read() returns them first. An
     * unterminated comment at end of stream is handed out the same way.
     *
     * @return false if the stream ended with nothing but whitespace and complete
     *         comments, true if there is content to return.
     * @throws IOException if the wrapped reader fails.
     */
    private boolean trimStream() throws IOException {
        int state = TRIM_SKIPPING;
        while (true) {
            int c = in.read();
            if (c == -1) {
                if (state == TRIM_SKIPPING) {
                    return false;
                }
                // whatever was read ahead (e.g. an unterminated comment) is content
                _state = CONSUMING_BUFFER;
                return true;
            }
            switch (state) {
                case TRIM_SKIPPING:
                    // all four XML whitespace characters, so CRLF and tab
                    // indented streams are trimmed as well
                    if (c == ' ' || c == '\n' || c == '\r' || c == '\t') {
                        break;
                    }
                    _buffer.setLength(0);
                    _bufferPos = 0;
                    _buffer.append((char) c);
                    if (c != '<') {
                        _state = CONSUMING_BUFFER;
                        return true;
                    }
                    state = TRIM_AFTER_LT;
                    break;
                case TRIM_AFTER_LT:
                    _buffer.append((char) c);
                    if (c != '!') {
                        _state = CONSUMING_BUFFER;
                        return true;
                    }
                    state = TRIM_AFTER_BANG;
                    break;
                case TRIM_AFTER_BANG:
                    _buffer.append((char) c);
                    if (c != '-') {
                        _state = CONSUMING_BUFFER;
                        return true;
                    }
                    state = TRIM_AFTER_BANG_DASH;
                    break;
                case TRIM_AFTER_BANG_DASH:
                    _buffer.append((char) c);
                    if (c != '-') {
                        _state = CONSUMING_BUFFER;
                        return true;
                    }
                    state = TRIM_IN_COMMENT;
                    break;
                case TRIM_IN_COMMENT:
                    _buffer.append((char) c);
                    if (c == '-') {
                        state = TRIM_COMMENT_DASH;
                    }
                    break;
                case TRIM_COMMENT_DASH:
                    _buffer.append((char) c);
                    state = (c == '-') ? TRIM_COMMENT_DASH_DASH : TRIM_IN_COMMENT;
                    break;
                case TRIM_COMMENT_DASH_DASH:
                    if (c == '>') {
                        // comment complete: drop it and keep trimming
                        _buffer.setLength(0);
                        state = TRIM_SKIPPING;
                    }
                    else {
                        _buffer.append((char) c);
                        // a further '-' still leaves "--" as the last two
                        // characters, so "--->" must also end the comment
                        if (c != '-') {
                            state = TRIM_IN_COMMENT;
                        }
                    }
                    break;
                default:
                    throw new IOException("It shouldn't happen");
            }
        }
    }

    /**
     * Reads a single character, trimming the stream head on the first call and
     * translating named entities into numeric character references.
     *
     * @return the character read, or -1 at end of stream.
     * @throws IOException if the wrapped reader fails.
     */
    public int read() throws IOException {
        if (!trimmed) { // trims XML stream
            trimmed = true;
            if (!trimStream()) {
                return -1;
            }
        }
        while (true) { // converts literal entities to coded entities
            int c;
            switch (_state) {
                case READING_STREAM:
                    c = in.read();
                    if (c != '&') {
                        return c; // ordinary character or -1 at end of stream
                    }
                    _buffer.setLength(0);
                    _bufferPos = 0;
                    _buffer.append((char) c);
                    _state = READING_ENTITY;
                    break;
                case READING_ENTITY:
                    c = in.read();
                    if (c == -1) {
                        // stream ended inside a candidate entity: emit it as is
                        _state = CONSUMING_BUFFER;
                        break;
                    }
                    _buffer.append((char) c);
                    if (c == ';') {
                        String codedEntity = (String) CODED_ENTITIES.get(_buffer.toString());
                        if (codedEntity != null) {
                            _buffer.setLength(0);
                            _buffer.append(codedEntity);
                        } // else we leave what was in the stream
                        _state = CONSUMING_BUFFER;
                    }
                    else
                    if (!isEntityChar(c)) {
                        // not an entity after all: emit what was collected
                        _state = CONSUMING_BUFFER;
                    }
                    break;
                case CONSUMING_BUFFER:
                    if (_bufferPos < _buffer.length()) {
                        return _buffer.charAt(_bufferPos++);
                    }
                    _state = READING_STREAM;
                    break;
                default:
                    throw new IOException("It shouldn't happen");
            }
        }
    }

    /** Tells whether the character may appear between '&amp;' and ';' of an entity. */
    private static boolean isEntityChar(int c) {
        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c == '#') || (c >= '0' && c <= '9');
    }

    /**
     * Reads up to len characters into the given array.
     *
     * @param buffer destination array.
     * @param offset index of the first element to write.
     * @param len maximum number of characters to read.
     * @return the number of characters stored, 0 if len is 0, or -1 if the end
     *         of the stream was reached before any character was read.
     * @throws IndexOutOfBoundsException if offset or len is negative, or if
     *         offset + len exceeds the array length.
     * @throws IOException if the wrapped reader fails.
     */
    public int read(char[] buffer,int offset,int len) throws IOException {
        if (offset < 0 || len < 0 || len > buffer.length - offset) {
            throw new IndexOutOfBoundsException();
        }
        if (len == 0) {
            return 0; // must neither consume a character nor touch the array
        }
        int c = read();
        if (c == -1) {
            return -1;
        }
        int charsRead = 0;
        buffer[offset + (charsRead++)] = (char) c;
        while (charsRead < len && (c = read()) > -1) {
            buffer[offset + (charsRead++)] = (char) c;
        }
        return charsRead;
    }

    /**
     * Skips characters of the fixed (trimmed and translated) stream.
     *
     * @param n number of characters to skip.
     * @return the number of characters actually skipped, which is smaller than
     *         n when the end of the stream is reached first.
     * @throws IllegalArgumentException if n is negative.
     * @throws IOException if the wrapped reader fails.
     */
    public long skip(long n) throws IOException {
        if (n < 0) {
            throw new IllegalArgumentException("'n' cannot be negative");
        }
        long skipped = 0;
        // only successful reads are counted, the one hitting end of stream is not
        while (skipped < n && read() > -1) {
            skipped++;
        }
        return skipped;
    }

    /**
     * Tells whether a read can be attempted: true while an entity or buffered
     * characters are being processed, otherwise whatever the wrapped reader reports.
     */
    public boolean ready() throws IOException {
        return (_state != READING_STREAM) || in.ready();
    }

    /** Mark is not supported. */
    public boolean markSupported() {
        return false;
    }

    /** Always fails, mark is not supported. */
    public void mark(int readAheadLimit) throws IOException {
        throw new IOException("Stream does not support mark");
    }

    /** Always fails, mark is not supported. */
    public void reset() throws IOException {
        throw new IOException("Stream does not support mark");
    }

    /** Closes the wrapped reader. */
    public void close() throws IOException {
        in.close();
    }

    //
    // It shouldn't be here but well, just reusing the CODED_ENTITIES Map :)
    //

    /**
     * Replaces every known named entity of the string by its numeric character
     * reference; unknown entities and all other text are copied unchanged.
     *
     * @param s text to process, may be null.
     * @return the processed text; the same instance if it contains no '&amp;',
     *         null if s is null.
     */
    public String processHtmlEntities(String s) {
        if (s == null || s.indexOf('&') == -1) {
            return s;
        }
        StringBuffer sb = new StringBuffer(s.length());
        Matcher m = ENTITIES_PATTERN.matcher(s);
        int pos = 0;
        while (m.find()) {
            sb.append(s.substring(pos, m.start())); // text before the entity
            String literalEntity = m.group();
            String codedEntity = (String) CODED_ENTITIES.get(literalEntity);
            sb.append((codedEntity != null) ? codedEntity : literalEntity);
            pos = m.end();
        }
        sb.append(s.substring(pos)); // text after the last entity
        return sb.toString();
    }

}