1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package com.sun.syndication.io.impl;
18
19 import java.io.IOException;
20 import java.io.Reader;
21 import java.io.InputStreamReader;
22 import java.io.BufferedReader;
23 import java.util.HashMap;
24 import java.util.Map;
25 import java.util.regex.Pattern;
26 import java.util.regex.Matcher;
27 import java.net.URL;
28
29
30
31
/**
 * Character-stream filter that repairs two common defects of "almost XML" feeds
 * before they reach an XML parser:
 * <ul>
 *   <li>anything that precedes the first real content (whitespace, a stray byte
 *       order mark and XML comments) is dropped, so the XML declaration or root
 *       element is the first thing the parser sees;</li>
 *   <li>HTML 4 named character entities (for example the non-breaking space or
 *       e-acute entities), which are undefined in plain XML, are rewritten as the
 *       equivalent numeric character references.</li>
 * </ul>
 * Unknown entities, numeric references and bare ampersands are passed through
 * untouched. Mark/reset is not supported.
 * <p>
 * The class deliberately sticks to the pre-generics language level used by the
 * rest of the file (raw collections, {@code StringBuffer}).
 */
public class XmlFixerReader extends Reader {

    /**
     * Small manual test driver: fetches the URL given as the only argument, runs
     * it through the fixer and prints the result line by line.
     *
     * @param args a single element, the URL to read.
     * @throws Exception if the URL is malformed or the stream cannot be read.
     */
    public static void main(String[] args) throws Exception {
        if (args.length != 1) {
            System.err.println("Usage: XmlFixerReader <url>");
            return;
        }
        Reader r = new InputStreamReader(new URL(args[0]).openStream());
        try {
            BufferedReader br = new BufferedReader(new XmlFixerReader(r));
            String l = br.readLine();
            while (l != null) {
                System.out.println(l);
                l = br.readLine();
            }
        }
        finally {
            // Closing the innermost reader releases the network stream.
            r.close();
        }
    }

    // --- states of the entity-fixing machine driven by read() ---

    /** Characters are copied straight from the underlying reader. */
    private static final int STATE_PASS_THROUGH = 0;
    /** An ampersand was seen; the candidate entity is being collected in the buffer. */
    private static final int STATE_IN_ENTITY = 1;
    /** A complete "&name;" is in the buffer and must be looked up. */
    private static final int STATE_RESOLVE_ENTITY = 2;
    /** The buffer content is being handed out one character per read(). */
    private static final int STATE_FLUSH_BUFFER = 3;

    // --- states of the prolog-trimming machine used by trimStream() ---

    private static final int TRIM_LEADING = 0;           // skipping whitespace
    private static final int TRIM_LT = 1;                // seen "<"
    private static final int TRIM_LT_BANG = 2;           // seen "<!"
    private static final int TRIM_LT_BANG_DASH = 3;      // seen "<!-"
    private static final int TRIM_COMMENT = 4;           // inside "<!-- ..."
    private static final int TRIM_COMMENT_DASH = 5;      // inside comment, seen "-"
    private static final int TRIM_COMMENT_DASH_DASH = 6; // inside comment, seen "--"

    /** The wrapped reader. */
    protected Reader in;

    /** Whether the leading whitespace/comments have already been stripped. */
    private boolean trimmed;
    /** Look-ahead characters that still have to be returned to the caller. */
    private StringBuffer _buffer;
    /** Index of the next character of {@link #_buffer} to return. */
    private int _bufferPos;
    /** Current state of the entity machine (one of the STATE_* constants). */
    private int _state;
    /**
     * Set when an ampersand ended a non-entity (as in "AT&amp;T" written with bare
     * ampersands followed directly by an entity): once the buffer is flushed, a
     * new entity candidate starts with that ampersand.
     */
    private boolean _pendingAmpersand;

    /**
     * Wraps the given reader.
     *
     * @param in reader to fix; also used as the synchronization lock of this reader.
     */
    public XmlFixerReader(Reader in) {
        super(in);
        this.in = in;
        _buffer = new StringBuffer();
        _state = STATE_PASS_THROUGH;
    }

    /** Tells whether the character may be silently dropped in front of the document. */
    private static boolean isLeadingNoise(int c) {
        // 0xFEFF is a byte order mark that survived decoding; parsers reject it in the prolog.
        return c == ' ' || c == '\n' || c == '\r' || c == '\t' || c == 0xFEFF;
    }

    /** Tells whether the character may appear between the '&' and ';' of a reference. */
    private static boolean isEntityChar(int c) {
        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '#';
    }

    /** Resets the buffer so that it holds only the ampersand opening an entity candidate. */
    private void startEntity() {
        _buffer.setLength(0);
        _bufferPos = 0;
        _buffer.append('&');
        _state = STATE_IN_ENTITY;
    }

    /**
     * Consumes whitespace and XML comments from the start of the stream.
     * <p>
     * Whatever was read ahead while deciding that the real content has started
     * (or an unterminated comment, if the stream ends inside one) is left in the
     * buffer and the entity machine is switched to flush mode so it is returned
     * by the following reads.
     *
     * @return false if the stream holds nothing but whitespace and comments.
     * @throws IOException if the underlying reader fails.
     */
    private boolean trimStream() throws IOException {
        int state = TRIM_LEADING;
        while (true) {
            int c = in.read();
            if (c == -1) {
                if (state == TRIM_LEADING) {
                    return false;
                }
                // EOF in the middle of a look-ahead: hand out what was collected.
                _state = STATE_FLUSH_BUFFER;
                return true;
            }
            switch (state) {
                case TRIM_LEADING:
                    if (isLeadingNoise(c)) {
                        break;
                    }
                    _buffer.setLength(0);
                    _bufferPos = 0;
                    _buffer.append((char) c);
                    if (c != '<') {
                        _state = STATE_FLUSH_BUFFER;
                        return true;
                    }
                    state = TRIM_LT;
                    break;
                case TRIM_LT:
                    _buffer.append((char) c);
                    if (c != '!') {
                        _state = STATE_FLUSH_BUFFER;
                        return true;
                    }
                    state = TRIM_LT_BANG;
                    break;
                case TRIM_LT_BANG:
                    _buffer.append((char) c);
                    if (c != '-') {
                        // "<!x": a DOCTYPE or similar, not a comment.
                        _state = STATE_FLUSH_BUFFER;
                        return true;
                    }
                    state = TRIM_LT_BANG_DASH;
                    break;
                case TRIM_LT_BANG_DASH:
                    _buffer.append((char) c);
                    if (c != '-') {
                        _state = STATE_FLUSH_BUFFER;
                        return true;
                    }
                    state = TRIM_COMMENT;
                    break;
                case TRIM_COMMENT:
                    _buffer.append((char) c);
                    if (c == '-') {
                        state = TRIM_COMMENT_DASH;
                    }
                    break;
                case TRIM_COMMENT_DASH:
                    _buffer.append((char) c);
                    state = (c == '-') ? TRIM_COMMENT_DASH_DASH : TRIM_COMMENT;
                    break;
                case TRIM_COMMENT_DASH_DASH:
                    if (c == '>') {
                        // Comment complete: forget it and look for more leading noise.
                        _buffer.setLength(0);
                        state = TRIM_LEADING;
                    }
                    else {
                        _buffer.append((char) c);
                        if (c != '-') {
                            state = TRIM_COMMENT;
                        }
                        // A further '-' keeps the "--" suffix, so "--->" still closes the comment.
                    }
                    break;
                default:
                    throw new IOException("It shouldn't happen");
            }
        }
    }

    /**
     * Reads a single character of the fixed stream.
     *
     * @return the character, or -1 at the end of the stream.
     * @throws IOException if the underlying reader fails.
     */
    public int read() throws IOException {
        if (!trimmed) {
            trimmed = true;
            if (!trimStream()) {
                return -1;
            }
        }
        int c;
        while (true) {
            switch (_state) {
                case STATE_PASS_THROUGH:
                    c = in.read();
                    if (c != '&') {
                        return c; // ordinary character, or -1 at EOF
                    }
                    startEntity();
                    break;
                case STATE_IN_ENTITY:
                    c = in.read();
                    if (c == -1) {
                        _state = STATE_FLUSH_BUFFER;
                    }
                    else
                    if (c == '&') {
                        // Not an entity after all, but this ampersand may open one.
                        _pendingAmpersand = true;
                        _state = STATE_FLUSH_BUFFER;
                    }
                    else {
                        _buffer.append((char) c);
                        if (c == ';') {
                            _state = STATE_RESOLVE_ENTITY;
                        }
                        else
                        if (!isEntityChar(c)) {
                            _state = STATE_FLUSH_BUFFER;
                        }
                    }
                    break;
                case STATE_RESOLVE_ENTITY:
                    String codedEntity = (String) CODED_ENTITIES.get(_buffer.toString());
                    if (codedEntity != null) {
                        _buffer.setLength(0);
                        _buffer.append(codedEntity);
                    }
                    _state = STATE_FLUSH_BUFFER;
                    break;
                case STATE_FLUSH_BUFFER:
                    if (_bufferPos < _buffer.length()) {
                        return _buffer.charAt(_bufferPos++);
                    }
                    if (_pendingAmpersand) {
                        _pendingAmpersand = false;
                        startEntity();
                    }
                    else {
                        _state = STATE_PASS_THROUGH;
                    }
                    break;
                default:
                    throw new IOException("It shouldn't happen");
            }
        }
    }

    /**
     * Reads up to {@code len} characters of the fixed stream into an array.
     *
     * @param buffer destination array.
     * @param offset index of the first element to write.
     * @param len maximum number of characters to read.
     * @return the number of characters stored, 0 if {@code len} is 0, or -1 if the
     *         end of the stream was reached before any character could be read.
     * @throws IOException if the underlying reader fails.
     * @throws IndexOutOfBoundsException if offset/len do not describe a range of the array.
     */
    public int read(char[] buffer,int offset,int len) throws IOException {
        if (offset < 0 || len < 0 || len > buffer.length - offset) {
            throw new IndexOutOfBoundsException();
        }
        if (len == 0) {
            return 0; // must neither consume input nor touch the array
        }
        int c = read();
        if (c == -1) {
            return -1;
        }
        int charsRead = 0;
        buffer[offset + (charsRead++)] = (char) c;
        while (charsRead < len && (c = read()) > -1) {
            buffer[offset + (charsRead++)] = (char) c;
        }
        return charsRead;
    }

    /**
     * Skips characters of the fixed stream.
     *
     * @param n number of characters to skip.
     * @return the number of characters actually skipped, which is smaller than
     *         {@code n} only if the end of the stream was reached.
     * @throws IOException if the underlying reader fails.
     * @throws IllegalArgumentException if {@code n} is negative.
     */
    public long skip(long n) throws IOException {
        if (n < 0) {
            throw new IllegalArgumentException("'n' cannot be negative");
        }
        long skipped = 0;
        while (skipped < n && read() > -1) {
            skipped++;
        }
        return skipped;
    }

    /**
     * Tells whether the next read is guaranteed not to block: either buffered
     * characters are still pending or the underlying reader is ready.
     *
     * @return true if a read will not block.
     * @throws IOException if the underlying reader fails.
     */
    public boolean ready() throws IOException {
        boolean pending = _state == STATE_FLUSH_BUFFER && _bufferPos < _buffer.length();
        return pending || in.ready();
    }

    /** @return always false, marks are not supported. */
    public boolean markSupported() {
        return false;
    }

    /**
     * Not supported.
     *
     * @throws IOException always.
     */
    public void mark(int readAheadLimit) throws IOException {
        throw new IOException("Stream does not support mark");
    }

    /**
     * Not supported.
     *
     * @throws IOException always.
     */
    public void reset() throws IOException {
        throw new IOException("Stream does not support mark");
    }

    /**
     * Closes the underlying reader.
     *
     * @throws IOException if the underlying reader fails to close.
     */
    public void close() throws IOException {
        in.close();
    }

    /**
     * HTML 4 named entity (with its '&' and ';') to numeric character reference.
     * Populated once by the static initializer and only read afterwards.
     */
    private static final Map CODED_ENTITIES = new HashMap();

    /** Registers one named entity; both strings are assembled so no literal entity appears in the source. */
    private static void addEntity(String name, int codePoint) {
        CODED_ENTITIES.put("&" + name + ";", "&#" + codePoint + ";");
    }

    /** Registers entities whose code points are consecutive, starting at firstCodePoint. */
    private static void addEntities(int firstCodePoint, String[] names) {
        for (int i = 0; i < names.length; i++) {
            addEntity(names[i], firstCodePoint + i);
        }
    }

    static {
        // ISO 8859-1 characters, code points 160 to 255.
        addEntities(160, new String[] {
            "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar", "sect",
            "uml", "copy", "ordf", "laquo", "not", "shy", "reg", "macr",
            "deg", "plusmn", "sup2", "sup3", "acute", "micro", "para", "middot",
            "cedil", "sup1", "ordm", "raquo", "frac14", "frac12", "frac34", "iquest",
            "Agrave", "Aacute", "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil",
            "Egrave", "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc", "Iuml",
            "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde", "Ouml", "times",
            "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml", "Yacute", "THORN", "szlig",
            "agrave", "aacute", "acirc", "atilde", "auml", "aring", "aelig", "ccedil",
            "egrave", "eacute", "ecirc", "euml", "igrave", "iacute", "icirc", "iuml",
            "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde", "ouml", "divide",
            "oslash", "ugrave", "uacute", "ucirc", "uuml", "yacute", "thorn", "yuml"
        });

        // Mathematical, Greek and symbolic characters.
        addEntity("fnof", 402);
        addEntities(913, new String[] {
            "Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Zeta", "Eta", "Theta", "Iota",
            "Kappa", "Lambda", "Mu", "Nu", "Xi", "Omicron", "Pi", "Rho"
        });
        // 930 is unassigned (there is no capital final sigma).
        addEntities(931, new String[] {
            "Sigma", "Tau", "Upsilon", "Phi", "Chi", "Psi", "Omega"
        });
        addEntities(945, new String[] {
            "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta", "iota",
            "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho", "sigmaf", "sigma",
            "tau", "upsilon", "phi", "chi", "psi", "omega"
        });
        addEntity("thetasym", 977);
        addEntity("upsih", 978);
        addEntity("piv", 982);
        addEntity("bull", 8226);
        addEntity("hellip", 8230);
        addEntity("prime", 8242);
        addEntity("Prime", 8243);
        addEntity("oline", 8254);
        addEntity("frasl", 8260);
        addEntity("weierp", 8472);
        addEntity("image", 8465);
        addEntity("real", 8476);
        addEntity("trade", 8482);
        addEntity("alefsym", 8501);
        addEntities(8592, new String[] { "larr", "uarr", "rarr", "darr", "harr" });
        addEntity("crarr", 8629);
        addEntities(8656, new String[] { "lArr", "uArr", "rArr", "dArr", "hArr" });
        addEntity("forall", 8704);
        addEntity("part", 8706);
        addEntity("exist", 8707);
        addEntity("empty", 8709);
        addEntity("nabla", 8711);
        addEntity("isin", 8712);
        addEntity("notin", 8713);
        addEntity("ni", 8715);
        addEntity("prod", 8719);
        addEntity("sum", 8721);
        addEntity("minus", 8722);
        addEntity("lowast", 8727);
        addEntity("radic", 8730);
        addEntity("prop", 8733);
        addEntity("infin", 8734);
        addEntity("ang", 8736);
        addEntity("and", 8743);
        addEntity("or", 8744);
        addEntity("cap", 8745);
        addEntity("cup", 8746);
        addEntity("int", 8747);
        addEntity("there4", 8756);
        addEntity("sim", 8764);
        addEntity("cong", 8773);
        addEntity("asymp", 8776);
        addEntity("ne", 8800);
        addEntity("equiv", 8801);
        addEntity("le", 8804);
        addEntity("ge", 8805);
        addEntity("sub", 8834);
        addEntity("sup", 8835);
        addEntity("nsub", 8836);
        addEntity("sube", 8838);
        addEntity("supe", 8839);
        addEntity("oplus", 8853);
        addEntity("otimes", 8855);
        addEntity("perp", 8869);
        addEntity("sdot", 8901);
        addEntity("lceil", 8968);
        addEntity("rceil", 8969);
        addEntity("lfloor", 8970);
        addEntity("rfloor", 8971);
        addEntity("lang", 9001);
        addEntity("rang", 9002);
        addEntity("loz", 9674);
        addEntity("spades", 9824);
        addEntity("clubs", 9827);
        addEntity("hearts", 9829);
        addEntity("diams", 9830);

        // Markup-significant and internationalization characters.
        addEntity("quot", 34);
        addEntity("amp", 38);
        addEntity("lt", 60);
        addEntity("gt", 62);
        addEntity("OElig", 338);
        addEntity("oelig", 339);
        addEntity("Scaron", 352);
        addEntity("scaron", 353);
        addEntity("Yuml", 376);
        addEntity("circ", 710);
        addEntity("tilde", 732);
        addEntity("ensp", 8194);
        addEntity("emsp", 8195);
        addEntity("thinsp", 8201);
        addEntity("zwnj", 8204);
        addEntity("zwj", 8205);
        addEntity("lrm", 8206);
        addEntity("rlm", 8207);
        addEntity("ndash", 8211);
        addEntity("mdash", 8212);
        addEntity("lsquo", 8216);
        addEntity("rsquo", 8217);
        addEntity("sbquo", 8218);
        addEntity("ldquo", 8220);
        addEntity("rdquo", 8221);
        addEntity("bdquo", 8222);
        addEntity("dagger", 8224);
        addEntity("Dagger", 8225);
        addEntity("permil", 8240);
        addEntity("lsaquo", 8249);
        addEntity("rsaquo", 8250);
        addEntity("euro", 8364);
    }

    /**
     * Matches anything shaped like a character reference. Digits are included
     * because several HTML entity names contain them (frac12, sup2, there4, ...).
     */
    private static final Pattern ENTITIES_PATTERN = Pattern.compile("&[A-Za-z0-9#]+;");

    /**
     * Replaces every known HTML named entity in a string by its numeric
     * character reference; everything else is copied unchanged.
     *
     * @param s text to process, must not be null.
     * @return the processed text, or {@code s} itself if it contains no ampersand.
     */
    public String processHtmlEntities(String s) {
        if (s.indexOf('&') == -1) {
            return s;
        }
        StringBuffer sb = new StringBuffer(s.length());
        Matcher m = ENTITIES_PATTERN.matcher(s);
        int pos = 0;
        while (m.find()) {
            // Text between the previous match and this one is copied verbatim.
            sb.append(s.substring(pos, m.start()));
            String entity = m.group();
            String codedEntity = (String) CODED_ENTITIES.get(entity);
            sb.append(codedEntity != null ? codedEntity : entity);
            pos = m.end();
        }
        sb.append(s.substring(pos));
        return sb.toString();
    }

}