View Javadoc

1   /*
2    * Copyright 2004 Sun Microsystems, Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   *
16   */
17  package com.sun.syndication.io.impl;
18  
19  import java.util.ArrayList;
20  import java.util.Iterator;
21  import java.util.List;
22  
23  import org.jdom.Document;
24  import org.jdom.Element;
25  import org.jdom.Namespace;
26  import org.jdom.output.XMLOutputter;
27  
28  import com.sun.syndication.feed.WireFeed;
29  import com.sun.syndication.feed.atom.Category;
30  import com.sun.syndication.feed.atom.Content;
31  import com.sun.syndication.feed.atom.Entry;
32  import com.sun.syndication.feed.atom.Feed;
33  import com.sun.syndication.feed.atom.Generator;
34  import com.sun.syndication.feed.atom.Link;
35  import com.sun.syndication.feed.atom.Person;
36  import com.sun.syndication.io.FeedException;
37  import java.net.MalformedURLException;
38  import java.net.URL;
39  import java.util.regex.Pattern;
40  import org.jdom.Attribute;
41  import org.jdom.Parent;
42  
43  /**
44   * Parser for Atom 1.0
45   * @author Dave Johnson
46   */
47  public class Atom10Parser extends BaseWireFeedParser {
48      private static final String ATOM_10_URI = "http://www.w3.org/2005/Atom";
49      Namespace ns = Namespace.getNamespace(ATOM_10_URI);
50      
51      public Atom10Parser() {
52          this("atom_1.0");
53      }
54      
55      protected Atom10Parser(String type) {
56          super(type);
57      }
58      
59      protected Namespace getAtomNamespace() {
60          return ns;
61      }
62      
63      public boolean isMyType(Document document) {
64          Element rssRoot = document.getRootElement();
65          Namespace defaultNS = rssRoot.getNamespace();
66          return (defaultNS!=null) && defaultNS.equals(getAtomNamespace());
67      }
68      
69      public WireFeed parse(Document document, boolean validate)
70      throws IllegalArgumentException,FeedException {
71          if (validate) {
72              validateFeed(document);
73          }
74          Element rssRoot = document.getRootElement();
75          return parseFeed(rssRoot);
76      }
77      
78      protected void validateFeed(Document document) throws FeedException {
79          // TBD
80          // here we have to validate the Feed against a schema or whatever
81          // not sure how to do it
82          // one posibility would be to produce an ouput and attempt to parse it again
83          // with validation turned on.
84          // otherwise will have to check the document elements by hand.
85      }
86      
87      protected WireFeed parseFeed(Element eFeed) throws FeedException {
88          
89          com.sun.syndication.feed.atom.Feed feed =
90                  new com.sun.syndication.feed.atom.Feed(getType());
91          
92          String baseURI = null;
93          try {
94              baseURI = findBaseURI(eFeed);
95          } catch (Exception e) {
96              throw new FeedException("ERROR while finding base URI of feed", e);
97          }
98          
99          String xmlBase = eFeed.getAttributeValue("base", Namespace.XML_NAMESPACE);
100         if (xmlBase != null) {
101             feed.setXmlBase(xmlBase);
102         }
103         
104         Element e = eFeed.getChild("title",getAtomNamespace());
105         if (e!=null) {
106             Content c = new Content();
107             c.setValue(parseTextConstructToString(e));
108             c.setType(e.getAttributeValue("type")); //, Namespace.XML_NAMESPACE));
109             feed.setTitleEx(c);
110         }
111         
112         List eList = eFeed.getChildren("link",getAtomNamespace());
113         feed.setAlternateLinks(parseAlternateLinks(feed, null, baseURI, eList));
114         feed.setOtherLinks(parseOtherLinks(feed, null, baseURI, eList));
115         
116         List cList = eFeed.getChildren("category",getAtomNamespace());
117         feed.setCategories(parseCategories(baseURI, cList));
118         
119         eList = eFeed.getChildren("author", getAtomNamespace());
120         if (eList.size()>0) {
121             feed.setAuthors(parsePersons(baseURI, eList));
122         }
123         
124         eList = eFeed.getChildren("contributor",getAtomNamespace());
125         if (eList.size()>0) {
126             feed.setContributors(parsePersons(baseURI, eList));
127         }
128         
129         e = eFeed.getChild("subtitle",getAtomNamespace());
130         if (e!=null) {
131             Content subtitle = new Content();
132             subtitle.setValue(parseTextConstructToString(e));
133             subtitle.setType(e.getAttributeValue("type")); //, Namespace.XML_NAMESPACE));
134             feed.setSubtitle(subtitle);
135         }
136         
137         e = eFeed.getChild("id",getAtomNamespace());
138         if (e!=null) {
139             feed.setId(e.getText());
140         }
141         
142         e = eFeed.getChild("generator",getAtomNamespace());
143         if (e!=null) {
144             Generator gen = new Generator();
145             gen.setValue(e.getText());
146             String att = e.getAttributeValue("uri");//getAtomNamespace()); DONT KNOW WHY DOESN'T WORK
147             if (att!=null) {
148                 gen.setUrl(att);
149             }
150             att = e.getAttributeValue("version");//getAtomNamespace()); DONT KNOW WHY DOESN'T WORK
151             if (att!=null) {
152                 gen.setVersion(att);
153             }
154             feed.setGenerator(gen);
155         }
156         
157         e = eFeed.getChild("rights",getAtomNamespace());
158         if (e!=null) {
159             feed.setRights(parseTextConstructToString(e));
160         }
161         
162         e = eFeed.getChild("icon",getAtomNamespace());
163         if (e!=null) {
164             feed.setIcon(e.getText());
165         }
166         
167         e = eFeed.getChild("logo",getAtomNamespace());
168         if (e!=null) {
169             feed.setLogo(e.getText());
170         }
171         
172         e = eFeed.getChild("updated",getAtomNamespace());
173         if (e!=null) {
174             feed.setUpdated(DateParser.parseDate(e.getText()));
175         }
176         
177         feed.setModules(parseFeedModules(eFeed));
178         
179         eList = eFeed.getChildren("entry",getAtomNamespace());
180         if (eList.size()>0) {
181             feed.setEntries(parseEntries(feed, baseURI, eList));
182         }
183         
184         List foreignMarkup =
185                 extractForeignMarkup(eFeed, feed, getAtomNamespace());
186         if (foreignMarkup.size() > 0) {
187             feed.setForeignMarkup(foreignMarkup);
188         }
189         return feed;
190     }
191     
192     private Link parseLink(Feed feed , Entry entry, String baseURI, Element eLink) {
193         Link link = new Link();
194         String att = eLink.getAttributeValue("rel");//getAtomNamespace()); DONT KNOW WHY DOESN'T WORK
195         if (att!=null) {
196             link.setRel(att);
197         }
198         att = eLink.getAttributeValue("type");//getAtomNamespace()); DONT KNOW WHY DOESN'T WORK
199         if (att!=null) {
200             link.setType(att);
201         }
202         att = eLink.getAttributeValue("href");//getAtomNamespace()); DONT KNOW WHY DOESN'T WORK
203         if (att!=null) {
204             if (isRelativeURI(att)) {
205                 link.setHref(resolveURI(baseURI, eLink, att));
206             } else {
207                 link.setHref(att);
208             }
209         }
210         att = eLink.getAttributeValue("title");
211         if (att!=null) {
212             link.setTitle(att);
213         }
214         att = eLink.getAttributeValue("hreflang");//getAtomNamespace()); DONT KNOW WHY DOESN'T WORK
215         if (att!=null) {
216             link.setHreflang(att);
217         }
218         att = eLink.getAttributeValue("length");//getAtomNamespace()); DONT KNOW WHY DOESN'T WORK
219         if (att!=null) {
220             link.setLength(Long.parseLong(att));
221         }
222         return link;
223     }
224     
225     // List(Elements) -> List(Link)
226     private List parseAlternateLinks(Feed feed, Entry entry, String baseURI, List eLinks) {
227         List links = new ArrayList();
228         for (int i=0;i<eLinks.size();i++) {
229             Element eLink = (Element) eLinks.get(i);
230             Link link = parseLink(feed, entry, baseURI, eLink);
231             if (link.getRel() == null
232                     || "".equals(link.getRel().trim())
233                     || "alternate".equals(link.getRel())) {
234                 links.add(link);
235             }
236         }
237         return (links.size()>0) ? links : null;
238     }
239     
240     private List parseOtherLinks(Feed feed, Entry entry, String baseURI, List eLinks) {
241         List links = new ArrayList();
242         for (int i=0;i<eLinks.size();i++) {
243             Element eLink = (Element) eLinks.get(i);
244             Link link = parseLink(feed, entry, baseURI, eLink);
245             if (!"alternate".equals(link.getRel())) {
246                 links.add(link);
247             }
248         }
249         return (links.size()>0) ? links : null;
250     }
251     
252     private Person parsePerson(String baseURI, Element ePerson) {
253         Person person = new Person();
254         Element e = ePerson.getChild("name",getAtomNamespace());
255         if (e!=null) {
256             person.setName(e.getText());
257         }
258         e = ePerson.getChild("uri",getAtomNamespace());
259         if (e!=null) {
260             person.setUri(resolveURI(baseURI, ePerson, e.getText()));
261         }
262         e = ePerson.getChild("email",getAtomNamespace());
263         if (e!=null) {
264             person.setEmail(e.getText());
265         }
266         return person;
267     }
268     
269     // List(Elements) -> List(Persons)
270     private List parsePersons(String baseURI, List ePersons) {
271         List persons = new ArrayList();
272         for (int i=0;i<ePersons.size();i++) {
273             persons.add(parsePerson(baseURI, (Element)ePersons.get(i)));
274         }
275         return (persons.size()>0) ? persons : null;
276     }
277     
278     private Content parseContent(Element e) {
279         String value = parseTextConstructToString(e);
280         String src = e.getAttributeValue("src");//getAtomNamespace()); DONT KNOW WHY DOESN'T WORK
281         String type = e.getAttributeValue("type");//getAtomNamespace()); DONT KNOW WHY DOESN'T WORK
282         Content content = new Content();
283         content.setSrc(src);
284         content.setType(type);
285         content.setValue(value);
286         return content;
287     }
288     
289     private String parseTextConstructToString(Element e) {
290         String value = null;
291         String type = e.getAttributeValue("type");//getAtomNamespace()); DONT KNOW WHY DOESN'T WORK
292         type = (type!=null) ? type : Content.TEXT;
293         if (type.equals(Content.XHTML)) {
294             // XHTML content needs special handling
295             XMLOutputter outputter = new XMLOutputter();
296             List eContent = e.getContent();
297             Iterator i = eContent.iterator();
298             while (i.hasNext()) {
299                 org.jdom.Content c = (org.jdom.Content) i.next();
300                 if (c instanceof Element) {
301                     Element eC = (Element) c;
302                     if (eC.getNamespace().equals(getAtomNamespace())) {
303                         ((Element)c).setNamespace(Namespace.NO_NAMESPACE);
304                     }
305                 }
306             }
307             value = outputter.outputString(eContent);
308         } else {
309             // Everything else comes in verbatim
310             value = e.getText();
311         }
312         return value;
313     }
314     
315     // List(Elements) -> List(Entries)
316     protected List parseEntries(Feed feed, String baseURI, List eEntries) {
317         List entries = new ArrayList();
318         for (int i=0;i<eEntries.size();i++) {
319             entries.add(parseEntry(feed, (Element)eEntries.get(i), baseURI));
320         }
321         return (entries.size()>0) ? entries : null;
322     }
323     
324     protected Entry parseEntry(Feed feed, Element eEntry, String baseURI) {
325         Entry entry = new Entry();
326         
327         String xmlBase = eEntry.getAttributeValue("base", Namespace.XML_NAMESPACE);
328         if (xmlBase != null) {
329             entry.setXmlBase(xmlBase);
330         }
331         
332         Element e = eEntry.getChild("title",getAtomNamespace());
333         if (e!=null) {
334             Content c = new Content();
335             c.setValue(parseTextConstructToString(e));
336             c.setType(e.getAttributeValue("type")); //, Namespace.XML_NAMESPACE));
337             entry.setTitleEx(c);
338         }
339         
340         List eList = eEntry.getChildren("link",getAtomNamespace());
341         entry.setAlternateLinks(parseAlternateLinks(feed, entry, baseURI, eList));
342         entry.setOtherLinks(parseOtherLinks(feed, entry, baseURI, eList));
343         
344         eList = eEntry.getChildren("author", getAtomNamespace());
345         if (eList.size()>0) {
346             entry.setAuthors(parsePersons(baseURI, eList));
347         }
348         
349         eList = eEntry.getChildren("contributor",getAtomNamespace());
350         if (eList.size()>0) {
351             entry.setContributors(parsePersons(baseURI, eList));
352         }
353         
354         e = eEntry.getChild("id",getAtomNamespace());
355         if (e!=null) {
356             entry.setId(e.getText());
357         }
358         
359         e = eEntry.getChild("updated",getAtomNamespace());
360         if (e!=null) {
361             entry.setUpdated(DateParser.parseW3CDateTime(e.getText()));
362         }
363         
364         e = eEntry.getChild("published",getAtomNamespace());
365         if (e!=null) {
366             entry.setPublished(DateParser.parseW3CDateTime(e.getText()));
367         }
368         
369         e = eEntry.getChild("summary",getAtomNamespace());
370         if (e!=null) {
371             entry.setSummary(parseContent(e));
372         }
373         
374         e = eEntry.getChild("content",getAtomNamespace());
375         if (e!=null) {
376             List contents = new ArrayList();
377             contents.add(parseContent(e));
378             entry.setContents(contents);
379         }
380         
381         e = eEntry.getChild("rights",getAtomNamespace());
382         if (e!=null) {
383             entry.setRights(e.getText());
384         }
385         
386         List cList = eEntry.getChildren("category",getAtomNamespace());
387         entry.setCategories(parseCategories(baseURI, cList));
388         
389         // TODO: SHOULD handle Atom entry source element
390         
391         entry.setModules(parseItemModules(eEntry));
392         
393         List foreignMarkup =
394                 extractForeignMarkup(eEntry, entry, getAtomNamespace());
395         if (foreignMarkup.size() > 0) {
396             entry.setForeignMarkup(foreignMarkup);
397         }
398         return entry;
399     }
400     
401     private List parseCategories(String baseURI, List eCategories) {
402         List cats = new ArrayList();
403         for (int i=0;i<eCategories.size();i++) {
404             Element eCategory = (Element) eCategories.get(i);
405             cats.add(parseCategory(baseURI, eCategory));
406         }
407         return (cats.size()>0) ? cats : null;
408     }
409     
410     private Category parseCategory(String baseURI, Element eCategory) {
411         Category category = new Category();
412         String att = eCategory.getAttributeValue("term");//getAtomNamespace()); DONT KNOW WHY DOESN'T WORK
413         if (att!=null) {
414             category.setTerm(att);
415         }
416         att = eCategory.getAttributeValue("scheme");//getAtomNamespace()); DONT KNOW WHY DOESN'T WORK
417         if (att!=null) {
418             category.setScheme(resolveURI(baseURI, eCategory, att));
419         }
420         att = eCategory.getAttributeValue("label");//getAtomNamespace()); DONT KNOW WHY DOESN'T WORK
421         if (att!=null) {
422             category.setLabel(att);
423         }
424         return category;
425         
426     }
427     
428     
429     // Fix for issue #34 "valid IRI href attributes are stripped for atom:link"
430     // URI's that didn't start with http were being treated as relative URIs.
431     // So now consider an absolute URI to be any alpha-numeric string followed
432     // by a colon, followed by anything -- specified by this regex:
433     static Pattern absoluteURIPattern = Pattern.compile("^[a-z0-9]*:.*$");
434     
435     private boolean isAbsoluteURI(String uri) {
436         return absoluteURIPattern.matcher(uri).find();
437     }
438     
439     private boolean isRelativeURI(String uri) {
440         return !isAbsoluteURI(uri);
441     }
442         
443     /**
444      * } 
445      * Resolve URI based considering xml:base and baseURI.
446      * @param baseURI Base URI of feed
447      * @param parent  Parent from which to consider xml:base
448      * @param url     URL to be resolved
449      */
450     private String resolveURI(String baseURI, Parent parent, String url) {
451         if (isRelativeURI(url)) {
452             url = (!".".equals(url) && !"./".equals(url)) ? url : "";
453 
454             // Relative URI with parent
455             if (parent != null && parent instanceof Element) {
456 
457                 // Do we have an xml:base?         
458                 String xmlbase = ((Element)parent).getAttributeValue(
459                     "base", Namespace.XML_NAMESPACE);
460                 if (xmlbase != null && xmlbase.trim().length() > 0) {
461                     if (isAbsoluteURI(xmlbase)) {
462                         // Absolute xml:base, so form URI right now 
463                         if (url.startsWith("/")) { 
464                             // Host relative URI
465                             int slashslash = xmlbase.indexOf("//");
466                             int nextslash = xmlbase.indexOf("/", slashslash + 2);
467                             if (nextslash != -1) xmlbase = xmlbase.substring(0, nextslash);
468                             return formURI(xmlbase, url); 
469                         }
470                         if (!xmlbase.endsWith("/")) {
471                             // Base URI is filename, strip it off 
472                             xmlbase = xmlbase.substring(0, xmlbase.lastIndexOf("/"));
473                         }
474                         return formURI(xmlbase, url);
475                     } else {
476                         // Relative xml:base, so walk up tree
477                         return resolveURI(baseURI, parent.getParent(), 
478                             stripTrailingSlash(xmlbase) + "/"+ stripStartingSlash(url));
479                     }
480                 }
481                 // No xml:base so walk up tree
482                 return resolveURI(baseURI, parent.getParent(), url);
483 
484             // Relative URI with no parent (i.e. top of tree), so form URI right now
485             } else if (parent == null || parent instanceof Document) {
486                 return formURI(baseURI, url);        
487             } 
488         }                
489         return url;
490     }
491         
492     /**
493      * Find base URI of feed considering relative URIs.
494      * @param root Root element of feed.
495      */
496     private String findBaseURI(Element root) throws MalformedURLException {
497         String ret = findAtomLink(root, "alternate");
498         if (ret != null && isRelativeURI(ret)) {
499             String self = findAtomLink(root, "self");
500             if (self != null) {
501                 self = resolveURI(null, root, self);
502                 self = self.substring(0, self.lastIndexOf("/"));
503                 ret = resolveURI(self, root, ret);
504             }
505         }
506         return ret;
507     } 
508     
509     /** 
510      * Return URL string of Atom link element under parent element.
511      * Link with no rel attribute is considered to be rel="alternate"
512      * @param parent Consider only children of this parent element
513      * @param rel    Consider only links with this relationship
514      */
515     private String findAtomLink(Element parent, String rel) {
516         String ret = null;
517         List linksList = parent.getChildren("link", ns);
518         if (linksList != null) {
519             for (Iterator links = linksList.iterator(); links.hasNext(); ) {
520                 Element link = (Element)links.next();
521                 Attribute relAtt = link.getAttribute("rel");
522                 Attribute hrefAtt = link.getAttribute("href");
523                 if (   (relAtt == null && "alternate".equals(rel)) 
524                     || (relAtt != null && relAtt.getValue().equals(rel))) {
525                     ret = hrefAtt.getValue();
526                     break;
527                 }
528             }
529         }
530         return ret;
531     }
532         
533     /** 
534      * Form URI by combining base with append portion and giving 
535      * special consideration to append portions that begin with ".."
536      * @param base   Base of URI, may end with trailing slash
537      * @param append String to append, may begin with slash or ".."
538      */
539     private static String formURI(String base, String append) {
540         base = stripTrailingSlash(base);
541         append = stripStartingSlash(append);
542         if (append.startsWith("..")) {
543             String ret = null;
544             String[] parts = append.split("/");
545             for (int i=0; i<parts.length; i++) {
546                 if ("..".equals(parts[i])) {
547                     int last = base.lastIndexOf("/");
548                     if (last != -1) {
549                         base = base.substring(0, last);
550                         append = append.substring(3, append.length());
551                     }
552                     else break;
553                 }
554             }
555         }
556         return base + "/" + append;
557     }
558     
559     /** 
560      * Strip starting slash from beginning of string.
561      */
562     private static String stripStartingSlash(String s) {
563         if (s != null && s.startsWith("/")) {
564             s = s.substring(1, s.length());
565         }
566         return s;
567     }
568     
569     /** 
570      * Strip trailing slash from end of string.
571      */
572     private static String stripTrailingSlash(String s) {
573         if (s != null && s.endsWith("/")) {
574             s = s.substring(0, s.length() - 1);
575         }
576         return s;
577     }
578 }