001    package com.mockrunner.util.web;
002    
003    import java.io.StringReader;
004    import java.io.StringWriter;
005    import java.util.List;
006    
007    import org.apache.commons.logging.Log;
008    import org.apache.commons.logging.LogFactory;
009    import org.apache.xerces.parsers.DOMParser;
010    import org.cyberneko.html.HTMLConfiguration;
011    import org.jdom.Element;
012    import org.jdom.input.DOMBuilder;
013    import org.jdom.output.XMLOutputter;
014    import org.xml.sax.InputSource;
015    
016    import com.mockrunner.base.NestedApplicationException;
017    
018    /**
019     * Util class for HTML and XML parsing.
020     */
021    public class XmlUtil
022    {
023        private final static Log log = LogFactory.getLog(XmlUtil.class);
024        
025        /**
026         * Convinience method for HTML fragments. Returns the body
027         * as JDOM <code>Element</code>.
028         * 
029         * If an HTML documents looks like this:
030         * <pre>
031         * <html>
032         * <head>
033         * </head>
034         * <body>
035         * <h1>
036         * </h1>
037         * </body>
038         * </html>
039         * </pre>
040         * 
041         * the method returns the h1 tag as <code>Element</code>.
042         * @param document the <code>org.jdom.Document</code>
043         * @return the body <code>Element</code>
044         */
045        public static Element getBodyFragmentFromJDOMDocument(org.jdom.Document document)
046        {
047            Element element = document.getRootElement().getChild("BODY");
048            if(null == element)
049            {
050                element = document.getRootElement().getChild("body");
051            }
052            if(null != element)
053            {
054                List childs = element.getChildren();
055                if(null != childs && childs.size() > 0) return (Element)childs.get(0);
056            }
057            return null;
058        }
059        
060        /**
061         * @deprecated use {@link #getBodyFragmentFromJDOMDocument}
062         */
063        public static Element getBodyFragmentJDOMDocument(org.jdom.Document document)
064        {
065            return getBodyFragmentFromJDOMDocument(document);
066        }
067        
068        /**
069         * Returns the documents XML content as a string.
070         * @param document the <code>org.jdom.Document</code>
071         * @return the output as string
072         */
073        public static String createStringFromJDOMDocument(org.jdom.Document document)
074        {
075            try
076            {
077                XMLOutputter outputter = new XMLOutputter();
078                StringWriter writer = new StringWriter();
079                outputter.output(document, writer);
080                writer.flush();
081                return writer.toString();
082            }
083            catch(Exception exc)
084            {
085                log.error(exc.getMessage(), exc);
086                throw new NestedApplicationException(exc);
087            }
088        }
089        
090        /**
091         * Creates a JDOM <code>Document</code> from a specified
092         * W3C <code>Document</code>.
093         * @param document the <code>org.w3c.dom.Document</code>
094         * @return the <code>org.jdom.Document</code>
095         */
096        public static org.jdom.Document createJDOMDocument(org.w3c.dom.Document document)
097        {
098            return new DOMBuilder().build(document);
099        }
100        
101        /**
102         * Returns a parser suitable for parsing HTML documents.
103         * The NekoHTML parser is used with some settings to
104         * preserve case of tag names and disable namespace processing. 
105         * This method is used by {@link #parseHTML}.
106         * @return instance of <code>org.apache.xerces.parsers.DOMParser</code>
107         *         with Neko configuration
108         */
109        public static DOMParser getHTMLParser()
110        {
111            try
112            {
113                HTMLConfiguration config = new HTMLConfiguration();
114                config.setProperty("http://cyberneko.org/html/properties/names/elems", "match");
115                config.setProperty("http://cyberneko.org/html/properties/names/attrs", "no-change");
116                DOMParser parser = new DOMParser(config);
117                return parser;
118            }
119            catch(Exception exc)
120            {
121                log.error(exc.getMessage(), exc);
122                throw new NestedApplicationException(exc);
123            }
124        }
125        
126        /**
127         * Parses the specified HTML with the NekoHTML parser.
128         * If you want to use another HTML parser or configure
129         * the NekoHTML parser with special features, you can use
130         * the <code>parse</code> method.
131         * @param source the HTML as String
132         * @return the parsed document as org.w3c.dom.Document
133         */
134        public static org.w3c.dom.Document parseHTML(String source)
135        {
136            try
137            {
138                return parse(getHTMLParser(), source);
139            }
140            catch(Exception exc)
141            {
142                log.error(exc.getMessage(), exc);
143                throw new NestedApplicationException(exc);
144            }
145        }
146        
147        /**
148         * Parses the specified XML with the specified parser.
149         * The main purpose of this method is to use the NekoHTML 
150         * parser with custom features and properties. If you can live
151         * with the settings provided by Mockrunner, you can use 
152         * {@link #parseHTML}.
153         * @param parser the parser (must extend 
154         *               <code>org.apache.xerces.parsers.DOMParser</code>), 
155         *               e.g. the one returned by {@link #getHTMLParser}
156         * @param source the XML as String
157         * @return the parsed document as org.w3c.dom.Document
158         */
159        public static org.w3c.dom.Document parse(DOMParser parser, String source)
160        {
161            try
162            {
163                parser.parse(new InputSource(new StringReader(source)));
164                return parser.getDocument();
165            }
166            catch(Exception exc)
167            {
168                log.error(exc.getMessage(), exc);
169                throw new NestedApplicationException(exc);
170            }
171        }
172    }