001    package ca.uhn.hl7v2.util;
002    
003    import ca.uhn.hl7v2.parser.*;
004    import ca.uhn.hl7v2.model.Message;
005    import ca.uhn.hl7v2.HL7Exception;
006    import java.util.regex.*;
007    import org.w3c.dom.*;
008    import org.xml.sax.SAXException;
009    import org.xml.sax.InputSource;
010    import org.apache.xml.serialize.*;
011    import org.apache.xerces.parsers.DOMParser;
012    import org.apache.xerces.parsers.StandardParserConfiguration;
013    import java.io.*;
014    
015    /**
016     * Tools for testing message strings for semantic equivalence without assuming the correctness
017     * of parsers.  
018     * @author Bryan Tripp
019     */
020    public class EncodedMessageComparator {
021        
022        private static GenericParser parser = new GenericParser();  
023        
024        /**
025         * Returns a "standardized" equivalent of the given message string.  For delimited
026         * messages, the returned value is the shortest string that has an equivalent
027         * meaning in HL7.  For XML-encoded messages, the returned value is equivalent XML output
028         * using a standard pretty-print format.  An automatic determination is made about whether 
029         * the given string is XML or ER7 (i.e. traditionally) encoded.
030         * @param message an XML-encoded or ER7-encoded message string
031         */
032        public static String standardize(String message) throws SAXException {
033            String result = null;
034            String encoding = parser.getEncoding(message);
035            if (encoding.equals("XML")) {
036                result = standardizeXML(message);
037            } else {
038                result = standardizeER7(message);
039            }
040            return result;
041        }
042        
043        /**
044         * Returns the shortest string that is semantically equivalent to a given ER7-encoded 
045         * message string.
046         */
047        public static String standardizeER7(String message) {
048            
049            //make delimiter sequences (must quote with \ if not alphanumeric; can't otherwise because of regexp rules)
050            char fieldDelimChar = message.charAt(3);
051            String fieldDelim = String.valueOf(fieldDelimChar);
052            if (!Character.isLetterOrDigit(fieldDelimChar)) fieldDelim = "\\" + fieldDelimChar;
053            
054            char compSepChar = message.charAt(4);
055            String compSep = String.valueOf(compSepChar);
056            if (!Character.isLetterOrDigit(compSepChar)) compSep = "\\" + compSepChar;
057            
058            char repSepChar = message.charAt(5);
059            String repSep = String.valueOf(repSepChar);
060            if (!Character.isLetterOrDigit(repSepChar)) repSep = "\\" + repSepChar;
061            
062            char subSepChar = message.charAt(7);
063            String subSep = String.valueOf(subSepChar);
064            if (!Character.isLetterOrDigit(subSepChar)) subSep = "\\" + subSepChar;
065            
066            //char space = ' ';
067            
068            /* Things to strip (cumulative):
069             *  - all delimiters and repetition separators before end line (i.e. end segment)
070             *  - repetition separators, comp and subcomp delims before new field
071             *  - subcomponent delimiters before new component
072             */
073            Pattern endSegment = Pattern.compile("[" + fieldDelim + compSep + repSep + subSep + "]*[\n\r]+");
074            message = endSegment.matcher(message).replaceAll("\r");
075            
076            Pattern endField = Pattern.compile("[" + repSep + compSep + subSep + "]*" + fieldDelim);
077            message = endField.matcher(message).replaceAll(String.valueOf(fieldDelim));
078            
079            Pattern endComp = Pattern.compile("[" + subSep + "]*" + compSep);
080            message = endComp.matcher(message).replaceAll(String.valueOf(compSep));
081            
082            //Pattern endSub = Pattern.compile("[ ]*" + subSep);
083            //message = endSub.matcher(message).replaceAll(String.valueOf(subSep));
084            
085            //handle special case of subcomp delim in encoding characters
086            message = message.substring(0, 7) + subSepChar + message.substring(7);
087            
088            return message;
089        }
090        
091        /**
092         * Returns a semantic equivalent of a given XML-encoded message in a default format.
093         * Attributes, comments, and processing instructions are not considered to change the 
094         * HL7 meaning of the message, and are removed in the standardized representation.    
095         */
096        public static String standardizeXML(String message) throws SAXException {
097            DOMParser parser = new DOMParser(new StandardParserConfiguration());
098            parser.setFeature("http://apache.org/xml/features/dom/include-ignorable-whitespace", false);
099            
100            Document doc = null;
101            StringWriter out = new StringWriter();
102            try {
103                synchronized (parser) {
104                    parser.parse(new InputSource(new StringReader(message)));
105                    doc = parser.getDocument();
106                }
107                clean(doc.getDocumentElement());
108                OutputFormat outputFormat = new OutputFormat("", null, true);
109                XMLSerializer ser = new XMLSerializer(out, outputFormat);            
110                ser.serialize(doc);
111            } catch (IOException e) {
112                throw new RuntimeException("IOException doing IO to a string!!! " + e.getMessage());
113            }
114            return out.toString();
115        }
116        
117        /** Removes attributes, comments, and processing instructions. */
118        private static void clean(Element elem) {
119            NodeList children = elem.getChildNodes();        
120            for (int i = 0; i < children.getLength(); i++) {
121                Node child = children.item(i);
122                if (child.getNodeType() == Node.PROCESSING_INSTRUCTION_NODE 
123                    || child.getNodeType() == Node.COMMENT_NODE)
124                {
125                    elem.removeChild(child);
126                } else if (child.getNodeType() == Node.ELEMENT_NODE) {
127                    clean((Element) child);
128                }
129            }
130            
131            NamedNodeMap attributes = elem.getAttributes();
132            //get names
133            String[] names = new String[attributes.getLength()];
134            for (int i = 0; i < names.length; i++) {
135                names[i] = attributes.item(i).getNodeName();
136            }
137            //remove by name
138            for (int i = 0; i < names.length; i++) {
139                attributes.removeNamedItem(names[i]);
140            }
141        }
142        
143        /**
144         * <p>Compares two HL7 messages to see if they are equivalent (in terms of their  
145         * HL7 meaning).  Semantically irrelevant differences (e.g. spaces in an XML tag; 
146         * extra field delimiters at the end of a segment; XML vs. ER7 encoding; XML attributes)
147         * are ignored. This check is performed without assuming the correctness of the HAPI parsers, 
148         * and can therefore be used to test them.  This is done by parsing a message, encoding it
149         * again, and comparing the result with this original.  </p>
150         * <p>If one message is in XML and the other in ER7, the former is converted to ER7 to 
151         * perform the comparison.  This process relies on the HAPI parsers.  However, the 
152         * parsed message is first encoded as XML and compared to the original, so that the 
153         * integrity of the parser can be verified.  An exception is thrown if this comparison 
154         * is unsuccessful.  </p>
155         * @return true if given messages are semantically equivalent 
156         */
157        public static boolean equivalent(String message1, String message2) throws HL7Exception {
158            String encoding1 = parser.getEncoding(message1);
159            String encoding2 = parser.getEncoding(message2);
160            
161            if (!encoding1.equals(encoding2)) {
162                if (encoding1.equals("XML")) {
163                    message1 = safeER7Conversion(message1);
164                } else {
165                    message2 = safeER7Conversion(message2);
166                }
167            }
168            
169            String std1, std2;
170            try {
171                std1 = standardize(message1);
172                std2 = standardize(message2);
173            } catch (SAXException e) {
174                throw new HL7Exception("Equivalence check failed due to SAXException: " + e.getMessage());
175            }
176            
177            return std1.equals(std2);
178        }
179        
180        /** 
181         * Converts XML message to ER7, first checking integrity of parse and throwing 
182         * an exception if parse not correct
183         */
184        private static String safeER7Conversion(String xmlMessage) throws HL7Exception {
185            Message m = parser.parse(xmlMessage);
186    
187            String check = parser.encode(m, "XML");
188            if (!equivalent(xmlMessage, check)) {
189                throw new HL7Exception("Parsed and encoded message not equivalent to original (possibilities: invalid message, bug in parser)");
190            }
191            
192            return parser.encode(m, "VB");        
193        }
194        
195        /** 
196         * Compares messages in two files
197         */
198        public static void main(String args[]) {
199            
200        }
201        
202    }