package ca.uhn.hl7v2.util;

import ca.uhn.hl7v2.parser.*;
import ca.uhn.hl7v2.model.Message;
import ca.uhn.hl7v2.HL7Exception;
import java.util.regex.*;
import org.w3c.dom.*;
import org.xml.sax.SAXException;
import org.xml.sax.InputSource;
import org.apache.xml.serialize.*;
import org.apache.xerces.parsers.DOMParser;
import org.apache.xerces.parsers.StandardParserConfiguration;
import java.io.*;

/**
 * Tools for testing message strings for semantic equivalence without assuming the
 * correctness of parsers.
 * @author Bryan Tripp
 */
public class EncodedMessageComparator {

    /** Shared parser, used to detect the encoding of a message and to convert XML to ER7. */
    private static final GenericParser parser = new GenericParser();

    /**
     * Returns a "standardized" equivalent of the given message string.  For delimited
     * messages, the result comes from {@link #standardizeER7(String)}; for XML-encoded
     * messages, it comes from {@link #standardizeXML(String)}.  The encoding is determined
     * automatically by asking the shared parser.
     *
     * @param message an XML-encoded or ER7-encoded message string
     * @return the standardized form of the message
     * @throws SAXException if the message is treated as XML and cannot be parsed
     * @throws IllegalArgumentException if the parser reports no encoding for the message
     */
    public static String standardize(String message) throws SAXException {
        String encoding = parser.getEncoding(message);
        if (encoding == null) {
            // Fail with a meaningful message instead of a bare NullPointerException.
            throw new IllegalArgumentException("Can't determine the encoding of the given message");
        }
        if (encoding.equals("XML")) {
            return standardizeXML(message);
        }
        return standardizeER7(message);
    }

    /**
     * Returns a shortened string that is semantically equivalent to a given ER7-encoded
     * message string.  The delimiters are read from fixed positions in the message
     * (index 3: field, 4: component, 5: repetition, 7: subcomponent), and redundant
     * trailing delimiters are stripped cumulatively:
     * <ul>
     * <li>all delimiters and repetition separators before a segment terminator (every run
     *     of CR/LF characters is also collapsed to a single carriage return)</li>
     * <li>repetition separators, component and subcomponent delimiters before a field
     *     delimiter</li>
     * <li>subcomponent delimiters before a component delimiter</li>
     * </ul>
     * Trailing delimiters of a final segment that has no line terminator are left alone.
     *
     * @param message an ER7-encoded message; must be at least 8 characters long
     * @return the standardized message
     */
    public static String standardizeER7(String message) {
        char fieldDelimChar = message.charAt(3);
        char compSepChar = message.charAt(4);
        char repSepChar = message.charAt(5);
        char subSepChar = message.charAt(7);

        String fieldDelim = quoteDelimiter(fieldDelimChar);
        String compSep = quoteDelimiter(compSepChar);
        String repSep = quoteDelimiter(repSepChar);
        String subSep = quoteDelimiter(subSepChar);

        // The quoted forms double as replacement strings: in a replacement, a backslash
        // makes the following character literal, so "\|" yields "|".
        Pattern endSegment = Pattern.compile("[" + fieldDelim + compSep + repSep + subSep + "]*[\n\r]+");
        message = endSegment.matcher(message).replaceAll("\r");

        Pattern endField = Pattern.compile("[" + repSep + compSep + subSep + "]*" + fieldDelim);
        message = endField.matcher(message).replaceAll(fieldDelim);

        Pattern endComp = Pattern.compile("[" + subSep + "]*" + compSep);
        message = endComp.matcher(message).replaceAll(compSep);

        // Special case: the subcomponent delimiter declared among the encoding characters
        // sits right before a field delimiter and is normally stripped above, so it has to
        // be put back.  Only do so when it is actually gone; otherwise (for instance when
        // another encoding character follows it) it would end up duplicated.
        if (message.length() <= 7 || message.charAt(7) != subSepChar) {
            message = message.substring(0, 7) + subSepChar + message.substring(7);
        }

        return message;
    }

    /**
     * Returns the given delimiter in a form usable inside a regular expression: letters
     * and digits are returned as-is (a backslash before them could form an escape
     * construct), anything else is prefixed with a backslash.
     */
    private static String quoteDelimiter(char delim) {
        if (Character.isLetterOrDigit(delim)) {
            return String.valueOf(delim);
        }
        return "\\" + delim;
    }

    /**
     * Returns a semantic equivalent of a given XML-encoded message in a default format.
     * Attributes, comments, and processing instructions are not considered to change the
     * HL7 meaning of the message, and are removed in the standardized representation.
     * Ignorable whitespace is excluded from the parsed document and the output is
     * serialized with indenting switched on.
     *
     * @param message an XML-encoded message string
     * @return the standardized XML
     * @throws SAXException if the message cannot be parsed or the parser feature is rejected
     */
    public static String standardizeXML(String message) throws SAXException {
        // NOTE(review): DTD / external entity processing is left at the parser's defaults;
        // confirm whether untrusted input can reach this method.
        DOMParser domParser = new DOMParser(new StandardParserConfiguration());
        domParser.setFeature("http://apache.org/xml/features/dom/include-ignorable-whitespace", false);

        StringWriter out = new StringWriter();
        try {
            // The parser instance is local to this call, so no locking is needed.
            domParser.parse(new InputSource(new StringReader(message)));
            Document doc = domParser.getDocument();

            clean(doc.getDocumentElement());

            OutputFormat outputFormat = new OutputFormat("", null, true);
            XMLSerializer ser = new XMLSerializer(out, outputFormat);
            ser.serialize(doc);
        } catch (IOException e) {
            // Keep the original exception as the cause so the stack trace is not lost.
            throw new RuntimeException("IOException doing IO to a string!!! " + e.getMessage(), e);
        }
        return out.toString();
    }

    /**
     * Removes attributes, comments, and processing instructions from the given element
     * and, recursively, from all of its descendant elements.
     */
    private static void clean(Element elem) {
        // Walk the sibling chain rather than indexing the live child list: removing a
        // node shifts the indexes of the nodes after it, which would cause the node
        // following each removed one to be skipped.
        Node child = elem.getFirstChild();
        while (child != null) {
            Node next = child.getNextSibling(); // grab before child is possibly detached
            short type = child.getNodeType();
            if (type == Node.PROCESSING_INSTRUCTION_NODE || type == Node.COMMENT_NODE) {
                elem.removeChild(child);
            } else if (type == Node.ELEMENT_NODE) {
                clean((Element) child);
            }
            child = next;
        }

        // Snapshot the attribute names first, then remove by name, so that the removal
        // does not disturb the iteration over the attribute map.
        NamedNodeMap attributes = elem.getAttributes();
        String[] names = new String[attributes.getLength()];
        for (int i = 0; i < names.length; i++) {
            names[i] = attributes.item(i).getNodeName();
        }
        for (int i = 0; i < names.length; i++) {
            attributes.removeNamedItem(names[i]);
        }
    }

    /**
     * <p>Compares two HL7 messages to see if they are equivalent (in terms of their
     * HL7 meaning).  Semantically irrelevant differences (e.g. spaces in an XML tag;
     * extra field delimiters at the end of a segment; XML vs. ER7 encoding; XML attributes)
     * are ignored.  When both messages use the same encoding, the check is performed
     * without relying on the correctness of the HAPI parsers (apart from encoding
     * detection), and can therefore be used to test them.</p>
     * <p>If one message is in XML and the other in ER7, the former is converted to ER7 to
     * perform the comparison.  This process relies on the HAPI parsers.  However, the
     * parsed message is first encoded as XML and compared to the original, so that the
     * integrity of the parser can be verified.  An exception is thrown if this comparison
     * is unsuccessful.</p>
     *
     * @param message1 the first message (XML or ER7)
     * @param message2 the second message (XML or ER7)
     * @return true if given messages are semantically equivalent
     * @throws HL7Exception if the encoding of a message can't be determined, if an XML
     *      message can't be parsed, or if the XML-to-ER7 conversion fails its integrity check
     */
    public static boolean equivalent(String message1, String message2) throws HL7Exception {
        String encoding1 = parser.getEncoding(message1);
        String encoding2 = parser.getEncoding(message2);
        if (encoding1 == null || encoding2 == null) {
            // Report the problem explicitly instead of failing with a NullPointerException.
            throw new HL7Exception("Equivalence check failed: can't determine the encoding of the "
                + (encoding1 == null ? "first" : "second") + " message");
        }

        // Mixed encodings: bring the XML one over to ER7 so both can be compared as text.
        if (!encoding1.equals(encoding2)) {
            if (encoding1.equals("XML")) {
                message1 = safeER7Conversion(message1);
            } else {
                message2 = safeER7Conversion(message2);
            }
        }

        String std1;
        String std2;
        try {
            std1 = standardize(message1);
            std2 = standardize(message2);
        } catch (SAXException e) {
            throw new HL7Exception("Equivalence check failed due to SAXException: " + e.getMessage(), e);
        }

        return std1.equals(std2);
    }

    /**
     * Converts XML message to ER7, first checking integrity of parse and throwing
     * an exception if parse not correct.  The integrity check re-encodes the parsed
     * message as XML and requires it to be {@link #equivalent(String, String) equivalent}
     * to the original.
     *
     * @param xmlMessage an XML-encoded message
     * @return the message as encoded by the parser with the "VB" encoding
     * @throws HL7Exception if parsing or encoding fails, or the round trip is not equivalent
     */
    private static String safeER7Conversion(String xmlMessage) throws HL7Exception {
        Message m = parser.parse(xmlMessage);

        String check = parser.encode(m, "XML");
        if (!equivalent(xmlMessage, check)) {
            throw new HL7Exception("Parsed and encoded message not equivalent to original (possibilities: invalid message, bug in parser)");
        }

        return parser.encode(m, "VB");
    }

    /**
     * Intended to compare messages in two files; currently an empty stub that does nothing.
     */
    public static void main(String[] args) {

    }

}