001 package ca.uhn.hl7v2.preparser; 002 003 import java.util.*; 004 import java.io.*; 005 import javax.xml.parsers.*; 006 import org.xml.sax.*; 007 import org.xml.sax.helpers.*; 008 009 import ca.uhn.hl7v2.HL7Exception; 010 011 public class XML 012 { 013 protected static class StopParsingException extends SAXException 014 { 015 public StopParsingException() 016 { 017 super("ca.uhn.hl7.....StopParsingException"); 018 } 019 } 020 021 /** the SAXParser reports parsing events to an object of this class. 022 We keep track of some parsing state, and the Properties object that 023 we're supposed to write our data to. 024 */ 025 static protected class HL7MessageHandler extends DefaultHandler 026 { 027 /* m_props & m_msgMask should be set by the user of this handler before 028 they pass this handler to SAXParser.parse() or whatever */ 029 030 /** The data that is found while parsing, and which passes m_msgMask, 031 will be dumped to m_props, as (DatumPath.toString() / text) key/value 032 pairs */ 033 public Properties m_props = null; 034 035 /** Specifies what parts of a message should be dumped to m_props. 036 */ 037 public Collection m_msgMask = null; 038 039 /* All other fields are parser state. */ 040 041 protected boolean m_startedDocument = false; 042 043 /* m_msgID / m_curPath together keep track of where we are in the document. 044 045 If m_msgID.length() != 0, then we're within the message element. (We're only 046 expecting one message per document.) Then m_msgID will be the name of the 047 message. ("ACK" or whatever). 048 049 m_curPath keeps track of where within the message we are. See notes at 050 DatumPath class definition. If m_curPath.size() != 0, then we must be 051 within a message. 052 053 At any point in the code below: 054 055 if m_msgID.length() == 0, 056 then m_curPath().size() == 0 057 058 if m_curPath.length() != 0 059 then m_msgID.length() != 0 060 061 Note that our DatumPaths count indices starting from 0 (not 1) -- they're 062 only converted to 1-based in the string representations that wind up 063 as m_props keys. 064 */ 065 StringBuffer m_msgID = new StringBuffer(); 066 DatumPath m_curPath = new DatumPath(); 067 068 /* the location in the document of the last datum we dumped to m_props. */ 069 DatumPath m_lastDumpedPath = new DatumPath(); 070 071 /** For handling repeat segments. segmentID (String) -> next repeat idx 072 (Integer). So when we hit a segment ZYX, we'll know how many times we've 073 hit a ZYX before, and set the segmentRepIdx part of m_curPath 074 appropriately. */ 075 TreeMap m_segmentId2nextRepIdx = new TreeMap(); 076 077 /* m_depthWithinUselessElement and m_depthWithinUsefulElement 078 reflect what m_msgMask thinks about our location in the document at any 079 given time. 080 081 Both should always be >= -1. Note that both can be >= 0 at the same time 082 -- explained in a minute.... 083 084 If m_depthWithinUsefulElement >= 0, this means that we are however deep 085 (in terms of nested elements: 0 => just within) within an area of the 086 message that passes m_msgMask. We should should dump whatever we find 087 there to m_props. As we move around within such an element, we will still 088 update m_curPath appropriately. 089 090 If m_depthWithinUsefulElement >= 0, we are however deep within an element 091 which either made no sense (eg. <ZZZ.1> where we were expecting a <ZYX.1> 092 -- a few other things maybe), or more importantly that we're within an 093 element that otherwise has no hope of having any useful elements within it 094 according to m_msgMask. (eg. m_msgMask says it wants only ZYX segment 095 contents, we're in an <MSH>). So we can safely ignore all content within, 096 and just keep track of how deep we are within this useless element (with 097 m_depthWithinUselessElement, of course.) We don't update m_curPath when 098 m_depthWithinUselessElement >= 0, there's no point and how would we 099 extract information for the DatumPath out of nonsensical element names 100 anyway. 101 102 If they are both >= 0, this means that there we've found some useless 103 stuff (nonsensical element names?) within a known-useful element. 104 */ 105 int m_depthWithinUsefulElement = -1, m_depthWithinUselessElement = -1; 106 107 /* With this we keep the text that we've found within a certain element. 108 It's cleared whenever we enter a (sub) element or leave an element. */ 109 StringBuffer m_chars = new StringBuffer(10); 110 111 public HL7MessageHandler() 112 { 113 this.clear(); 114 } 115 116 void clear() 117 { 118 // reset the state (m_props & m_msgMask are not state) 119 m_startedDocument = false; 120 m_msgID.delete(0, m_msgID.length()); 121 m_curPath.clear(); 122 // will always be "less than" (according to DatumPath.numbersLessThan) 123 // any sensible DatumPath: 124 m_lastDumpedPath.clear().add(new String()).add(-42).add(-42).add(-42).add(-42).add(-42); 125 m_segmentId2nextRepIdx.clear(); 126 m_depthWithinUsefulElement = -1; 127 m_depthWithinUselessElement = -1; 128 m_chars.delete(0, m_chars.length()); 129 } 130 131 public void startDocument() throws SAXException 132 { 133 boolean ok = false; 134 if(!m_startedDocument && (m_props != null)) { 135 m_startedDocument = true; 136 ok = true; 137 } 138 139 if(!ok) { 140 clear(); 141 throw new StopParsingException(); 142 } 143 } 144 145 public void endDocument() throws SAXException 146 { 147 boolean ok = false; 148 if(m_startedDocument) { 149 this.clear(); 150 ok = true; 151 } 152 153 if(!ok) { 154 clear(); 155 throw new StopParsingException(); 156 } 157 } 158 159 public void startElement(String uri, String localName, String qName, 160 Attributes attributes) throws SAXException 161 { 162 //System.err.println("startelem: " + qName + " curpathsize; " + 163 //m_curPath.size()); 164 boolean ok = false; 165 if(m_startedDocument) { 166 // A single unit of text data will be within a single element, 167 // -- none of it will be in sub-elements and there will be no 168 // sub-elements fragmenting the data text. 169 // Right now we're entering a new element: this means that anything 170 // in m_chars will be whitespace (likely), or text left over from, 171 // say, the last field, or text that was somewhere it shouldn't have been. 172 // (ex. "<ZYX.9> shouldn't be here <PT.1> P </PT.1> </ZYX.9>" 173 m_chars.delete(0, m_chars.length()); 174 175 if(m_depthWithinUselessElement >= 0) { 176 ++m_depthWithinUselessElement; 177 } 178 else { 179 int oldCurPathSize = m_curPath.size(); 180 if(tryToGrowDocLocationFromElementName(m_msgID, m_curPath, 181 m_segmentId2nextRepIdx, m_lastDumpedPath, qName)) 182 { 183 if(m_curPath.size() > oldCurPathSize) { 184 // assert (m_depthWithinUselessElement == -1) // m_curPath 185 // should not have grown if we're within a useless element. 186 if(m_depthWithinUsefulElement == -1) { 187 // this new element could match one of the DatumPaths in 188 // m_msgMask -- if that's the case, we've just entered a 189 // useful element. 190 // TODO: functional stylee (a la C++'s std::accumulate) ? 191 boolean curPathStartsWithAMaskElem = false; 192 for(Iterator maskIt = m_msgMask.iterator(); 193 !curPathStartsWithAMaskElem && maskIt.hasNext(); ) 194 { 195 curPathStartsWithAMaskElem 196 = m_curPath.startsWith((DatumPath)maskIt.next()); 197 } 198 199 if(curPathStartsWithAMaskElem) 200 m_depthWithinUsefulElement = 0; 201 else { 202 // so this element we're entering is not specified by m_msgMask 203 // to be useful -- but might it contains elements that 204 // are? 205 boolean aMaskElemStartsWithCurPath = false; 206 for(Iterator maskIt = m_msgMask.iterator(); 207 !aMaskElemStartsWithCurPath && maskIt.hasNext(); ) 208 { 209 aMaskElemStartsWithCurPath 210 = ((DatumPath)maskIt.next()).startsWith(m_curPath); 211 } 212 213 if(!aMaskElemStartsWithCurPath) { 214 // ... nope! useless. 215 m_depthWithinUselessElement = 0; 216 m_curPath.setSize(oldCurPathSize); 217 } // else => ok, carry on, m_depthWithinUse{less,ful}Element 218 // still both -1. 219 } 220 } 221 // else => already within a useful element, don't need to compare 222 // against m_msgMask. 223 } 224 } 225 else 226 m_depthWithinUselessElement = 0; 227 } 228 ok = true; 229 } 230 231 if(!ok) { 232 clear(); 233 throw new StopParsingException(); 234 } 235 } 236 237 /* doc location == msgID & curPath together. 238 If we've encountered an element called "elementNam", then this tries 239 to determine what it is, based on what we already know about the document. 240 returns true if we can make sense of this new element name given the 241 position we're at (represented by msgID / curPath), 242 false if we can't (which probably means this should be a useless element). 243 returning true doesn't mean that we actually changed msgID or curPath, it 244 might mean that we just passed through a segment group element OK. 245 */ 246 protected static boolean tryToGrowDocLocationFromElementName( 247 StringBuffer msgID /*in/out*/, DatumPath curPath /*in/out*/, 248 Map segmentId2nextRepIdx /*in/out*/, DatumPath lastDumpedPath /*in*/, 249 String elementName /*in*/) 250 { 251 boolean ok = false; // ok == can we make sense of this new element? 252 // hmm ... where are we in the document: 253 if((msgID.length() == 0) && (curPath.size() == 0)) { 254 // we're entering a message 255 msgID.replace(0, msgID.length(), elementName); 256 segmentId2nextRepIdx.clear(); 257 ok = true; 258 } 259 else if((msgID.length() > 0) && (curPath.size() == 0)) { 260 // we're entering either a segment-group element (eg. <ADT_A01.PROCEDURE>) 261 // or an actual segment element. 262 if(!(elementName.startsWith("" + msgID + '.'))) { 263 // must be an actual segment. 264 curPath.add(elementName); 265 266 if(segmentId2nextRepIdx.containsKey(elementName)) 267 curPath.add(segmentId2nextRepIdx.get(elementName)); 268 else 269 curPath.add(new Integer(0)); 270 271 segmentId2nextRepIdx.put(elementName, 272 new Integer(((Integer)curPath.get(curPath.size()-1)).intValue() + 1)); 273 } 274 ok = true; 275 } 276 else if((msgID.length() > 0) && (curPath.size() > 0)) { 277 // we're entering a field or a component or a subcomponent. 278 if(curPath.size() == 2) { // we're entering a field element 279 // all fields should start with segment-ID + '.' 280 if(elementName.startsWith("" + curPath.get(0) + '.')) { 281 try { 282 int fieldIdxFromElementName 283 = Integer.parseInt(elementName.substring(elementName.indexOf('.') + 1)); 284 285 curPath.add(new Integer(fieldIdxFromElementName)); 286 287 // now add the repetition idx to curPath: 288 if((lastDumpedPath.size() >= 4) 289 && (((Integer)lastDumpedPath.get(2)).intValue() 290 == fieldIdxFromElementName)) 291 { 292 // lastDumpedPath has a fieldIdx and a fieldRepIdx. 293 curPath.add(new Integer(((Integer)lastDumpedPath.get(3)).intValue() + 1)); 294 } 295 else 296 curPath.add(new Integer(0)); 297 298 ok = true; 299 } catch(NumberFormatException e) {} 300 } // else => this isn't a field -- must be useless. 301 } 302 else if((curPath.size() == 4) || (curPath.size() == 5)) { 303 // we're entering a component or subcomponent element 304 try { 305 int idxFromElementName 306 = Integer.parseInt(elementName.substring(elementName.indexOf('.') + 1)); 307 curPath.add(new Integer(idxFromElementName)); 308 ok = true; 309 } catch(NumberFormatException e) {} 310 } 311 } 312 return ok; 313 } 314 315 public void endElement(String uri, String localName, String qName) 316 throws SAXException 317 { 318 //System.err.println("endElement: " + qName); 319 boolean ok = false; 320 if(m_startedDocument) { 321 if(m_depthWithinUselessElement >= 0) { 322 --m_depthWithinUselessElement; 323 ok = true; 324 } 325 else { 326 if((m_msgID.length() > 0) && (m_curPath.size() == 0)) { 327 // we're exiting either a message element or a 328 // segment group element. 329 if((""+qName).compareTo(""+m_msgID) == 0) 330 m_msgID.delete(0, m_msgID.length()); // => exiting message element 331 // else => segment group element -- do nothing. 332 333 ok = true; 334 } 335 else if((m_msgID.length() > 0) && (m_curPath.size() > 0)) { 336 tryToDumpDataToProps(); 337 338 if(m_curPath.size() == 2) { 339 // exiting a segment element 340 m_curPath.setSize(0); 341 ok = true; 342 } 343 else if(m_curPath.size() == 4) { 344 // exiting a field element 345 m_curPath.setSize(2); 346 ok = true; 347 } 348 else if((m_curPath.size() == 5) || (m_curPath.size() == 6)) { 349 // exiting a component or a subcomponent 350 m_curPath.setSize(m_curPath.size() - 1); 351 ok = true; 352 } 353 } 354 355 if(m_depthWithinUsefulElement >= 0) 356 --m_depthWithinUsefulElement; 357 } 358 } 359 360 if(!ok) { 361 clear(); 362 throw new StopParsingException(); 363 } 364 } 365 366 /** try to dump whatever we've got in m_chars to m_props, 367 with a key of m_curPath.toString(). 368 */ 369 protected void tryToDumpDataToProps() 370 { 371 if((m_curPath.size() >= 2) && (m_depthWithinUselessElement == -1)) { 372 /* m_curPath.toString() will be the property key whose value will be 373 m_chars. 374 375 This is (part of) what m_lastDumpedPath is for: With, for example "<ZYX.9> 376 <PT.1>P</PT.1> </ZYX.9>" we might have had a m_curPath containing something 377 like [ZYX, 0, 9, 0, 0] when we exited the PT.1 element. (note: internal 378 DatumPath elements are 0-indexed, string representations of DatumPaths and 379 the XML text is 1-indexed.) So in m_props the key for "P" would have been 380 "ZYX[0]-9[0]-1-1". (the last "-1" is a default that got added by 381 toString()). 382 383 Then we would have exited the PT.3 element, changed m_curPath to [ZYX, 0, 384 9, 0], picked up the whitespace between </PT.3> and </ZYX.9>, and when 385 exiting the ZYX.9 element, we might have written that whitespace to m_props 386 with a key of the toString() of [ZYX, 0, 9, 0]; that is, "ZYX[0]-9[0]-1-1": 387 the same as the key for the "P" ... clobbering "P" in m_props with 388 whitespace. 389 390 But since we know that HL7 fields / components / etc are always in order 391 (numerically), we can count on m_lastDumpedPath and use 392 DatumPath.numbersLessThan to avoid the clobbering. 393 */ 394 if((m_lastDumpedPath.get(0).equals(m_curPath.get(0))) 395 ? (m_lastDumpedPath.numbersLessThan(m_curPath)) 396 : true) 397 { 398 if(m_depthWithinUsefulElement >= 0) { 399 // TODO: remove! or assert 400 if(m_props.containsKey("" + m_curPath)) 401 System.err.println("ALAAAARM: CLOBBERING PROPERTY in " + getClass()); 402 403 m_props.setProperty("" + m_curPath, "" + m_chars); 404 m_lastDumpedPath.copy(m_curPath); 405 m_chars.delete(0, m_chars.length()); 406 } 407 } 408 } 409 } 410 411 public void characters(char[] chars, int start, int length) 412 { 413 // note that a contiguous run of characters in the document 414 // might get reported to us in several chunks. 415 // (In the order that the text appears in the document, 416 // non-overlapping and with no gaps between chunks.) 417 // An entity like & will reach us as an actual & character. 418 419 if((m_msgID.length() > 0) && (m_curPath.size() >= 4)) { 420 m_chars.append(chars, start, length); 421 } 422 } 423 424 public void ignoreableWhitespace(char []chars, int start, int length) 425 { 426 // it's unclear which whitespace is considered ignorable for us. 427 // what the heck, add it to m_chars. 428 characters(chars, start, length); 429 } 430 431 public void error(SAXParseException e) 432 { 433 // TODO: remove. 434 System.err.println("Error in " + getClass() + ": " + e); 435 } 436 437 public void fatalError(SAXParseException e) throws SAXException 438 { 439 throw e; 440 } 441 } 442 443 /** parse message according to our HL7 XML handler, and dump the data found 444 to props. 445 446 returns true if we parsed ok, which means well-formed XML, and 447 that's about it. We just barely check against HL7 structure, and ignore any 448 elements / text that is unexpected (that is, impossible in any HL7 message: 449 independant of any message / segment definitions). 450 451 "message" should be an XML document with one top-level element -- that being 452 the message. (<ACK> or whatever). We're only expecting one message to be in 453 "message". 454 455 props can be null if you don't want the data (we still parse). The message 456 data found in message (that passes msgMask) will be added to props as key / 457 value pairs with the key a toString() of the appropriate DatumPath for the 458 location where the data is found (i.e. in the ZYX[a]-b[c]-d-e style), and 459 the value the corresponding text. So, after calling parseMessage 460 successfully, if you wanted to retrieve the message data from props you 461 might call something like 462 props.getProperty((new DatumPath()).add("MSH").add(1).toString()) 463 and that would return a String with "|", probably. 464 465 Note that this package facilitates the extraction of message data in a way 466 independent of message version (i.e. components and whatever getting added): 467 468 With a message of "<FOO><ZYX><ZYX.42>fieldy-field-field</ZYX.42></ZYX></FOO>", 469 "ZYX[0]-1[0]-1-1" will be the key that ends up in props (see notes at 470 DatumPath.toString()) 471 472 So if you, coding for a future version of the FOO message but 473 recieving old-version message data, tried 474 props.getProperty((new DatumPath()).add("ZYX").add(0).add(42).add(0).add(1).toString()) 475 with the message above (that is, trying to extract a repetition and 476 component that aren't there), you would get "ZYX[0]-42[0]-1-1" mapping to 477 "fieldy-field-field" in the resulting props. 478 479 If the message was 480 "<FOO><ZYX><ZYX.42><ARG.1>component data</ARG.1></ZYX.42></ZYX></FOO>" 481 and you, coding for an old version of this FOO message but recieving 482 new-version FOO message data, tried 483 props.getProperty((new DatumPath()).add("ZYX").add(0).add(42).toString()) 484 you would get "ZYX[0]-42[0]-1-1" mapping to "component data" in the resulting 485 props. 486 487 msgMask lets you specify which parts of the message you want dumped to props. 488 Passing in null gets you everything. Otherwise, msgMask's elements should 489 all be DatumPaths (! => ClassCastException), and a particular part of the 490 message will be dumped to props only if it's location, as represented by a 491 DatumPath, startsWith (as in DatumPath.startsWith()) at least one element of 492 msgMask. So if one element of msgMask was a (new DatumPath()).add(new 493 String("ZYX")), then everything in all ZYX segment would get dumped to props. 494 A (new DatumPath()).add(new String("ZYX")).add(1) would get only the first 495 repetitions of same (if there is one) dumped to props. etc. etc. Note that 496 a DatumPath of size() == 0 in msgMask will get you everything, no matter what 497 the other elements of msgMask are, because all DatumPaths startsWith the 498 zero-length DatumPath. 499 500 Segment group elements (eg. ADT_A01.PROCEDURE) are handled fine, but they 501 aren't addressed in msgMask or in the output in props -- basically any 502 element tags at the level immediately inside the message element, and having 503 a name that starts with the message element name + '.', is ignored (meaning 504 it's contents are dealt with the same as if the start and end tags' just 505 wasn't there.) 506 */ 507 public static boolean parseMessage(Properties props, String message, 508 Collection msgMask) throws HL7Exception 509 { 510 boolean ret = false; 511 try { 512 SAXParserFactory factory = SAXParserFactory.newInstance(); 513 SAXParser parser = factory.newSAXParser(); 514 515 InputSource inSrc = new InputSource(new java.io.StringReader(message)); 516 517 HL7MessageHandler handler = new HL7MessageHandler(); 518 handler.m_props = (props != null 519 ? props : new Properties()); // it's expecting a props. 520 521 if(msgMask != null) 522 handler.m_msgMask = msgMask; 523 else { 524 handler.m_msgMask = new Vector(); 525 handler.m_msgMask.add(new DatumPath()); 526 } 527 528 parser.parse(inSrc, handler); 529 ret = true; 530 } catch (ParserConfigurationException e) { 531 throw new HL7Exception(e); 532 } catch (IOException e) { 533 throw new HL7Exception(e); 534 } catch (StopParsingException e) { 535 throw new HL7Exception(e); 536 } catch (SAXException e) { 537 throw new HL7Exception(e); 538 } 539 540 return ret; 541 } 542 543 public static void main(String args[]) 544 { 545 if(args.length >= 1) { 546 Properties props = new Properties(); 547 Vector msgMask = new Vector(); 548 msgMask.add((new DatumPath()).add("MSH").add(0).add(9)); 549 //msgMask.add(new DatumPath()); 550 boolean parseret; 551 try { 552 parseret = XML.parseMessage(props, args[0], msgMask); 553 System.err.println("parseMessage returned " + parseret); 554 } catch (HL7Exception e) { 555 e.printStackTrace(); 556 } 557 props.list(System.err); 558 } 559 } 560 } 561