001    package ca.uhn.hl7v2.preparser;
002    
003    import java.util.*;
004    import java.io.*;
005    import javax.xml.parsers.*;
006    import org.xml.sax.*;
007    import org.xml.sax.helpers.*;
008    
009    import ca.uhn.hl7v2.HL7Exception;
010    
011    public class XML
012    {
013            protected static class StopParsingException extends SAXException
014            {
015                    public StopParsingException() 
016                    {
017                            super("ca.uhn.hl7.....StopParsingException");
018                    }
019            }
020    
021            /** the SAXParser reports parsing events to an object of this class.
022            We keep track of some parsing state, and the Properties object that 
023            we're supposed to write our data to.
024            */
025            static protected class HL7MessageHandler extends DefaultHandler 
026            {
027                    /* m_props & m_msgMask should be set by the user of this handler before
028                    they pass this handler to SAXParser.parse() or whatever */
029    
030                    /** The data that is found while parsing, and which passes m_msgMask, 
031                    will be dumped to m_props, as (DatumPath.toString() / text) key/value
032                    pairs */
033                    public Properties m_props = null;
034    
035                    /** Specifies what parts of a message should be dumped to m_props. 
036                    */
037                    public Collection m_msgMask = null;
038    
039                    /* All other fields are parser state. */
040    
041                    protected boolean m_startedDocument = false;
042    
043                    /* m_msgID / m_curPath together keep track of where we are in the document.
044    
045                    If m_msgID.length() != 0, then we're within the message element.  (We're only
046                    expecting one message per document.)  Then m_msgID will be the name of the 
047                    message.  ("ACK" or whatever).  
048    
049                    m_curPath keeps track of where within the message we are.  See notes at 
050                    DatumPath class definition.  If m_curPath.size() != 0, then we must be 
051                    within a message.
052    
053                    At any point in the code below: 
054    
055                    if m_msgID.length() == 0, 
056                            then m_curPath.size() == 0
057    
058                    if m_curPath.size() != 0
059                            then m_msgID.length() != 0
060                    
061                    Note that our DatumPaths count indices starting from 0 (not 1) -- they're 
062                    only converted to 1-based in the string representations that wind up 
063                    as m_props keys.
064                    */
065                    StringBuffer m_msgID = new StringBuffer();
066                    DatumPath m_curPath = new DatumPath();
067    
068                    /* the location in the document of the last datum we dumped to m_props. */
069                    DatumPath m_lastDumpedPath = new DatumPath();
070    
071                    /** For handling repeat segments.   segmentID (String) -> next repeat idx
072                    (Integer).  So when we hit a segment ZYX, we'll know how many times we've
073                    hit a ZYX before, and set the segmentRepIdx part of m_curPath
074                    appropriately. */
075                    TreeMap m_segmentId2nextRepIdx = new TreeMap();
076    
077                    /* m_depthWithinUselessElement and m_depthWithinUsefulElement 
078                    reflect what m_msgMask thinks about our location in the document at any
079                    given time.  
080    
081                    Both should always be >= -1.  Note that both can be >= 0 at the same time
082                    -- explained in a minute....
083    
084                    If m_depthWithinUsefulElement >= 0, this means that we are however deep
085                    (in terms of nested elements: 0 => just within) within an area of the
086                    message that passes m_msgMask.  We should dump whatever we find
087                    there to m_props.  As we move around within such an element, we will still
088                    update m_curPath appropriately.
089    
090                    If m_depthWithinUselessElement >= 0, we are however deep within an element
091                    which either made no sense (eg. <ZZZ.1> where we were expecting a <ZYX.1>
092                    -- a few other things maybe), or more importantly that we're within an
093                    element that otherwise has no hope of having any useful elements within it
094                    according to m_msgMask.  (eg. m_msgMask says it wants only ZYX segment
095                    contents, we're in an <MSH>).  So we can safely ignore all content within,
096                    and just keep track of how deep we are within this useless element (with
097                    m_depthWithinUselessElement, of course.)  We don't update m_curPath when
098                    m_depthWithinUselessElement >= 0, there's no point and how would we
099                    extract information for the DatumPath out of nonsensical element names
100                    anyway.
101    
102                    If they are both >= 0, this means that we've found some useless
103                    stuff (nonsensical element names?) within a known-useful element.
104                    */
105                    int m_depthWithinUsefulElement = -1, m_depthWithinUselessElement = -1;
106    
107                    /* With this we keep the text that we've found within a certain element.
108                    It's cleared whenever we enter a (sub) element or leave an element. */
109                    StringBuffer m_chars = new StringBuffer(10);
110    
111                    public HL7MessageHandler()
112                    {
113                            this.clear();
114                    }
115    
116                    void clear()
117                    {
118                            // reset the state (m_props & m_msgMask are not state)
119                            m_startedDocument = false;
120                            m_msgID.delete(0, m_msgID.length());
121                            m_curPath.clear();
122                            // will always be "less than" (according to DatumPath.numbersLessThan)
123                            // any sensible DatumPath: 
124                            m_lastDumpedPath.clear().add(new String()).add(-42).add(-42).add(-42).add(-42).add(-42);
125                            m_segmentId2nextRepIdx.clear();
126                            m_depthWithinUsefulElement = -1;
127                            m_depthWithinUselessElement = -1;
128                            m_chars.delete(0, m_chars.length());
129                    }
130    
131                    public void startDocument() throws SAXException
132                    {
133                            boolean ok = false;
134                            if(!m_startedDocument && (m_props != null)) {
135                                    m_startedDocument = true;
136                                    ok = true;
137                            }
138    
139                            if(!ok) {
140                                    clear();
141                                    throw new StopParsingException();
142                            }
143                    }
144    
145                    public void endDocument() throws SAXException
146                    {
147                            boolean ok = false;
148                            if(m_startedDocument) {
149                                    this.clear();
150                                    ok = true;
151                            }
152    
153                            if(!ok) {
154                                    clear();
155                                    throw new StopParsingException();
156                            }
157                    }
158    
159                    public void startElement(String uri, String localName, String qName, 
160                                    Attributes attributes) throws SAXException 
161                    {
162                            //System.err.println("startelem: " + qName + " curpathsize; " +
163                            //m_curPath.size());
164                            boolean ok = false;
165                            if(m_startedDocument) {
166                                    // A single unit of text data will be within a single element, 
167                                    // -- none of it will be in sub-elements and there will be no 
168                                    // sub-elements fragmenting the data text.
169                                    // Right now we're entering a new element: this means that anything
170                                    // in m_chars will be whitespace (likely), or text left over from, 
171                                    // say, the last field, or text that was somewhere it shouldn't have been.
172                                    // (ex. "<ZYX.9> shouldn't be here <PT.1> P </PT.1> </ZYX.9>"
173                                    m_chars.delete(0, m_chars.length());
174    
175                                    if(m_depthWithinUselessElement >= 0) {
176                                            ++m_depthWithinUselessElement;
177                                    }
178                                    else {
179                                            int oldCurPathSize = m_curPath.size();
180                                            if(tryToGrowDocLocationFromElementName(m_msgID, m_curPath, 
181                                                    m_segmentId2nextRepIdx, m_lastDumpedPath, qName)) 
182                                            {
183                                                    if(m_curPath.size() > oldCurPathSize) {
184                                                            // assert (m_depthWithinUselessElement == -1) // m_curPath
185                                                            // should not have grown if we're within a useless element.
186                                                            if(m_depthWithinUsefulElement == -1) {
187                                                                    // this new element could match one of the DatumPaths in
188                                                                    // m_msgMask -- if that's the case, we've just entered a
189                                                                    // useful element.
190                                                                    // TODO: functional stylee (a la C++'s std::accumulate) ? 
191                                                                    boolean curPathStartsWithAMaskElem = false;
192                                                                    for(Iterator maskIt = m_msgMask.iterator(); 
193                                                                            !curPathStartsWithAMaskElem && maskIt.hasNext(); )
194                                                                    {
195                                                                            curPathStartsWithAMaskElem 
196                                                                                    = m_curPath.startsWith((DatumPath)maskIt.next());
197                                                                    }
198    
199                                                                    if(curPathStartsWithAMaskElem) 
200                                                                            m_depthWithinUsefulElement = 0;
201                                                                    else {
202                                                                            // so this element we're entering is not specified by m_msgMask
203                                                                            // to be useful -- but might it contains elements that
204                                                                            // are?
205                                                                            boolean aMaskElemStartsWithCurPath = false;
206                                                                            for(Iterator maskIt = m_msgMask.iterator(); 
207                                                                                    !aMaskElemStartsWithCurPath && maskIt.hasNext(); )
208                                                                            {
209                                                                                    aMaskElemStartsWithCurPath 
210                                                                                            = ((DatumPath)maskIt.next()).startsWith(m_curPath);
211                                                                            }
212    
213                                                                            if(!aMaskElemStartsWithCurPath) {
214                                                                                    // ... nope!  useless.
215                                                                                    m_depthWithinUselessElement = 0;
216                                                                                    m_curPath.setSize(oldCurPathSize);
217                                                                            } // else => ok, carry on, m_depthWithinUse{less,ful}Element
218                                                                            // still both -1.
219                                                                    }
220                                                            }
221                                                            // else => already within a useful element, don't need to compare 
222                                                            // against m_msgMask.
223                                                    }
224                                            }
225                                            else
226                                                    m_depthWithinUselessElement = 0;
227                                    }
228                                    ok = true;
229                            }
230    
231                            if(!ok) {
232                                    clear();
233                                    throw new StopParsingException();
234                            }
235                    }
236    
237                    /* doc location == msgID & curPath together.  
238                    If we've encountered an element called "elementNam", then this tries 
239                    to determine what it is, based on what we already know about the document.
240                    returns true if we can make sense of this new element name given the
241                    position we're at (represented by msgID / curPath), 
242                    false if we can't (which probably means this should be a useless element). 
243                    returning true doesn't mean that we actually changed msgID or curPath, it
244                    might mean that we just passed through a segment group element OK.
245                    */
246                    protected static boolean tryToGrowDocLocationFromElementName(
247                            StringBuffer msgID /*in/out*/, DatumPath curPath /*in/out*/, 
248                            Map segmentId2nextRepIdx /*in/out*/, DatumPath lastDumpedPath /*in*/, 
249                            String elementName /*in*/)
250                    {
251                            boolean ok = false; // ok == can we make sense of this new element?
252                            // hmm ... where are we in the document: 
253                            if((msgID.length() == 0) && (curPath.size() == 0)) {
254                                    // we're entering a message
255                                    msgID.replace(0, msgID.length(), elementName);
256                                    segmentId2nextRepIdx.clear();
257                                    ok = true;
258                            }
259                            else if((msgID.length() > 0) && (curPath.size() == 0)) {
260                                    // we're entering either a segment-group element (eg. <ADT_A01.PROCEDURE>)
261                                    // or an actual segment element.
262                                    if(!(elementName.startsWith("" + msgID + '.'))) {
263                                            // must be an actual segment.
264                                            curPath.add(elementName);
265    
266                                            if(segmentId2nextRepIdx.containsKey(elementName)) 
267                                                    curPath.add(segmentId2nextRepIdx.get(elementName));
268                                            else
269                                                    curPath.add(new Integer(0));
270    
271                                            segmentId2nextRepIdx.put(elementName, 
272                                                    new Integer(((Integer)curPath.get(curPath.size()-1)).intValue() + 1));
273                                    }
274                                    ok = true;
275                            }
276                            else if((msgID.length() > 0) && (curPath.size() > 0)) {
277                                    // we're entering a field or a component or a subcomponent.
278                                    if(curPath.size() == 2) { // we're entering a field element
279                                            // all fields should start with segment-ID + '.' 
280                                            if(elementName.startsWith("" + curPath.get(0) + '.')) {
281                                                    try {
282                                                            int fieldIdxFromElementName 
283                                                                    = Integer.parseInt(elementName.substring(elementName.indexOf('.') + 1));
284    
285                                                            curPath.add(new Integer(fieldIdxFromElementName));
286    
287                                                            // now add the repetition idx to curPath: 
288                                                            if((lastDumpedPath.size() >= 4) 
289                                                                    && (((Integer)lastDumpedPath.get(2)).intValue() 
290                                                                            == fieldIdxFromElementName))
291                                                            {
292                                                                    // lastDumpedPath has a fieldIdx and a fieldRepIdx.
293                                                                    curPath.add(new Integer(((Integer)lastDumpedPath.get(3)).intValue() + 1));
294                                                            }
295                                                            else
296                                                                    curPath.add(new Integer(0));
297    
298                                                            ok = true;
299                                                    } catch(NumberFormatException e) {}
300                                            } // else => this isn't a field -- must be useless.
301                                    }
302                                    else if((curPath.size() == 4) || (curPath.size() == 5)) {
303                                            // we're entering a component or subcomponent element
304                                            try {
305                                                    int idxFromElementName 
306                                                            = Integer.parseInt(elementName.substring(elementName.indexOf('.') + 1));
307                                                    curPath.add(new Integer(idxFromElementName));
308                                                    ok = true;
309                                            } catch(NumberFormatException e) {}
310                                    }
311                            }
312                            return ok;
313                    }
314    
315                    public void endElement(String uri, String localName, String qName) 
316                            throws SAXException 
317                    {
318                            //System.err.println("endElement: " + qName);
319                            boolean ok = false;
320                            if(m_startedDocument) {
321                                    if(m_depthWithinUselessElement >= 0) {
322                                            --m_depthWithinUselessElement;
323                                            ok = true;
324                                    }
325                                    else {
326                                            if((m_msgID.length() > 0) && (m_curPath.size() == 0)) {
327                                                    // we're exiting either a message element or a 
328                                                    // segment group element.
329                                                    if((""+qName).compareTo(""+m_msgID) == 0)
330                                                            m_msgID.delete(0, m_msgID.length()); // => exiting message element
331                                                    // else => segment group element -- do nothing.
332    
333                                                    ok = true;
334                                            }
335                                            else if((m_msgID.length() > 0) && (m_curPath.size() > 0)) {
336                                                    tryToDumpDataToProps();
337    
338                                                    if(m_curPath.size() == 2) {
339                                                            // exiting a segment element
340                                                            m_curPath.setSize(0);
341                                                            ok = true;
342                                                    }
343                                                    else if(m_curPath.size() == 4) {
344                                                            // exiting a field element 
345                                                            m_curPath.setSize(2);
346                                                            ok = true;
347                                                    }
348                                                    else if((m_curPath.size() == 5) || (m_curPath.size() == 6)) {
349                                                            // exiting a component or a subcomponent
350                                                            m_curPath.setSize(m_curPath.size() - 1);
351                                                            ok = true;
352                                                    }
353                                            }
354    
355                                            if(m_depthWithinUsefulElement >= 0) 
356                                                    --m_depthWithinUsefulElement;
357                                    }
358                            }
359    
360                            if(!ok) {
361                                    clear();
362                                    throw new StopParsingException();
363                            }
364                    }
365    
366                    /** try to dump whatever we've got in m_chars to m_props, 
367                    with a key of m_curPath.toString(). 
368                    */
369                    protected void tryToDumpDataToProps()
370                    {
371                            if((m_curPath.size() >= 2) && (m_depthWithinUselessElement == -1)) {
372                                    /* m_curPath.toString() will be the property key whose value will be
373                                    m_chars.
374    
375                                    This is (part of) what m_lastDumpedPath is for: With, for example "<ZYX.9>
376                                    <PT.1>P</PT.1> </ZYX.9>" we might have had a m_curPath containing something
377                                    like [ZYX, 0, 9, 0, 0] when we exited the PT.1 element.  (note: internal
378                                    DatumPath elements are 0-indexed, string representations of DatumPaths and
379                                    the XML text is 1-indexed.)  So in m_props the key for "P" would have been
380                                    "ZYX[0]-9[0]-1-1".  (the last "-1" is a default that got added by
381                                    toString()).
382                                    
383                                    Then we would have exited the PT.3 element, changed m_curPath to [ZYX, 0,
384                                    9, 0], picked up the whitespace between </PT.3> and </ZYX.9>, and when
385                                    exiting the ZYX.9 element, we might have written that whitespace to m_props
386                                    with a key of the toString() of [ZYX, 0, 9, 0]; that is, "ZYX[0]-9[0]-1-1":
387                                    the same as the key for the "P" ... clobbering "P" in m_props with
388                                    whitespace.
389    
390                                    But since we know that HL7 fields / components / etc are always in order
391                                    (numerically), we can count on m_lastDumpedPath and use
392                                    DatumPath.numbersLessThan to avoid the clobbering.
393                                    */
394                                    if((m_lastDumpedPath.get(0).equals(m_curPath.get(0))) 
395                                                    ? (m_lastDumpedPath.numbersLessThan(m_curPath)) 
396                                                    : true)
397                                    {
398                                            if(m_depthWithinUsefulElement >= 0) {
399                                                    // TODO: remove!  or assert 
400                                                    if(m_props.containsKey("" + m_curPath)) 
401                                                            System.err.println("ALAAAARM: CLOBBERING PROPERTY in " + getClass());
402    
403                                                    m_props.setProperty("" + m_curPath, "" + m_chars);
404                                                    m_lastDumpedPath.copy(m_curPath);
405                                                    m_chars.delete(0, m_chars.length());
406                                            }
407                                    }
408                            }
409                    }
410    
411                    public void characters(char[] chars, int start, int length)
412                    {
413                            // note that a contiguous run of characters in the document 
414                            // might get reported to us in several chunks. 
415                            // (In the order that the text appears in the document, 
416                            // non-overlapping and with no gaps between chunks.) 
417                            // An entity like &amp; will reach us as an actual & character.
418                            
419                            if((m_msgID.length() > 0) && (m_curPath.size() >= 4)) {
420                                    m_chars.append(chars, start, length);
421                            }
422                    }
423    
424                    public void ignoreableWhitespace(char []chars, int start, int length)
425                    {
426                            // it's unclear which whitespace is considered ignorable for us.  
427                            // what the heck, add it to m_chars. 
428                            characters(chars, start, length);
429                    }
430    
431                    public void error(SAXParseException e)
432                    {
433                            // TODO: remove.
434                            System.err.println("Error in " + getClass() + ": " + e);
435                    }
436    
437                    public void fatalError(SAXParseException e) throws SAXException 
438                    {
439                            throw e;
440                    }
441            }
442    
443            /** parse message according to our HL7 XML handler, and dump the data found
444            to props.  
445            
446            returns true if we parsed ok, which means well-formed XML, and
447            that's about it.  We just barely check against HL7 structure, and ignore any
448            elements / text that is unexpected (that is, impossible in any HL7 message:
449            independent of any message / segment definitions).
450    
451            "message" should be an XML document with one top-level element -- that being
452            the message.  (<ACK> or whatever).  We're only expecting one message to be in
453            "message".
454    
455            props can be null if you don't want the data (we still parse).  The message
456            data found in message (that passes msgMask) will be added to props as key /
457            value pairs with the key a toString() of the appropriate DatumPath for the
458            location where the data is found (i.e. in the ZYX[a]-b[c]-d-e style), and
459            the value the corresponding text.  So, after calling parseMessage
460            successfully, if you wanted to retrieve the message data from props you
461            might call something like 
462            props.getProperty((new DatumPath()).add("MSH").add(1).toString())
463            and that would return a String with "|", probably.
464    
465            Note that this package facilitates the extraction of message data in a way
466            independent of message version (i.e. components and whatever getting added):
467    
468            With a message of "<FOO><ZYX><ZYX.42>fieldy-field-field</ZYX.42></ZYX></FOO>",
469            "ZYX[0]-1[0]-1-1" will be the key that ends up in props (see notes at
470            DatumPath.toString())
471    
472            So if you, coding for a future version of the FOO message but
473            receiving old-version message data, tried
474            props.getProperty((new DatumPath()).add("ZYX").add(0).add(42).add(0).add(1).toString()) 
475            with the message above (that is, trying to extract a repetition and
476            component that aren't there), you would get "ZYX[0]-42[0]-1-1" mapping to 
477            "fieldy-field-field" in the resulting props.  
478    
479            If the message was
480            "<FOO><ZYX><ZYX.42><ARG.1>component data</ARG.1></ZYX.42></ZYX></FOO>"
481            and you, coding for an old version of this FOO message but receiving
482            new-version FOO message data, tried 
483            props.getProperty((new DatumPath()).add("ZYX").add(0).add(42).toString())
484            you would get "ZYX[0]-42[0]-1-1" mapping to "component data" in the resulting 
485            props.
486    
487            msgMask lets you specify which parts of the message you want dumped to props.
488            Passing in null gets you everything.  Otherwise, msgMask's elements should
489            all be DatumPaths (! => ClassCastException), and a particular part of the
        message will be dumped to props only if its location, as represented by a
491            DatumPath, startsWith (as in DatumPath.startsWith()) at least one element of
492            msgMask.  So if one element of msgMask was a (new DatumPath()).add(new
        String("ZYX")), then everything in all ZYX segments would get dumped to props.
494            A (new DatumPath()).add(new String("ZYX")).add(1) would get only the first
495            repetitions of same (if there is one) dumped to props.  etc. etc.  Note that
496            a DatumPath of size() == 0 in msgMask will get you everything, no matter what
497            the other elements of msgMask are, because all DatumPaths startsWith the
498            zero-length DatumPath.
499    
        Segment group elements (eg. ADT_A01.PROCEDURE) are handled fine, but they
        aren't addressed in msgMask or in the output in props -- basically any
        element tags at the level immediately inside the message element, and having
        a name that starts with the message element name + '.', are ignored (meaning
        their contents are dealt with the same as if the start and end tags just
        weren't there.)
506            */
507            public static boolean parseMessage(Properties props, String message, 
508                            Collection msgMask) throws HL7Exception
509            {
510                    boolean ret = false;
511                    try {
512                            SAXParserFactory factory = SAXParserFactory.newInstance();
513                            SAXParser parser = factory.newSAXParser();
514    
515                            InputSource inSrc = new InputSource(new java.io.StringReader(message));
516    
517                            HL7MessageHandler handler = new HL7MessageHandler();
518                            handler.m_props = (props != null 
519                                    ? props : new Properties()); // it's expecting a props.
520    
521                            if(msgMask != null)
522                                    handler.m_msgMask = msgMask;
523                            else {
524                                    handler.m_msgMask = new Vector();
525                                    handler.m_msgMask.add(new DatumPath());
526                            }
527    
528                            parser.parse(inSrc, handler);
529                            ret = true;
530            } catch (ParserConfigurationException e) {
531                throw new HL7Exception(e);
532            } catch (IOException e) {
533                throw new HL7Exception(e);
534            } catch (StopParsingException e) {
535                throw new HL7Exception(e);
536            } catch (SAXException e) {
537                throw new HL7Exception(e);
538            }
539    
540                    return ret;
541            }
542    
543            public static void main(String args[]) 
544            {
545                    if(args.length >= 1) {
546                            Properties props = new Properties();
547                            Vector msgMask = new Vector();
548                            msgMask.add((new DatumPath()).add("MSH").add(0).add(9));
549                            //msgMask.add(new DatumPath());
550                            boolean parseret;
551                try {
552                    parseret = XML.parseMessage(props, args[0], msgMask);
553                    System.err.println("parseMessage returned " + parseret);
554                } catch (HL7Exception e) {
555                    e.printStackTrace();
556                }
557                            props.list(System.err);
558                    }
559            }
560    }
561