1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.tika.parser.xml;
18
19 import org.apache.tika.metadata.DublinCore;
20 import org.apache.tika.metadata.Metadata;
21 import org.apache.tika.sax.TeeContentHandler;
22 import org.apache.tika.sax.xpath.CompositeMatcher;
23 import org.apache.tika.sax.xpath.Matcher;
24 import org.apache.tika.sax.xpath.MatchingContentHandler;
25 import org.apache.tika.sax.xpath.XPathParser;
26 import org.xml.sax.ContentHandler;
27
28
29
30
31 public class DcXMLParser extends XMLParser {
32
33 private static final XPathParser DC_XPATH = new XPathParser(
34 "dc", "http://purl.org/dc/elements/1.1/");
35
36 private static ContentHandler getDublinCore(
37 ContentHandler ch, Metadata md, String name, String element) {
38 Matcher matcher = new CompositeMatcher(
39 DC_XPATH.parse("//dc:" + element),
40 DC_XPATH.parse("//dc:" + element + "//text()"));
41 ContentHandler branch =
42 new MatchingContentHandler(new MetadataHandler(md, name), matcher);
43 return new TeeContentHandler(ch, branch);
44 }
45
46 protected ContentHandler getContentHandler(ContentHandler ch, Metadata md) {
47 ch = super.getContentHandler(ch, md);
48 ch = getDublinCore(ch, md, DublinCore.TITLE, "title");
49 ch = getDublinCore(ch, md, DublinCore.SUBJECT, "subject");
50 ch = getDublinCore(ch, md, DublinCore.CREATOR, "creator");
51 ch = getDublinCore(ch, md, DublinCore.DESCRIPTION, "description");
52 ch = getDublinCore(ch, md, DublinCore.PUBLISHER, "publisher");
53 ch = getDublinCore(ch, md, DublinCore.CONTRIBUTOR, "contributor");
54 ch = getDublinCore(ch, md, DublinCore.DATE, "date");
55 ch = getDublinCore(ch, md, DublinCore.TYPE, "type");
56 ch = getDublinCore(ch, md, DublinCore.FORMAT, "format");
57 ch = getDublinCore(ch, md, DublinCore.IDENTIFIER, "identifier");
58 ch = getDublinCore(ch, md, DublinCore.LANGUAGE, "language");
59 ch = getDublinCore(ch, md, DublinCore.RIGHTS, "rights");
60 return ch;
61 }
62
63 }