1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.tika.parser.odf;
18
19 import org.apache.tika.metadata.Metadata;
20 import org.apache.tika.parser.xml.AttributeDependantMetadataHandler;
21 import org.apache.tika.parser.xml.DcXMLParser;
22 import org.apache.tika.parser.xml.MetadataHandler;
23 import org.apache.tika.sax.TeeContentHandler;
24 import org.apache.tika.sax.xpath.CompositeMatcher;
25 import org.apache.tika.sax.xpath.Matcher;
26 import org.apache.tika.sax.xpath.MatchingContentHandler;
27 import org.apache.tika.sax.xpath.XPathParser;
28 import org.xml.sax.ContentHandler;
29
30
31
32
33 public class OpenDocumentMetaParser extends DcXMLParser {
34
35 private static final XPathParser META_XPATH = new XPathParser(
36 "meta", "urn:oasis:names:tc:opendocument:xmlns:meta:1.0");
37
38 public static final String USER_DEFINED_METADATA_NAME_PREFIX = "custom:";
39
40 private static ContentHandler getMeta(
41 ContentHandler ch, Metadata md, String name, String element) {
42 Matcher matcher = new CompositeMatcher(
43 META_XPATH.parse("//meta:" + element),
44 META_XPATH.parse("//meta:" + element + "//text()"));
45 ContentHandler branch =
46 new MatchingContentHandler(new MetadataHandler(md, name), matcher);
47 return new TeeContentHandler(ch, branch);
48 }
49
50 private static ContentHandler getUserDefined(
51 ContentHandler ch, Metadata md) {
52 Matcher matcher = new CompositeMatcher(
53 META_XPATH.parse("//meta:user-defined/@meta:name"),
54 META_XPATH.parse("//meta:user-defined//text()"));
55 ContentHandler branch = new MatchingContentHandler(
56 new AttributeDependantMetadataHandler(md, "meta:name", USER_DEFINED_METADATA_NAME_PREFIX),
57 matcher);
58 return new TeeContentHandler(ch, branch);
59 }
60
61 private static ContentHandler getStatistic(
62 ContentHandler ch, Metadata md, String name, String attribute) {
63 Matcher matcher =
64 META_XPATH.parse("//meta:document-statistic/@meta:" + attribute);
65 ContentHandler branch =
66 new MatchingContentHandler(new MetadataHandler(md, name), matcher);
67 return new TeeContentHandler(ch, branch);
68 }
69
70 protected ContentHandler getContentHandler(ContentHandler ch, Metadata md) {
71
72 ch = super.getContentHandler(ch, md);
73
74 ch = getMeta(ch, md, Metadata.CREATION_DATE, "creation-date");
75 ch = getMeta(ch, md, Metadata.KEYWORDS, "keyword");
76 ch = getMeta(ch, md, Metadata.EDIT_TIME, "editing-duration");
77 ch = getMeta(ch, md, "editing-cycles", "editing-cycles");
78 ch = getMeta(ch, md, "initial-creator", "initial-creator");
79 ch = getMeta(ch, md, "generator", "generator");
80
81 ch = getUserDefined(ch, md);
82
83 ch = getStatistic(ch, md, "nbTab", "table-count");
84 ch = getStatistic(ch, md, "nbObject", "object-count");
85 ch = getStatistic(ch, md, "nbImg", "image-count");
86 ch = getStatistic(ch, md, "nbPage", "page-count");
87 ch = getStatistic(ch, md, "nbPara", "paragraph-count");
88 ch = getStatistic(ch, md, "nbWord", "word-count");
89 ch = getStatistic(ch, md, "nbCharacter", "character-count");
90 ch = new NSNormalizerContentHandler(ch);
91 return ch;
92 }
93
94 }