1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.tika.parser;
18
19 import java.io.IOException;
20 import java.io.InputStream;
21
22 import org.apache.tika.exception.TikaException;
23 import org.apache.tika.metadata.Metadata;
24 import org.apache.tika.sax.BodyContentHandler;
25 import org.apache.tika.sax.TeeContentHandler;
26 import org.apache.tika.utils.RegexUtils;
27 import org.xml.sax.ContentHandler;
28 import org.xml.sax.SAXException;
29
30
31
32
33
34
35
36 public class ParserPostProcessor extends ParserDecorator {
37
38
39
40
41
42
43 public ParserPostProcessor(Parser parser) {
44 super(parser);
45 }
46
47
48
49
50
51 public void parse(
52 InputStream stream, ContentHandler handler,
53 Metadata metadata, ParseContext context)
54 throws IOException, SAXException, TikaException {
55 ContentHandler body = new BodyContentHandler();
56 ContentHandler tee = new TeeContentHandler(handler, body);
57 super.parse(stream, tee, metadata, context);
58
59 String content = body.toString();
60 metadata.set("fulltext", content);
61
62 int length = Math.min(content.length(), 500);
63 metadata.set("summary", content.substring(0, length));
64
65 for (String link : RegexUtils.extractLinks(content)) {
66 metadata.add("outlinks", link);
67 }
68 }
69
70 }