View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.utils;
18  
19  import java.util.ArrayList;
20  import java.util.Collections;
21  import java.util.List;
22  import java.util.regex.Matcher;
23  import java.util.regex.Pattern;
24  
/**
 * Regular-expression helpers for extracting content from plain text.
 * Inspired by the {@code OutlinkExtractor} class in Nutch.
 */
public class RegexUtils {

    /**
     * Regex pattern to get URLs within a plain text.
     * <p>
     * A match consists of a scheme (a letter followed by 1 to 120 letters,
     * digits, {@code +}, {@code .} or {@code -}), a colon, one letter, digit
     * or slash, then 1 to 333 URL characters or {@code %XX} escapes, and
     * finally an optional {@code #fragment}.
     *
     * @see <a
     *      href="http://www.truerwords.net/articles/ut/urlactivation.html">http://www.truerwords.net/articles/ut/urlactivation.html
     *      </a>
     */
    private static final String LINKS_REGEX =
        "([A-Za-z][A-Za-z0-9+.-]{1,120}:"
        + "[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}"
        + "(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)";

    /**
     * Compiled once and shared by every call. The flags are a bit mask, so
     * they are combined with bitwise OR rather than arithmetic addition.
     */
    private static final Pattern LINKS_PATTERN =
        Pattern.compile(LINKS_REGEX, Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);

    /**
     * Extracts URLs from plain text.
     *
     * @param content the plain text content to examine; may be {@code null}
     * @return the URLs found in the text, in order of appearance and
     *         including duplicates; an empty list when {@code content} is
     *         {@code null} or empty. The returned list is always a new,
     *         modifiable list, never {@code null}.
     */
    public static List<String> extractLinks(String content) {
        // Always hand back the same kind of list (a fresh ArrayList) so that
        // callers can treat the "no input" and "has input" cases identically.
        List<String> extractions = new ArrayList<String>();
        if (content == null || content.isEmpty()) {
            return extractions;
        }

        final Matcher matcher = LINKS_PATTERN.matcher(content);
        while (matcher.find()) {
            extractions.add(matcher.group());
        }
        return extractions;
    }
}