/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.utils;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Regular-expression helpers for pulling content out of plain text.
 * Inspired by the Nutch class OutlinkExtractor.
 */
public class RegexUtils {

    /**
     * Regex pattern to get URLs within a plain text.
     * <p>
     * The expression is made of three parts: a scheme followed by a colon,
     * a body of URL characters or percent-encoded octets (1 to 333
     * repetitions), and an optional <code>#fragment</code>.
     *
     * @see <a
     *      href="http://www.truerwords.net/articles/ut/urlactivation.html">http://www.truerwords.net/articles/ut/urlactivation.html
     *      </a>
     */
    private static final String LINKS_REGEX =
            "([A-Za-z][A-Za-z0-9+.-]{1,120}:"
                    + "[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}"
                    + "(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)";

    /**
     * Compiled once and shared by every call. The match flags form a bit
     * mask, so they are combined with bitwise OR rather than addition.
     */
    private static final Pattern LINKS_PATTERN =
            Pattern.compile(LINKS_REGEX, Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);

    /**
     * Extract URLs from plain text.
     *
     * @param content The plain text content to examine; may be
     *                <code>null</code> or empty
     * @return List of the URLs found in the plain text, in the order they
     *         occur. For <code>null</code> or empty input this is the shared
     *         immutable empty list; otherwise it is a new
     *         <code>ArrayList</code>, which is empty when nothing matched.
     */
    public static List<String> extractLinks(String content) {
        if (content == null || content.isEmpty()) {
            return Collections.emptyList();
        }

        List<String> extractions = new ArrayList<String>();
        final Matcher matcher = LINKS_PATTERN.matcher(content);
        while (matcher.find()) {
            // group() is the entire match: scheme, body and optional fragment.
            extractions.add(matcher.group());
        }
        return extractions;
    }
}