我正在尝试创建一个程序,用于在文本文件中查找任何类型的URL,例如hxxp://www.testsite.com/images/logo.png。下面的代码是我尝试增加在线教程,(主要是在我的代码之上引用。)但是我无法捕获任何/所有URL,包括嵌入在html标记内的URL。我会感谢任何有关我可以尝试的帮助或建议。谢谢。
/* Reference: http://www.vogella.com/tutorials/JavaRegularExpressions/article.html
*/
package de.vogella.regex.weblinks;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class LinkGetter {
//variables
private Pattern htmltag;
private Pattern link;
public LinkGetter() {
//Patterns.
htmltag = Pattern.compile("<a\\b[^>]*href=\"[^>]*>(.*?)</a>");
link = Pattern.compile(
"((https?|ftp|gopher|telnet|file):((//)|(\\\\))+[\\w\\d:#@%/;$()~_?\\+-=\\\\\\.&]*)");
}
public static void main(String[] args){
String filepath ="TestFile.rtf";
System.out.println(new LinkGetter().getLinks(filepath));
}
public List<String> getLinks(String filepath) {
List<String> links = new ArrayList<String>();
try {
FileReader FR = new FileReader("TestFile.rtf");
BufferedReader bufferedReader = new BufferedReader(FR);
String A;
StringBuilder builder = new StringBuilder();
while ((A = bufferedReader.readLine()) != null) {
builder.append(A);
}
Matcher tagmatch = htmltag.matcher(builder.toString());
while (tagmatch.find()) {
Matcher matcher = link.matcher(tagmatch.group());
matcher.find();
String link = matcher.group().replaceFirst("href=\"", "")
.replaceFirst("\">", "")
.replaceFirst("\"[\\s]?target=\"[a-zA-Z_0-9]*", "");
if (valid(link)) {
links.add(makeAbsolute(filepath, link));
}
bufferedReader.close();
}
} catch (MalformedURLException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
return links;
}
private boolean valid(String A) {
if (A.matches("javascript:.*|mailto:.*")) {
return false;
}
return true;
}
private String makeAbsolute(String filepath, String link) {
if (link.matches("http://.*")) {
return link;
}
else if (link.matches("/.*") || link.matches(".*$[^/]")) {
return "/" + link;
throw new RuntimeException("Cannot make absolute. File: " + filepath
+ " Link " + link);
}
}
答案 0 :(得分:1)
您可以尝试:
(https?|ftp|gopher|telnet|file):\/\/([\w\-_]+(?:(?:\.[\w\-_]+)+))([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?
答案 1 :(得分:0)
我在我的一个旧项目中使用过这个。应该工作。
String regex="\\s*(?i)href\\s*=\\s*(\"([^\"]*\")|'[^']*'|([^'\">\\s]+))";
Pattern pattern = Pattern.compile(regex);