我正在尝试从给定的 String 中提取 URL,该字符串是一段包含 href 属性的 HTTP 响应(HTML)。我已经能够定位到链接的开头,但还需要在 href 属性值结束的位置截断字符串。如何实现这一目标?
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.Closeable;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

/**
 * Downloads a fixed Wikipedia page, extracts the value of every {@code href="/wiki..."}
 * attribute from it, and appends each distinct link to a text file (one link per line)
 * while echoing it to standard output.
 *
 * <p>The extraction is a plain string scan over each line of the response. It only
 * recognises double-quoted {@code href} values; a real HTML parser is more robust.
 */
public class Extracturl {

    /** Text that opens a double-quoted href attribute value. */
    private static final String HREF_OPEN = "href=\"";

    /** Only links whose value starts with this prefix are kept. */
    private static final String WIKI_PREFIX = "/wiki";

    /** Links whose value contains this marker (e.g. {@code /wiki/File:...}) are skipped. */
    private static final String EXCLUDED_MARKER = "File";

    /**
     * Fetches the page and writes every distinct wiki link to {@code e:\test.txt}.
     *
     * @param args unused
     * @throws IOException declared for compatibility; I/O failures are caught and
     *         reported with a stack trace
     */
    public static void main(String[] args) throws IOException {
        String u = "http://en.wikipedia.org/wiki/china";
        String fileName = "e:\\test.txt";
        BufferedWriter writer = null;
        BufferedReader reader = null;
        try {
            // Append mode: links from earlier runs are kept in the file.
            writer = new BufferedWriter(new FileWriter(fileName, true));
            URL url = new URL(u);
            // Decode explicitly as UTF-8 instead of the deprecated, byte-truncating
            // DataInputStream.readLine().
            reader = new BufferedReader(new InputStreamReader(url.openStream(), "UTF-8"));

            // Exact-match de-duplication; substring matching on a concatenated string
            // would wrongly drop "/wiki/Chin" after "/wiki/China" had been seen.
            Set<String> seen = new HashSet<String>();
            String line;
            while ((line = reader.readLine()) != null) {
                for (String link : extractLinks(line)) {
                    if (seen.add(link)) {
                        System.out.println(link);
                        // Write only the new link, not everything collected so far.
                        writer.write(link);
                        writer.newLine();
                    }
                }
            }
        } catch (MalformedURLException mue) {
            mue.printStackTrace();
        } catch (IOException ioe) {
            ioe.printStackTrace();
        } finally {
            close(reader);
            // Closing the writer flushes the buffered links to disk.
            close(writer);
        }
    }

    /**
     * Returns the value of every double-quoted {@code href} attribute in {@code line}
     * that starts with {@code /wiki} and does not contain {@code File}.
     *
     * <p>Each value ends at the first {@code "} after {@code href="}, so nothing that
     * follows the attribute (title, closing tag, rest of the line) is included.
     *
     * @param line one line of HTML; may be {@code null}
     * @return the matching link values in order of appearance; empty if there are none,
     *         if {@code line} is {@code null}, or if an attribute is missing its closing quote
     */
    static List<String> extractLinks(String line) {
        List<String> links = new ArrayList<String>();
        if (line == null) {
            return links;
        }
        int searchFrom = 0;
        while (true) {
            int open = line.indexOf(HREF_OPEN, searchFrom);
            if (open < 0) {
                break;
            }
            int valueStart = open + HREF_OPEN.length();
            int valueEnd = line.indexOf('"', valueStart);
            if (valueEnd < 0) {
                // Unterminated attribute: no closing quote on this line, nothing to cut at.
                break;
            }
            String value = line.substring(valueStart, valueEnd);
            if (value.startsWith(WIKI_PREFIX) && !value.contains(EXCLUDED_MARKER)) {
                links.add(value);
            }
            searchFrom = valueEnd + 1;
        }
        return links;
    }

    /** Closes {@code resource} if it was opened, reporting (not hiding) a close failure. */
    private static void close(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ioe) {
            ioe.printStackTrace();
        }
    }
}
我甚至尝试过
// NOTE(review): String.substring(begin, end) throws StringIndexOutOfBoundsException when
// indexOf("\">") returns -1 (pattern absent) or an index before the href position --
// presumably the error mentioned below; confirm against the actual stack trace.
w=w+line.substring(line.indexOf("href=\"/"),line.indexOf("\">"));
但这给了我错误。
我的目标是获取从该页面链接的所有网址。
答案 0 :(得分:4)
这种需求应该使用 HTML 解析器来完成。下面是使用 Java 内置(Swing 自带)HTML 解析器的示例。也有其他替代方案,例如 JSoup,但对于基本的 HTML 处理,内置解析器已经足够好用:
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.LinkedHashSet;
import java.util.Set;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;
/**
 * Prints every distinct {@code href} value of the {@code <a>} tags found on a fixed
 * Wikipedia page, using the HTML parser that ships with Swing.
 */
public class URLExtractor {

    /**
     * Parser callback that collects the {@code href} attribute of every {@code <a>} tag
     * it is notified about, keeping first-seen order and dropping duplicates.
     */
    private static class HTMLPaserCallBack extends HTMLEditorKit.ParserCallback {

        /** Insertion-ordered so links are reported in the order they were encountered. */
        private final Set<String> urls = new LinkedHashSet<String>();

        /** @return the links collected so far, in first-seen order */
        public Set<String> getUrls() {
            return urls;
        }

        @Override
        public void handleSimpleTag(Tag t, MutableAttributeSet a, int pos) {
            handleTag(t, a, pos);
        }

        @Override
        public void handleStartTag(Tag t, MutableAttributeSet a, int pos) {
            handleTag(t, a, pos);
        }

        /** Records the {@code href} of an anchor tag; every other tag is ignored. */
        private void handleTag(Tag t, MutableAttributeSet a, int pos) {
            if (t != Tag.A) {
                return;
            }
            Object href = a.getAttribute(HTML.Attribute.HREF);
            if (href != null) {
                // Set.add already ignores duplicates, so no contains() check is needed.
                urls.add(href.toString());
            }
        }
    }

    /**
     * Parses HTML from {@code in} and returns the {@code href} of every anchor tag.
     *
     * <p>The bytes are decoded as UTF-8 rather than the platform default charset. The
     * parser is invoked with {@code ignoreCharSet = true}, the same flag the original
     * call used. The stream is not closed by this method.
     *
     * @param in the HTML document to scan
     * @return the distinct link values in order of first appearance
     * @throws IOException if reading from {@code in} fails
     */
    static Set<String> extractUrls(InputStream in) throws IOException {
        HTMLPaserCallBack cb = new HTMLPaserCallBack();
        new ParserDelegator().parse(new BufferedReader(new InputStreamReader(in, "UTF-8")), cb, true);
        return cb.getUrls();
    }

    /**
     * Downloads the page and prints one {@code Found URL: ...} line per distinct link.
     *
     * @param args unused
     * @throws IOException declared for compatibility; I/O failures are caught and
     *         reported with a stack trace
     */
    public static void main(String[] args) throws IOException {
        InputStream is = null;
        try {
            String u = "http://en.wikipedia.org/wiki/china";
            URL url = new URL(u);
            is = url.openStream(); // throws an IOException
            for (String aUrl : extractUrls(is)) {
                System.out.println("Found URL: " + aUrl);
            }
        } catch (MalformedURLException mue) {
            mue.printStackTrace();
        } catch (IOException ioe) {
            ioe.printStackTrace();
        } finally {
            // 'is' is still null when new URL(...) or openStream() failed; closing it
            // unconditionally would throw a NullPointerException that hides that error.
            if (is != null) {
                try {
                    is.close();
                } catch (IOException ioe) {
                    ioe.printStackTrace();
                }
            }
        }
    }
}