我有一个文本文档,其中可能有50个网址,我想要页面标题。 I've been trying to use some code that does do that,但编译器似乎在扫描器与字符串参数上绊倒。理想情况下,我想接收URL的文本文档并将相应的页面标题转储到另一个文档。
import java.io.*;
import java.util.*;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
//code from: http://www.gotoquiz.com/web-coding/programming/java-programming/how-to-extract-titles-from-web-pages-in-java/
public class TitleExtractor {
/* the CASE_INSENSITIVE flag accounts for
* sites that use uppercase title tags.
* the DOTALL flag accounts for sites that have
* line feeds in the title text */
private static final Pattern TITLE_TAG =
Pattern.compile("\\<title>(.*)\\</title>", Pattern.CASE_INSENSITIVE|Pattern.DOTALL);
/**
* @param x the HTML page
* @return title text (null if document isn't HTML or lacks a title tag)
* @throws IOException
*/
public static String getPageTitle(String url) throws IOException {
URL u = new URL(url);
URLConnection conn = u.openConnection();
// ContentType is an inner class defined below
ContentType contentType = getContentTypeHeader(conn);
if (!contentType.contentType.equals("text/html"))
return null; // don't continue if not HTML
else {
// determine the charset, or use the default
Charset charset = getCharset(contentType);
if (charset == null)
charset = Charset.defaultCharset();
// read the response body, using BufferedReader for performance
InputStream in = conn.getInputStream();
BufferedReader reader = new BufferedReader(new InputStreamReader(in, charset));
int n = 0, totalRead = 0;
char[] buf = new char[1024];
StringBuilder content = new StringBuilder();
// read until EOF or first 8192 characters
while (totalRead < 8192 && (n = reader.read(buf, 0, buf.length)) != -1) {
content.append(buf, 0, n);
totalRead += n;
}
reader.close();
// extract the title
Matcher matcher = TITLE_TAG.matcher(content);
if (matcher.find()) {
/* replace any occurrences of whitespace (which may
* include line feeds and other uglies) as well
* as HTML brackets with a space */
return matcher.group(1).replaceAll("[\\s\\<>]+", " ").trim();
}
else
return null;
}
}
/**
* Loops through response headers until Content-Type is found.
* @param conn
* @return ContentType object representing the value of
* the Content-Type header
*/
private static ContentType getContentTypeHeader(URLConnection conn) {
int i = 0;
boolean moreHeaders = true;
do {
String headerName = conn.getHeaderFieldKey(i);
String headerValue = conn.getHeaderField(i);
if (headerName != null && headerName.equals("Content-Type"))
return new ContentType(headerValue);
i++;
moreHeaders = headerName != null || headerValue != null;
}
while (moreHeaders);
return null;
}
private static Charset getCharset(ContentType contentType) {
if (contentType != null && contentType.charsetName != null && Charset.isSupported(contentType.charsetName))
return Charset.forName(contentType.charsetName);
else
return null;
}
/**
* Class holds the content type and charset (if present)
*/
private static final class ContentType {
private static final Pattern CHARSET_HEADER = Pattern.compile("charset=([-_a-zA-Z0-9]+)", Pattern.CASE_INSENSITIVE|Pattern.DOTALL);
private String contentType;
private String charsetName;
private ContentType(String headerValue) {
if (headerValue == null)
throw new IllegalArgumentException("ContentType must be constructed with a not-null headerValue");
int n = headerValue.indexOf(";");
if (n != -1) {
contentType = headerValue.substring(0, n);
Matcher matcher = CHARSET_HEADER.matcher(headerValue);
if (matcher.find())
charsetName = matcher.group(1);
}
else
contentType = headerValue;
}
}
public class readfile{
private Scanner x;
public void openFile(){
try{
x = new Scanner(new File("urls.txt"));
}
catch(Exception e){
System.out.println("could not find file");
}
}
public void readFile() throws IOException{
while(x.hasNext()){
//String title = TitleExtractor.getPageTitle(x);
String a = x.next();
String title = TitleExtractor.getPageTitle(a);
System.out.printf("%s\n",title);
}
}
public void closeFile(){
x.close();
}
}
public static void main(String[] args) throws IOException{
readfile r = new readfile();
r.openFile();
r.readFile();
r.closeFile();
}
}
我在readfile上遇到编译错误r = new readfile();位:
&#34;无法访问类型为TitleExtractor的封闭实例。必须使用TitleExtractor&#34;
类型的封闭实例限定分配我不太确定代码在扫描程序和字符串变量之间的混淆程度。
答案 0 :(得分:0)
readfile
是一个非静态内部类,因此无需引用TitleExtractor
的实例即可实例化它。最简单的解决方案是将类声明更改为public static class readfile
。