import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
/**
 * Downloads an osu! profile page with jsoup and prints the bold text found in
 * each {@code div.profileStatLine} element.
 */
public class Main {
    /**
     * Fetches the profile page and prints one line of output per stat line.
     *
     * <p>NOTE(review): jsoup does not execute JavaScript. If the stat lines are
     * injected by a script after page load, the fetched HTML will not contain
     * them and this loop prints nothing.
     *
     * @param args unused
     * @throws Exception if the page cannot be fetched or parsed
     */
    public static void main(String[] args) throws Exception {
        Document d = Jsoup.connect("https://osu.ppy.sh/u/charless").get();
        for (Element line : d.select("div.profileStatLine")) {
            // Select within the current stat line. Querying the whole document
            // here would print every <b> on the page once per iteration.
            System.out.println(line.select("b").text());
        }
    }
}
我在获取文本 "2027pp (#97,094)" 时遇到了问题,它位于 div.profileStatLine 内的 b 元素中。上面的代码应该输出这段文本,但实际上没有输出。网址:https://osu.ppy.sh/u/charless
答案 0 :(得分:1)
页面的某些部分是通过 JavaScript 加载的,这就是为什么你看不到要找的 div。
您可以先用浏览器加载页面并执行其中的 JavaScript,然后再进行解析。像 webdrivermanager 这样的库会有所帮助。
/**
 * Loads the profile page in a real Chrome instance, hands the resulting page
 * source to jsoup, and prints the bold text of each
 * {@code div.profileStatLine} element.
 *
 * @param args unused
 * @throws Exception if the driver cannot be set up or the page cannot be loaded
 */
public static void main(String[] args) throws Exception {
    ChromeDriverManager.getInstance().setup();
    ChromeDriver chromeDriver = new ChromeDriver();
    Document d;
    try {
        chromeDriver.get("https://osu.ppy.sh/u/charless");
        d = Jsoup.parse(chromeDriver.getPageSource());
    } finally {
        // quit() ends the whole driver session, and the finally block makes
        // sure the browser is shut down even when loading the page fails.
        chromeDriver.quit();
    }
    for (Element line : d.select("div.profileStatLine")) {
        System.out.println(line.select("b").text());
    }
}
另一种方法是查看页面中的 JavaScript,并直接发起与它相同的请求来获取数据。
该页面是从 https://osu.ppy.sh/pages/include/profile-general.php?u=4084042&m=0 加载个人资料的。看起来参数 u 只是用户 ID,从页面中提取它相对简单:
/**
 * Prints the stat lines of an osu! profile by requesting the
 * {@code profile-general.php} include page directly, after looking up the
 * numeric user id on the public profile page.
 */
public class ProfileScraper {
    /** Matches the inline-script assignment that carries the numeric user id. */
    private static final Pattern UID_PATTERN = Pattern.compile("var userId = (\\d+);");

    /**
     * Resolves the id of the user {@code charless} and prints the bold text of
     * every {@code div.profileStatLine} on that user's stats page.
     *
     * @param args unused
     * @throws IOException if a page cannot be fetched or no user id is found
     */
    public static void main(String[] args) throws IOException {
        final String userId = getUid("charless");
        final String statsUrl = "https://osu.ppy.sh/pages/include/profile-general.php?u=" + userId;
        final Document statsPage = Jsoup.connect(statsUrl).get();
        for (Element statLine : statsPage.select("div.profileStatLine")) {
            final String boldText = statLine.select("b").text();
            System.out.println(boldText);
        }
    }

    /**
     * Looks up the numeric user id embedded in a profile page's scripts.
     *
     * @param name the profile name appended to {@code https://osu.ppy.sh/u/}
     * @return the first run of digits captured by {@link #UID_PATTERN}
     * @throws IOException if the page cannot be fetched, or if no script on
     *     the page matches the pattern
     */
    public static String getUid(String name) throws IOException {
        final String profileUrl = "https://osu.ppy.sh/u/" + name;
        final Document profilePage = Jsoup.connect(profileUrl).get();
        for (Element scriptTag : profilePage.select("script")) {
            final Matcher idMatcher = UID_PATTERN.matcher(scriptTag.data());
            if (!idMatcher.find()) {
                // This script does not declare the user id; try the next one.
                continue;
            }
            return idMatcher.group(1);
        }
        throw new IOException("No such character");
    }
}