How do I extract all the links from a Facebook page? Can I use jsoup to extract them, and then pass the "like" links as parameters to fetch information about all the users who liked that particular page?
private static String readAll(Reader rd) throws IOException {
    // Read the stream character by character into a single string
    StringBuilder sb = new StringBuilder();
    int cp;
    while ((cp = rd.read()) != -1) {
        sb.append((char) cp);
    }
    return sb.toString();
}

public static JSONObject readurl(String url) throws IOException, JSONException {
    // Fetch the URL and parse the response body as JSON
    InputStream is = new URL(url).openStream();
    try {
        BufferedReader rd = new BufferedReader(
                new InputStreamReader(is, Charset.forName("UTF-8")));
        String jsonText = readAll(rd);
        return new JSONObject(jsonText);
    } finally {
        is.close();
    }
}

public static void main(String[] args) throws IOException, JSONException, FacebookException {
    System.out.println("\nEnter the search string:");
    @SuppressWarnings("resource")
    Scanner sc = new Scanner(System.in);
    String s = sc.nextLine();
    // Query the Graph API for the given object and print the raw JSON
    JSONObject json = readurl("https://graph.facebook.com/" + s);
    System.out.println(json);
}
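To then pull the users who liked something, the same readurl helper could in principle be pointed at the Graph API's likes edge. A minimal, untested sketch, assuming a valid access token (objectId and ACCESS_TOKEN below are placeholders, and whether the edge returns user data depends on the Graph API version and the permissions granted):

String objectId = "1234567890";          // placeholder: ID of a post or other likeable object
String ACCESS_TOKEN = "YOUR_TOKEN_HERE"; // placeholder: token obtained from a Facebook app
// Query the likes edge; readurl parses the JSON response
JSONObject likes = readurl("https://graph.facebook.com/" + objectId
        + "/likes?access_token=" + ACCESS_TOKEN);
System.out.println(likes);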
Can I modify and integrate the two pieces of code? The following code extracts all the links from a particular page. I tried combining it with the code above, but it doesn't work.
public static void main(String[] args) throws IOException {
    String url = "http://www.firstpost.com/tag/crime-in-india";
    // Fetch the page and collect every anchor tag
    Document doc = Jsoup.connect(url).get();
    Elements links = doc.getElementsByTag("a");
    System.out.println(links.size());
    for (Element link : links) {
        // Print the absolute URL followed by at most 35 characters of link text
        System.out.println(link.absUrl("href") + " " + trim(link.text(), 35));
    }
}

public static String trim(String s, int width) {
    if (s.length() > width)
        return s.substring(0, width - 1) + ".";
    else
        return s;
}
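One way to wire the two snippets together is to let jsoup collect the anchors and then hand any Facebook links to the readurl helper from the first snippet. A rough sketch under those assumptions (the host substitution below is illustrative; not every facebook.com path maps to a readable Graph API object):

// Sketch: collect links with jsoup, then try the Graph API for Facebook ones
Document doc = Jsoup.connect("https://www.facebook.com/SomePage").get(); // placeholder page
for (Element link : doc.getElementsByTag("a")) {
    String href = link.absUrl("href");
    if (href.startsWith("https://www.facebook.com/")) {
        String graphUrl = href.replace("https://www.facebook.com/",
                                       "https://graph.facebook.com/");
        try {
            System.out.println(readurl(graphUrl));
        } catch (IOException | JSONException e) {
            System.out.println("Could not read " + graphUrl); // not a Graph object
        }
    }
}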
Answer 0 (score: 1)
You could also try an alternative approach:
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.LinkedHashSet;
import java.util.Set;

import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;

public class URLExtractor {

    // Parser callback that records the href of every <a> tag it encounters
    private static class HTMLParserCallback extends HTMLEditorKit.ParserCallback {

        private final Set<String> urls;

        public HTMLParserCallback() {
            urls = new LinkedHashSet<String>(); // keeps first-seen order, no duplicates
        }

        public Set<String> getUrls() {
            return urls;
        }

        @Override
        public void handleSimpleTag(Tag t, MutableAttributeSet a, int pos) {
            handleTag(t, a, pos);
        }

        @Override
        public void handleStartTag(Tag t, MutableAttributeSet a, int pos) {
            handleTag(t, a, pos);
        }

        private void handleTag(Tag t, MutableAttributeSet a, int pos) {
            if (t == Tag.A) {
                Object href = a.getAttribute(HTML.Attribute.HREF);
                if (href != null) {
                    urls.add(href.toString()); // the Set already ignores duplicates
                }
            }
        }
    }

    public static void main(String[] args) throws IOException {
        InputStream is = null;
        try {
            String u = "https://www.facebook.com/";
            URL url = new URL(u);
            is = url.openStream(); // throws an IOException
            HTMLParserCallback cb = new HTMLParserCallback();
            new ParserDelegator().parse(new BufferedReader(new InputStreamReader(is)), cb, true);
            for (String aUrl : cb.getUrls()) {
                System.out.println("Found URL: " + aUrl);
            }
        } catch (MalformedURLException mue) {
            mue.printStackTrace();
        } catch (IOException ioe) {
            ioe.printStackTrace();
        } finally {
            if (is != null) { // guard against the stream never having been opened
                try {
                    is.close();
                } catch (IOException ioe) {
                    // nothing to see here
                }
            }
        }
    }
}
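The callback stores href values exactly as they appear in the markup, so many of them will be relative. If absolute links are needed, each one can be resolved against the page URL with java.net.URL; a small illustrative variant of the print loop above:

// Resolve each collected href against the page it came from
URL base = new URL("https://www.facebook.com/");
for (String aUrl : cb.getUrls()) {
    try {
        System.out.println("Found URL: " + new URL(base, aUrl));
    } catch (MalformedURLException e) {
        // e.g. "javascript:" pseudo-links cannot be resolved
        System.out.println("Skipping unresolvable href: " + aUrl);
    }
}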
Answer 1 (score: 1)
This kind of works, but I'm not sure you can do this with jsoup; I would rather look at casperjs or phantomjs.
import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

public class GetFacebookLinks {

    // Fetch the page, collect every element with the given tag,
    // then keep only those matching the CSS selector
    public static Elements getElementsByTag_then_FilterBySelector(String tag, String httplink, String selector) {
        Document doc;
        try {
            doc = Jsoup.connect(httplink).get();
        } catch (IOException e) {
            e.printStackTrace();
            return new Elements(); // avoid an NPE when the fetch fails
        }
        Elements links = doc.getElementsByTag(tag);
        return links.select(selector);
    }

    // Test functionality
    public static void main(String[] args) {
        // The class name for the like links on Facebook is UFILikeLink
        Elements likeLinks = getElementsByTag_then_FilterBySelector("a", "http://www.facebook.com", ".UFILikeLink");
        System.out.println(likeLinks);
    }
}
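Note that facebook.com renders most of its interface client-side with JavaScript, so the static HTML that jsoup downloads will usually contain few or no .UFILikeLink elements. That is why a headless browser such as casperjs or phantomjs (as suggested above), or the Graph API approach from the question, tends to be the more reliable route.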