从Google Scholar下载参考文献列表

时间:2017-06-13 11:19:15

标签: java web-scraping google-scholar

我已将一篇研究论文中的参考文献存储在一个列表中(如下所示),并想从 Google Scholar(谷歌学术)下载它们。目前我已经能够通过下面的网址成功下载单篇论文。我现在需要做的是:由于该研究论文的所有参考文献都已存储在列表中(列表中有 15 条参考文献,这意味着其中至少有 5 条参考文献有可用的 PDF),我想针对列表中的每一条参考文献在 Google 学术搜索上进行检索,并下载可用的参考文献 PDF。如果某条参考文献没有可用的 PDF,则必须显示"Pdf不可用"。我已在下面贴出下载单篇论文的代码,但不知道如何修改代码,使其能够遍历列表并下载多篇论文。

 public static void main(String[] args) throws IOException {
               Scanner s = new Scanner(new File("D:\\ref.txt"));

ArrayList<String> list = new ArrayList<String>();
while (s.hasNextLine()){
        for (String Z : list)

//System.out.println("LISTZ:" +list);
s.close();//LIST completed

        //code to download the paper from scholar
try {
   //var a= doc.replace(" ","+");

    Document doc = Jsoup
            .userAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36")

    String title = doc.title();
    System.out.println("title : " + title);

    Elements links = doc.select("div.gs_ggsd").select("a[href]");
    //Element  = doc.select("div.gs_ggs gs_fl").first();

    for (Element link : links) {
        //System.out.println("\nlink : " + link.attr("href"));
        URL website = new URL(link.attr("href"));
ReadableByteChannel rbc = Channels.newChannel(website.openStream());
FileOutputStream fos = new FileOutputStream("D:\\paper.pdf");
fos.getChannel().transferFrom(rbc, 0, Long.MAX_VALUE);
       // System.out.println("text : " + link.text());
  /* ByteArrayOutputStream href = new ByteArrayOutputStream();
PrintStream PS = new PrintStream(href);
PrintStream old = System.out;
System.out.println("Here: " + href.toString());*/

    catch (IOException e) {



import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.URL;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import java.util.ArrayList;
import java.util.Scanner;

/**
 * User: lihongxu
 * Date: 17/6/13
 * Time: 19:42
 * Comments
 */
public class Test {

    public static void main(String[] args) throws IOException {
        Scanner s = new Scanner(new File("D:\\ref.txt"));

        ArrayList<String> list = new ArrayList<String>();
        while (s.hasNextLine()) {
                for (String Z : list) {

        s.close();// LIST completed

        // code to download the paper from scholar
        for (String query : list) {
            try {
                Document doc = Jsoup
                        .connect("https://scholar.google.com.pk/scholar?q=" + query)
                        .userAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like " +
                                "Gecko)" +
                                " Chrome/33.0.1750.152 Safari/537.36")

                String title = doc.title();
                System.out.println("title : " + title);

                Elements links = doc.select("div.gs_ggsd").select("a[href]");
                //Element  = doc.select("div.gs_ggs gs_fl").first();

                for (Element link : links) {
                    //System.out.println("\nlink : " + link.attr("href"));
                    URL website = new URL(link.attr("href"));
                    ReadableByteChannel rbc = Channels.newChannel(website.openStream());
                    FileOutputStream fos = new FileOutputStream("D:\\paper.pdf");
                    fos.getChannel().transferFrom(rbc, 0, Long.MAX_VALUE);
                // System.out.println("text : " + link.text());
            } catch (IOException e) {