如何根据日期抓取数据

时间:2016-06-18 08:54:48

标签: java web-scraping jsoup

我创建了一个从网站上抓取数据的网络爬虫。问题是,该网站既可以显示当天的数据,也可以显示整个财政年度的数据,而我的爬虫只能获取当天的数据,无法获取全年的数据。例如,如果我想检索从 '01-July-2015' 到今天的数据,我的爬虫仍然只能获取当天的数据。
链接:http://www.nccpl.com.pk/market-information/fipi-lipi/fipi 下面是包含待抓取数据的页面截图。(图片:enter image description here)以下是我的代码:

package nccpl_fipi_yearly;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.Locale;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Downloads the NCCPL FIPI page and exports every table row that contains
 * {@code <td>} cells to a pipe-delimited text file.
 *
 * <p>Typical use is {@link #createConnection()} followed by
 * {@link #parsingHTML()}; {@link #main(String[])} does exactly that.
 *
 * <p>All state is held in static fields and the date formatters are
 * {@link SimpleDateFormat} instances, which are not thread-safe, so this
 * class is meant to be driven from a single thread.
 */
public class Nccpl_fipi_yearly {
    // Output switches declared by the original author; nothing in this class reads them.
    boolean writeCSVToConsole = true;
    boolean writeCSVToFile = true;
    boolean sortTheList = true;
    boolean writeToConsole;
    boolean writeToFile;

    /** Page fetched by {@link #createConnection()}; {@code null} until then. */
    public static Document doc = null;
    // Not assigned anywhere in this class; kept because they are public.
    public static Elements tbodyElements = null;
    public static Elements elements = null;
    /** The {@code <td>} cells of the row most recently visited by {@link #parsingHTML()}. */
    public static Elements tdElements = null;
    /** The {@code <tr>} elements of the row most recently visited by {@link #parsingHTML()}. */
    public static Elements trElement2 = null;
    /** Cell collections of every row visited so far, in visiting order. */
    public static ArrayList<Elements> sampleList = new ArrayList<Elements>();
    /** Number of rows written to the output file by the latest {@link #parsingHTML()} call. */
    static int i = 0;

    private static final String PAGE_URL = "http://www.nccpl.com.pk/market-information/fipi-lipi/fipi";
    private static final String OUTPUT_PATH = "D:\\KSE\\NCCPL-YEARLY.csv";
    /** Written after every cell, including the last one of a row. */
    private static final String CELL_SEPARATOR = "   |   ";
    /** Written before every row except the first. */
    private static final String ROW_SEPARATOR = " \r\n ";

    /**
     * Fetches the FIPI page with an HTTP GET and stores it in {@link #doc}.
     *
     * <p>Sets the JVM-wide {@code http.proxyHost}/{@code http.proxyPort}
     * system properties to a hard-coded proxy before connecting.
     *
     * @throws IOException if the page cannot be fetched
     */
    public static void createConnection() throws IOException {
        // NOTE(review): the proxy address is environment-specific; confirm it
        // is still required wherever this runs.
        System.setProperty("http.proxyHost", "191.1.1.202");
        System.setProperty("http.proxyPort", "8080");
        doc = Jsoup.connect(PAGE_URL).timeout(10000).get();
        System.out.println("Successfully Connected");
    }

    /**
     * Writes every row of every element with class {@code table} in
     * {@link #doc} to the output file, replacing any previous export.
     *
     * <p>Each cell has its commas removed, is passed through
     * {@link #formatData(String)} and is followed by the cell separator. Rows
     * are separated by {@code " \r\n "}. Rows without {@code <td>} cells are
     * recorded in {@link #sampleList} but produce no output. Each written row
     * is also echoed to standard output.
     *
     * @throws IllegalStateException if {@link #createConnection()} has not
     *         loaded a document yet
     * @throws Exception if the output file cannot be opened or written
     */
    public static void parsingHTML() throws Exception {
        if (doc == null) {
            throw new IllegalStateException("No document loaded; call createConnection() first");
        }

        // Restart the row counter so a repeated call does not begin the fresh
        // file with a row separator.
        i = 0;

        // One writer for the whole export, closed even when a write fails.
        // Truncating on open replaces the earlier delete-then-append sequence.
        try (FileWriter writer = new FileWriter(new File(OUTPUT_PATH), false)) {
            for (Element table : doc.getElementsByClass("table")) {
                for (Element row : table.getElementsByTag("tr")) {
                    trElement2 = row.getElementsByTag("tr");
                    tdElements = row.getElementsByTag("td");
                    sampleList.add(tdElements);

                    if (tdElements.isEmpty()) {
                        continue;
                    }

                    String line = formatRow(tdElements);
                    if (i > 0) {
                        writer.append(ROW_SEPARATOR);
                    }
                    writer.append(line);
                    System.out.println(line);
                    i++;
                }
            }
        }
    }

    /** Renders one row: each cell without commas, date-normalised, followed by the cell separator. */
    private static String formatRow(Elements cells) {
        StringBuilder line = new StringBuilder();
        for (Element cell : cells) {
            // Commas are stripped from the cell text before it is written.
            String content = cell.text().replace(",", "");
            line.append(formatData(content)).append(CELL_SEPARATOR);
        }
        return line.toString();
    }

    /** Input shape recognised by {@link #formatData(String)}, e.g. {@code Jun/17 08:30}. */
    private static final SimpleDateFormat SLASH_DATE_FORMAT = new SimpleDateFormat("MMM/dd hh:mm", Locale.US);
    /** Output shape produced by {@link #formatData(String)}, e.g. {@code Jun-17 08:30}. */
    private static final SimpleDateFormat DASH_DATE_FORMAT = new SimpleDateFormat("MMM-dd hh:mm", Locale.US);

    /**
     * Rewrites a {@code MMM/dd hh:mm} timestamp as {@code MMM-dd hh:mm}.
     *
     * @param text cell text; may be {@code null}
     * @return the reformatted timestamp, or {@code text} unchanged when it is
     *         {@code null} or does not start with a timestamp in the input shape
     */
    public static String formatData(String text) {
        if (text == null) {
            return null;
        }
        try {
            Date parsed = SLASH_DATE_FORMAT.parse(text);
            return DASH_DATE_FORMAT.format(parsed);
        } catch (ParseException notADate) {
            // Most cells are not timestamps; pass them through untouched.
            return text;
        }
    }

    /**
     * Fetches the page and exports its tables.
     *
     * @param args ignored
     * @throws IOException if the page cannot be fetched
     * @throws Exception if the export fails
     */
    public static void main(String[] args) throws IOException, Exception {
        createConnection();
        parsingHTML();
    }
}

1 个答案:

答案 0 :(得分:2)

利用同一页面上的搜索表单,将 Jsoup 的连接方式改为提交表单参数的 POST 请求:

// Submit the page's search form with a POST instead of the plain GET used in
// createConnection(); per the answer, this is how a date range is requested.
doc = Jsoup.connect(tempUrl)
  // Range start, written day/month/year.
  .data("fromDate", "17/06/2015")
  // Range end, same day/month/year form.
  .data("toDate", "17/06/2016")
  // NOTE(review): the meaning of "101" is not shown here — confirm against the page's form.
  .data("type", "101")
  // Presumably the submit button's name/value pair — verify against the form markup.
  .data("search", "search")
  // Same timeout value as the original GET request.
  .timeout(10000)
  .post();