我创建了一个从网站上抓取数据的网络抓取工具。问题是,在该网站上既可以查看当天的数据,也可以查看整个财政年度的数据,而我的抓取工具只能获取当天的数据,无法获取全年的数据。例如,如果我想检索从 '01-July-2015' 到今天的数据,我的抓取工具仍然只能获取当天的数据。
链接:http://www.nccpl.com.pk/market-information/fipi-lipi/fipi
下面是包含待抓取数据的页面截图。
以下是我的代码
package nccpl_fipi_yearly;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.Locale;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Scrapes the FIPI table from the NCCPL market-information page and writes its rows to a
 * pipe-separated text file.
 *
 * <p>Typical use: call one of the {@code createConnection} methods to load the page into
 * {@link #doc}, then call {@link #parsingHTML()} to write the table rows to disk.
 */
public class Nccpl_fipi_yearly {

    /** Page that is fetched; the date-range search form is submitted to the same URL. */
    private static final String FIPI_URL = "http://www.nccpl.com.pk/market-information/fipi-lipi/fipi";

    /** Connection timeout passed to jsoup, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 10000;

    /** Destination file; it is recreated from scratch on every {@link #parsingHTML()} call. */
    private static final String OUTPUT_PATH = "D:\\KSE\\NCCPL-YEARLY.csv";

    /** Written between two consecutive rows (never before the first row of a file). */
    private static final String ROW_SEPARATOR = " \r\n ";

    /** Written after every cell of a row, including the last one. */
    private static final String CELL_SEPARATOR = " | ";

    // Output toggles. Nothing in this class reads them; they are kept so that any code in the
    // same package that references them keeps compiling.
    boolean writeCSVToConsole = true;
    boolean writeCSVToFile = true;
    boolean sortTheList = true;
    boolean writeToConsole;
    boolean writeToFile;

    /** The most recently fetched page; {@code null} until a connection method has succeeded. */
    public static Document doc = null;
    /** Never assigned by this class. */
    public static Elements tbodyElements = null;
    /** Never assigned by this class. */
    public static Elements elements = null;
    /** The {@code <td>} cells of the row most recently visited by {@link #parsingHTML()}. */
    public static Elements tdElements = null;
    /** The {@code <tr>} elements found at or below the row most recently visited. */
    public static Elements trElement2 = null;
    /** The cell lists of every row visited so far, in visiting order. */
    public static ArrayList<Elements> sampleList = new ArrayList<Elements>();
    /** Running count of rows written to the output file across all calls. */
    static int i = 0;

    /**
     * Loads the page with a plain GET request and stores it in {@link #doc}.
     *
     * @throws IOException if the page cannot be fetched
     */
    public static void createConnection() throws IOException {
        configureProxy();
        doc = Jsoup.connect(FIPI_URL).timeout(TIMEOUT_MILLIS).get();
        System.out.println("Successfully Connected");
    }

    /**
     * Loads the page by submitting its search form with a date range (POST) and stores the
     * response in {@link #doc}. Use this instead of {@link #createConnection()} when more than
     * the default page content is wanted.
     *
     * @param fromDate first day of the range, written like {@code 17/06/2015}
     * @param toDate last day of the range, written like {@code 17/06/2016}
     * @throws IOException if the form cannot be submitted or the response cannot be read
     */
    public static void createConnection(String fromDate, String toDate) throws IOException {
        configureProxy();
        doc = Jsoup.connect(FIPI_URL)
                .data("fromDate", fromDate)
                .data("toDate", toDate)
                // NOTE(review): "101" is the value the search form is known to accept for this
                // report; its exact meaning is not visible here - confirm against the page.
                .data("type", "101")
                .data("search", "search")
                .timeout(TIMEOUT_MILLIS)
                .post();
        System.out.println("Successfully Connected");
    }

    /** Routes HTTP traffic of this JVM through the fixed proxy used by both connection methods. */
    private static void configureProxy() {
        System.setProperty("http.proxyHost", "191.1.1.202");
        System.setProperty("http.proxyPort", "8080");
    }

    /**
     * Writes every table row of {@link #doc} that has at least one {@code <td>} cell to the
     * output file and echoes each written row to the console.
     *
     * <p>Only elements with CSS class {@code table} are searched. Rows without {@code <td>}
     * cells are recorded in {@link #sampleList} but produce no output.
     *
     * @throws IllegalStateException if no page has been loaded yet
     * @throws Exception if the output file cannot be written
     */
    public static void parsingHTML() throws Exception {
        if (doc == null) {
            throw new IllegalStateException("No page loaded; call createConnection() first");
        }
        int rowsInThisFile = 0;
        // A single writer for the whole run: opening without append truncates any previous file,
        // and try-with-resources closes the handle even when a row has no cells or a write fails.
        try (FileWriter writer = new FileWriter(new File(OUTPUT_PATH))) {
            for (Element table : doc.getElementsByClass("table")) {
                for (Element row : table.getElementsByTag("tr")) {
                    trElement2 = row.getElementsByTag("tr");
                    tdElements = row.getElementsByTag("td");
                    sampleList.add(tdElements);
                    if (tdElements.isEmpty()) {
                        continue;
                    }
                    String line = buildRow(tdElements);
                    if (rowsInThisFile > 0) {
                        writer.append(ROW_SEPARATOR);
                    }
                    writer.append(line);
                    System.out.println(line);
                    rowsInThisFile++;
                    i++;
                }
            }
        }
    }

    /** Joins the cells of one row: commas are removed and every cell is followed by " | ". */
    private static String buildRow(Elements cells) {
        StringBuilder line = new StringBuilder();
        for (Element cell : cells) {
            String content = cell.text().replace(",", "");
            line.append(formatData(content)).append(CELL_SEPARATOR);
        }
        return line.toString();
    }

    // SimpleDateFormat is not thread-safe; these instances are only safe while formatData is
    // called from a single thread.
    private static final SimpleDateFormat SLASH_MONTH_DAY_TIME = new SimpleDateFormat("MMM/dd hh:mm", Locale.US);
    private static final SimpleDateFormat DASH_MONTH_DAY_TIME = new SimpleDateFormat("MMM-dd hh:mm", Locale.US);

    /**
     * Rewrites text that starts with a {@code MMM/dd hh:mm} timestamp as {@code MMM-dd hh:mm}.
     *
     * @param text cell text to convert
     * @return the reformatted timestamp, or {@code text} unchanged when it does not parse
     */
    public static String formatData(String text) {
        try {
            Date parsed = SLASH_MONTH_DAY_TIME.parse(text);
            return DASH_MONTH_DAY_TIME.format(parsed);
        } catch (ParseException notATimestamp) {
            // Most cells are plain numbers or labels; pass them through untouched.
            return text;
        }
    }

    /**
     * Fetches the page and writes its table to the output file.
     *
     * @param args optional: {@code fromDate toDate} (e.g. {@code 01/07/2015 17/06/2016}) to
     *     request a date range; with fewer than two arguments the default page is fetched
     * @throws IOException if the page cannot be fetched
     * @throws Exception if the output file cannot be written
     */
    public static void main(String[] args) throws IOException, Exception {
        if (args != null && args.length >= 2) {
            createConnection(args[0], args[1]);
        } else {
            createConnection();
        }
        parsingHTML();
    }
}
答案 0 :(得分:2)
修改 Jsoup 的连接方式,改为以 POST 方式提交同一页面中的搜索表单:
// Submit the page's own search form with POST instead of issuing a plain GET, passing the
// wanted date range as form fields. The sample dates are written day/month/year.
// NOTE(review): the meaning of type=101 is not shown here - confirm against the form on the page.
doc = Jsoup.connect(tempUrl)
        .timeout(10000)
        .data(
                "fromDate", "17/06/2015",
                "toDate", "17/06/2016",
                "type", "101",
                "search", "search")
        .post();