我正在编写一个 Web scraper,并把抓取到的数据存储到 .CSV 文件中。
我的程序可以正常运行,但我抓取数据的网站有一个问题:日期采用 (Month Day, Year) 格式。
因此,当我把数据保存到 .CSV 文件时,日期中的逗号会让 Year 被当作另一列,导致后面所有数据都错位。我实际上想把这些数据存储为 (MM-MON-YYYY) 格式,
并让 Validity Date(有效期)保存在同一列中。我在下面贴出我的代码。请帮助我。谢谢!
P.S:我很抱歉没有在原帖中写出我想要的格式。
package com.mufapscraping;
//import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
//import java.util.Collections;
import java.util.Iterator;
//import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Scraper from the question: downloads one page with Jsoup, walks the rows of the first
 * {@code <tbody>} and appends the text of each row's {@code <td>} cells to
 * {@code C:\convertedCSV2.csv}, separated by {@code " , "}.
 *
 * All working state is kept in public static fields that the static methods overwrite.
 */
public class ComMufapScraping {
// Instance flags; no method in this class reads them.
boolean writeCSVToConsole = true;
boolean writeCSVToFile = true;
//String destinationCSVFile = "C:\\convertedCSV.csv";
boolean sortTheList = true;
boolean writeToConsole;
boolean writeToFile;
// Page fetched by createConnection(); parsingHTML() reads it.
public static Document doc = null;
// Scratch state written by parsingHTML(): all tbody elements, the rows of the first tbody,
// and the cells / "tr" matches of the row currently being processed.
public static Elements tbodyElements = null;
public static Elements elements = null;
public static Elements tdElements = null;
public static Elements trElement2 = null;
// Not referenced anywhere else in this class.
public static String Dcomma = ", 2";
// Collects the td cells of every processed row.
public static ArrayList<Elements> sampleList = new ArrayList<Elements>();
/**
 * Sets the JVM-wide HTTP proxy properties and loads the page into {@link #doc}.
 *
 * @throws IOException if Jsoup cannot fetch the page
 */
public static void createConnection() throws IOException {
System.setProperty("http.proxyHost", "191.1.1.123");
System.setProperty("http.proxyPort", "8080");
String tempUrl = "http://www.mufap.com.pk/nav_returns_performance.php?tab=01";
doc = Jsoup.connect(tempUrl).get();
}
/**
 * Writes the cells of every row of the first {@code <tbody>} in {@link #doc} to the CSV file.
 * Requires createConnection() to have populated {@link #doc} first.
 *
 * @throws Exception if the document contains no {@code <tbody>}, or on I/O failure
 */
public static void parsingHTML() throws Exception {
// The bounds 1..1 make this loop body run exactly once.
for (int i = 1; i <= 1; i++) {
tbodyElements = doc.getElementsByTag("tbody");
//Element table = doc.getElementById("dataTable");
if (tbodyElements.isEmpty()) {
throw new Exception("Table is not found");
}
elements = tbodyElements.get(0).getElementsByTag("tr");
for (Element trElement : elements) {
// NOTE(review): presumably this matches the row itself (plus any nested rows), since jsoup's
// getElementsByTag is documented to include the element it is called on - confirm.
trElement2 = trElement.getElementsByTag("tr");
tdElements = trElement.getElementsByTag("td");
// A new writer is opened for every row; the second argument (true) selects append mode.
FileWriter sb = new FileWriter("C:\\convertedCSV2.csv", true);
for (Iterator<Element> it = tdElements.iterator(); it.hasNext();) {
// Always true here: the loop condition just checked the same iterator.
if (it.hasNext()) {
sb.append(" \n ");
}
// The condition tests `it`, not `it2`, so this inner loop drains every remaining cell of the row.
// `it2` is never advanced, so it2.hasNext() gives the same answer on every pass and " , " is
// written after every cell (including the last) whenever trElement2 is non-empty.
for (Iterator<Element> it2 = trElement2.iterator(); it.hasNext();) {
Element tdElement = it.next();
sb.append(tdElement.text());
if (it2.hasNext()) {
sb.append(" , ");
}
}
// NOTE(review): sb is a FileWriter, not a StringBuilder, so this does not print the row text -
// presumably it prints the default Object.toString() of the writer; confirm.
System.out.println(sb.toString());
sb.flush();
// The writer is closed inside the outer cell loop. That loop ends right after, because the inner
// loop has already drained `it`. For a row without any td the loop body never runs and this
// writer is never closed.
sb.close();
}
// Prints the boolean returned by add(), not the cells.
System.out.println(sampleList.add(tdElements));
/* for (Elements elements2 : zakazky) {
System.out.println(elements2);
}*/
}
}
}
/** Entry point: fetch the page, then export it. */
public static void main(String[] args) throws IOException, Exception {
createConnection();
parsingHTML();
}
}
答案 0 :(得分:1)
与其多次调用 getElementsByTag 方法,不如使用 select 方法,这样更简单,只需几行代码就能得到同样的输出
/**
 * Downloads the MUFAP performance page and appends every row matched by
 * {@code #dataTable tbody tr} to {@code C:\convertedCSV2.csv}, one line per row, with a
 * {@code ;} written after each cell.
 *
 * @param args unused
 * @throws IOException if the page cannot be fetched or the file cannot be written
 */
public static void main(String[] args) throws IOException {
    String tempUrl = "http://www.mufap.com.pk/nav_returns_performance.php?tab=01";
    Document doc = Jsoup.connect(tempUrl).get();
    Elements trElements = doc.select("#dataTable tbody tr");
    // try-with-resources closes (and thereby flushes) the writer even when a write fails;
    // without the close, buffered rows may never reach the file.
    try (FileWriter sb = new FileWriter("C:\\convertedCSV2.csv", true)) {
        for (Element tr : trElements) {
            Elements tdElements = tr.select("td");
            for (Element td : tdElements) {
                sb.append(td.text());
                sb.append(";");
            }
            sb.append("\n");
        }
    }
}
答案 1 :(得分:1)
不要把元素文本直接追加到 FileWriter 中,而是先对其进行格式化,然后再追加。
因此,请将下面这一行:
sb.append(tdElement.text());
替换为:
sb.append(formatData(tdElement.text()));
// NOTE: SimpleDateFormat is not thread-safe; these shared instances are only suitable for
// single-threaded use.

/** Parses the date layout used by the scraped site, e.g. {@code Apr 18, 2016}. */
private static final SimpleDateFormat FORMATTER_MMM_d_yyyy = new SimpleDateFormat("MMM d, yyyy", Locale.US);

/**
 * Produces the comma-free output layout, e.g. {@code 18-Apr-2016}. The year letters must be
 * lower-case {@code yyyy} (calendar year); upper-case {@code YYYY} means week-based year and
 * yields the wrong year for dates in the last days of December.
 */
private static final SimpleDateFormat FORMATTER_dd_MMM_yyyy = new SimpleDateFormat("dd-MMM-yyyy", Locale.US);

/**
 * Reformats a cell that holds a date in {@code MMM d, yyyy} form to {@code dd-MMM-yyyy};
 * any other text is returned unchanged.
 *
 * @param text the raw cell text
 * @return the reformatted date, or {@code text} itself when it does not parse as a date
 */
public static String formatData(String text) {
    try {
        Date d = FORMATTER_MMM_d_yyyy.parse(text);
        return FORMATTER_dd_MMM_yyyy.format(d);
    } catch (ParseException notADate) {
        // Expected for every non-date column: pass the value through untouched.
        return text;
    }
}
/**
 * Demo driver: for each sample cell value, prints the raw value, then the value returned by
 * formatData for it, then a blank line.
 *
 * @param args unused
 */
public static void main(String[] args) {
    final String[] samples = {
        "ABL Cash Fund",
        "AA(f)",
        "Apr 18, 2016",
        "10.4729"
    };
    for (int idx = 0; idx < samples.length; idx++) {
        final String raw = samples[idx];
        final String formatted = formatData(raw);
        System.out.format("%s\n%s\n\n", raw, formatted);
    }
}
ABL Cash Fund
ABL Cash Fund
AA(f)
AA(f)
Apr 18, 2016
18-Apr-2016
10.4729
10.4729
答案 2 :(得分:0)
只需用双引号把数据括起来即可实现,这样 month day, year 就会变成 "month day, year",其中的逗号不再被当作列分隔符。下面是为您完成这项工作的修改后的代码:
package com.mufapscraping;
//import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
//import java.util.Collections;
import java.util.Iterator;
//import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Downloads the MUFAP performance page and appends the rows of its first {@code <tbody>} to a
 * CSV file. Every field is wrapped in double quotes so that a comma inside a value (such as the
 * date {@code Apr 18, 2016}) stays in one column.
 *
 * Typical use: call {@link #createConnection()} and then {@link #parsingHTML()}.
 */
public class ComMufapScraping {
    // Instance flags kept for compatibility; no method in this class reads them.
    boolean writeCSVToConsole = true;
    boolean writeCSVToFile = true;
    boolean sortTheList = true;
    boolean writeToConsole;
    boolean writeToFile;

    /** Page loaded by {@link #createConnection()}. */
    public static Document doc = null;
    /** All {@code <tbody>} elements of the page; set by {@link #parsingHTML()}. */
    public static Elements tbodyElements = null;
    /** Rows of the first {@code <tbody>}; set by {@link #parsingHTML()}. */
    public static Elements elements = null;
    /** Cells of the row processed last; set by {@link #parsingHTML()}. */
    public static Elements tdElements = null;
    /** {@code tr} matches of the row processed last; kept only because the field is public. */
    public static Elements trElement2 = null;
    /** Not used by this class; kept because the field is public. */
    public static String Dcomma = ", 2";
    /** Cells of every processed row, in document order. */
    public static ArrayList<Elements> sampleList = new ArrayList<Elements>();

    private static final String CSV_PATH = "C:\\convertedCSV2.csv";

    /**
     * Sets the JVM-wide HTTP proxy properties and loads the page into {@link #doc}.
     *
     * @throws IOException if the page cannot be fetched
     */
    public static void createConnection() throws IOException {
        System.setProperty("http.proxyHost", "191.1.1.123");
        System.setProperty("http.proxyPort", "8080");
        String tempUrl = "http://www.mufap.com.pk/nav_returns_performance.php?tab=01";
        doc = Jsoup.connect(tempUrl).get();
    }

    /**
     * Appends one CSV line per table row that has {@code <td>} cells to the output file and
     * echoes each line to the console.
     *
     * @throws Exception if the page has no {@code <tbody>}, or if writing the file fails
     */
    public static void parsingHTML() throws Exception {
        tbodyElements = doc.getElementsByTag("tbody");
        if (tbodyElements.isEmpty()) {
            throw new Exception("Table is not found");
        }
        elements = tbodyElements.get(0).getElementsByTag("tr");

        // One writer for the whole export; try-with-resources closes it even if a row fails.
        try (FileWriter sb = new FileWriter(CSV_PATH, true)) {
            for (Element trElement : elements) {
                trElement2 = trElement.getElementsByTag("tr");
                tdElements = trElement.getElementsByTag("td");
                if (!tdElements.isEmpty()) {
                    String row = toCsvRow(tdElements);
                    sb.append(row).append("\n");
                    System.out.println(row);
                }
                sampleList.add(tdElements);
            }
        }
    }

    /** Joins the text of the given cells into one CSV line (no line terminator). */
    private static String toCsvRow(Elements cells) {
        StringBuilder row = new StringBuilder();
        for (Iterator<Element> it = cells.iterator(); it.hasNext();) {
            row.append(quote(it.next().text()));
            // The delimiter is a bare comma between fields only: a space next to a quote would
            // stop CSV readers from treating the field as quoted, and a trailing delimiter
            // would create an empty extra column.
            if (it.hasNext()) {
                row.append(',');
            }
        }
        return row.toString();
    }

    /** Wraps a value in double quotes, doubling any double quote it already contains. */
    private static String quote(String value) {
        return '"' + value.replace("\"", "\"\"") + '"';
    }

    /** Entry point: fetch the page, then export it. */
    public static void main(String[] args) throws IOException, Exception {
        createConnection();
        parsingHTML();
    }
}
答案 3 :(得分:0)
那么你是想把它拆成两列。好的,那就修改第一行(表头),增加一个“year”列:
Element tdElement = it.next();
final String content = tdElement.text();
sb.append(content);
if (it2.hasNext()) {
    sb.append(" , ");
    // After the "Validity Date" header cell, write an extra header so the year that follows
    // the comma inside each date gets a column name of its own.
    if (content.equals("Validity Date")) {
        sb.append("Validity Year,");
    }
}
另外,你可能需要在之后加一个 break?否则你会把文件覆盖 elements.size() - 1 次……
// Sketch of the suggested change; the body of the cell loop is elided ("...").
FileWriter sb = new FileWriter("C:\\convertedCSV2.csv", true);
for (Iterator<Element> it = tdElements.iterator(); it.hasNext();) { ... }
// Exits whatever loop encloses this snippet (not shown here) after the first pass.
// NOTE(review): the writer above is created with append=true (second constructor argument), so
// later passes would presumably append to the file rather than overwrite it - confirm that
// stopping after the first row is really what is wanted.
break;