在Java中创建.CSV文件时,日期格式受到干扰

时间:2016-04-18 10:53:32

标签: java csv web-scraping web-crawler jsoup

我正在编写一个网页爬虫(Web scraper),并把抓取到的数据保存到 .CSV 文件中。程序本身运行正常,但我抓取数据的网站中,日期采用 (Month Day, Year) 格式,其中包含一个逗号。因此,当我把数据保存到 .CSV 文件时,Year 会被当成单独的一列,导致后面所有的数据都发生错位。我实际上希望把日期保存为 (MM-MON-YYYY) 格式,并让有效期只占一列。我的代码贴在下面,请帮帮我。谢谢!

P.S:我很抱歉没有在原帖中写出我想要的格式。

package com.mufapscraping;

//import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
//import java.util.Collections;
import java.util.Iterator;
//import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Scraper that downloads one hard-coded MUFAP page with jsoup and appends the text of its
 * table cells to a hard-coded CSV file.
 *
 * <p>Working state lives in public static fields that {@link #createConnection()} and
 * {@link #parsingHTML()} assign as they run.
 */
public class ComMufapScraping {

    // Instance flags; nothing in this class reads them.
    boolean writeCSVToConsole = true;
    boolean writeCSVToFile = true;
    //String destinationCSVFile = "C:\\convertedCSV.csv";
    boolean sortTheList = true;
    boolean writeToConsole;
    boolean writeToFile;
    // Page fetched by createConnection(); parsingHTML() reads it.
    public static Document doc = null;
    // The fields below are scratch state overwritten by parsingHTML() on every call / row.
    public static Elements tbodyElements = null;
    public static Elements elements = null;
    public static Elements tdElements = null;
    public static Elements trElement2 = null;
    // Not referenced anywhere in this class.
    public static String Dcomma = ", 2";
    // Receives the td collection of every row visited by parsingHTML().
    public static ArrayList<Elements> sampleList = new ArrayList<Elements>();

    /**
     * Sets the JVM-wide HTTP proxy system properties and fetches the page into {@link #doc}.
     *
     * @throws IOException propagated from the jsoup fetch
     */
    public static void createConnection() throws IOException {
        System.setProperty("http.proxyHost", "191.1.1.123");
        System.setProperty("http.proxyPort", "8080");
        String tempUrl = "http://www.mufap.com.pk/nav_returns_performance.php?tab=01";
        doc = Jsoup.connect(tempUrl).get();
    }

    /**
     * Walks the {@code tr} elements of the first {@code tbody} in {@link #doc} and appends the
     * text of each row's {@code td} cells to the CSV file.
     *
     * <p>Cell text is written verbatim, with no quoting or escaping, so a comma inside a cell
     * (such as the one in a "Month Day, Year" date) ends up in the file as a bare comma.
     *
     * @throws Exception if the document contains no {@code tbody}; I/O failures from the
     *                   writer also propagate
     */
    public static void parsingHTML() throws Exception {
        // Runs exactly once (i goes from 1 to 1).
        for (int i = 1; i <= 1; i++) {

            tbodyElements = doc.getElementsByTag("tbody");
            //Element table = doc.getElementById("dataTable");

            if (tbodyElements.isEmpty()) {
                throw new Exception("Table is not found");
            }
            elements = tbodyElements.get(0).getElementsByTag("tr");

            for (Element trElement : elements) {
                // NOTE(review): jsoup documents getElementsByTag as including the element
                // itself, so trElement2 presumably always contains at least trElement - confirm.
                trElement2 = trElement.getElementsByTag("tr");
                tdElements = trElement.getElementsByTag("td");
                // A new writer is opened for every row; the second argument (true) selects
                // append mode.
                FileWriter sb = new FileWriter("C:\\convertedCSV2.csv", true);
                // The inner loop below drains `it`, so this body runs at most once per row.
                for (Iterator<Element> it = tdElements.iterator(); it.hasNext();) {
                    // Always true here: the loop condition has just checked the same thing.
                    if (it.hasNext()) {
                        sb.append("  \n  ");
                    }
                    // The condition tests `it` (the td iterator), not `it2`; `it2` is never
                    // advanced, so this loop consumes every remaining cell of the row.
                    for (Iterator<Element> it2 = trElement2.iterator(); it.hasNext();) {
                        Element tdElement = it.next();
                        sb.append(tdElement.text());
                        // `it2` never moves, so this is true whenever trElement2 is non-empty;
                        // in that case a separator is also written after the last cell.
                        if (it2.hasNext()) {
                            sb.append("   ,   ");
                        }

                    }

                    // Prints the writer object's toString(), not the text written to the file
                    // (presumably the default Object identity string - confirm).
                    System.out.println(sb.toString());
                    // Only reached when the row has at least one td; for a row without td
                    // cells the writer opened above is never closed.
                    sb.flush();
                    sb.close();
                }

                // Records the row's cells and prints the boolean returned by add().
                System.out.println(sampleList.add(tdElements));
                /* for (Elements elements2 : zakazky) {
                System.out.println(elements2);
            }*/

            }
        }
    }

    /**
     * Entry point: fetches the page, then writes the CSV.
     */
    public static void main(String[] args) throws IOException, Exception {
        createConnection();
        parsingHTML();

    }

}

enter image description here

4 个答案:

答案 0 :(得分:1)

您可以使用 select 方法,而不必多次调用 getElementsByTag 方法;这样更简单,只需几行代码就能得到相同的输出:

/**
 * Fetches the MUFAP returns page and appends every row of the {@code #dataTable} body to the
 * CSV file: one line per {@code tr}, each {@code td} text followed by a {@code ;} separator.
 *
 * @param args unused
 * @throws IOException if the page cannot be fetched or the file cannot be written
 */
public static void main(String[] args) throws IOException {
    String tempUrl = "http://www.mufap.com.pk/nav_returns_performance.php?tab=01";
    Document doc = Jsoup.connect(tempUrl).get();

    Elements trElements = doc.select("#dataTable tbody tr");
    // try-with-resources flushes and closes the writer even when an append fails; without the
    // close, buffered rows may never reach the file and the file handle leaks.
    try (FileWriter sb = new FileWriter("C:\\convertedCSV2.csv", true)) {
        for (Element tr : trElements) {
            Elements tdElements = tr.select("td");
            for (Element td : tdElements) {
                sb.append(td.text());
                // ';' is used as the field separator so the comma inside the date stays in one column.
                sb.append(";");
            }
            sb.append("\n");
        }
    }
}

答案 1 :(得分:1)

不要把元素文本直接追加到 FileWriter 中,而是先对其进行格式化,然后再追加。

因此,请把下面的第一行替换为第二行,并在类中加入随后给出的两个格式化器和 formatData 方法:

sb.append(tdElement.text());

sb.append(formatData(tdElement.text()));
/** Parses dates in the form used by the scraped page, e.g. {@code Apr 18, 2016}. */
private static final SimpleDateFormat FORMATTER_MMM_d_yyyy = new SimpleDateFormat("MMM d, yyyy", Locale.US);
/**
 * Writes dates as e.g. {@code 18-Apr-2016}. The year uses {@code yyyy} (calendar year);
 * {@code YYYY} would be the week year, which differs from the calendar year for days
 * around New Year (Dec 31, 2018 would be written as 31-Dec-2019).
 */
private static final SimpleDateFormat FORMATTER_dd_MMM_yyyy = new SimpleDateFormat("dd-MMM-yyyy", Locale.US);

/**
 * Converts a {@code MMM d, yyyy} date into {@code dd-MMM-yyyy}; any text that does not parse
 * as such a date is returned unchanged, so the method can be applied to every cell.
 *
 * <p>The shared {@link SimpleDateFormat} instances are not thread-safe; call this from a
 * single thread only.
 *
 * @param text cell text to convert
 * @return the reformatted date, or {@code text} itself when it is not a date
 */
public static String formatData(String text) {
    String tmp = null;

    try {
        Date d = FORMATTER_MMM_d_yyyy.parse(text);
        tmp = FORMATTER_dd_MMM_yyyy.format(d);
    } catch (ParseException pe) {
        // Not a date: pass the original text through untouched.
        tmp = text;
    }

    return tmp;
}

样品

/**
 * Demo driver: for each sample cell value, prints the raw value, then the result of
 * {@code formatData} on it, then a blank line.
 */
public static void main(String[] args) {
    final String[] samples = {"ABL Cash Fund", "AA(f)", "Apr 18, 2016", "10.4729"};

    for (int index = 0; index < samples.length; index++) {
        final String raw = samples[index];
        final String converted = formatData(raw);
        System.out.format("%s\n%s\n\n", raw, converted);
    }
}

输出

ABL Cash Fund
ABL Cash Fund

AA(f)
AA(f)

Apr 18, 2016
18-Apr-2016

10.4729
10.4729

答案 2 :(得分:0)

这可以通过简单地用双引号括起您的数据来实现,因此month day, year将成为"month day, year"。这是为您完成工作的修改后的代码:

package com.mufapscraping;

//import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
//import java.util.Collections;
import java.util.Iterator;
//import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Scraper that downloads one hard-coded MUFAP page with jsoup and appends its table rows to a
 * hard-coded CSV file, with every field enclosed in double quotes so that a comma inside a
 * cell (such as the one in a "Month Day, Year" date) stays within a single column.
 *
 * <p>Working state lives in public static fields that {@link #createConnection()} and
 * {@link #parsingHTML()} assign as they run.
 */
public class ComMufapScraping {

    // Instance flags; nothing in this class reads them.
    boolean writeCSVToConsole = true;
    boolean writeCSVToFile = true;
    boolean sortTheList = true;
    boolean writeToConsole;
    boolean writeToFile;
    // Page fetched by createConnection(); parsingHTML() reads it.
    public static Document doc = null;
    // Scratch state overwritten by parsingHTML().
    public static Elements tbodyElements = null;
    public static Elements elements = null;
    public static Elements tdElements = null;
    public static Elements trElement2 = null;
    // Not referenced anywhere in this class.
    public static String Dcomma = ", 2";
    // Receives the td collection of every row visited by parsingHTML().
    public static ArrayList<Elements> sampleList = new ArrayList<Elements>();

    /**
     * Sets the JVM-wide HTTP proxy system properties and fetches the page into {@link #doc}.
     *
     * @throws IOException propagated from the jsoup fetch
     */
    public static void createConnection() throws IOException {
        System.setProperty("http.proxyHost", "191.1.1.123");
        System.setProperty("http.proxyPort", "8080");
        String tempUrl = "http://www.mufap.com.pk/nav_returns_performance.php?tab=01";
        doc = Jsoup.connect(tempUrl).get();
    }

    /**
     * Appends one CSV line per table row to the output file and echoes it to the console.
     *
     * <p>Rows are taken from the {@code tr} elements of the first {@code tbody} in
     * {@link #doc}; rows without {@code td} cells produce no output. Fields are separated by a
     * bare comma: padding around the separator would place spaces before the opening quote, and
     * many CSV readers then stop treating the quotes as field delimiters.
     *
     * @throws Exception if the document contains no {@code tbody}; I/O failures from the
     *                   writer also propagate
     */
    public static void parsingHTML() throws Exception {
        tbodyElements = doc.getElementsByTag("tbody");

        if (tbodyElements.isEmpty()) {
            throw new Exception("Table is not found");
        }
        elements = tbodyElements.get(0).getElementsByTag("tr");

        // One writer for the whole run, opened in append mode; try-with-resources flushes and
        // closes it even when a row fails or contains no cells.
        try (FileWriter sb = new FileWriter("C:\\convertedCSV2.csv", true)) {
            for (Element trElement : elements) {
                trElement2 = trElement.getElementsByTag("tr");
                tdElements = trElement.getElementsByTag("td");
                sampleList.add(tdElements);

                if (tdElements.isEmpty()) {
                    continue;
                }

                StringBuilder row = new StringBuilder();
                for (Element tdElement : tdElements) {
                    if (row.length() > 0) {
                        row.append(',');
                    }
                    row.append(quote(tdElement.text()));
                }

                // Echo the actual CSV line rather than the writer object.
                System.out.println(row);
                sb.append(row).append("\n");
            }
        }
    }

    /**
     * Surrounds a field with double quotes, doubling any quote it already contains.
     */
    private static String quote(String value) {
        return "\"" + value.replace("\"", "\"\"") + "\"";
    }

    /**
     * Entry point: fetches the page, then writes the CSV.
     */
    public static void main(String[] args) throws IOException, Exception {
        createConnection();
        parsingHTML();
    }

}

答案 3 :(得分:0)

既然你想把年份拆成单独的一列,那么可以修改第一行(表头),为它增加一个“year”列:

Element tdElement = it.next();
final String content = tdElement.text();
sb.append(content);
if (it2.hasNext()) {
    sb.append("   ,   ");
}
// The date's own comma pushes the year into an extra column, so give that column a header
// right after the "Validity Date" header cell.
if (content.equals("Validity Date")) {
    sb.append("Validity Year,");
}

你可能还需要在循环之后加上 break?否则你将覆盖文件 elements.size() - 1 次……

// Opens the CSV file; the second constructor argument (true) selects append mode.
FileWriter sb = new FileWriter("C:\\convertedCSV2.csv", true);
// Sketch only: the body of the cell-writing loop is elided here.
for (Iterator<Element> it = tdElements.iterator(); it.hasNext();) { ... }
// Leaves whichever loop encloses this fragment after a single pass
// (presumably the per-row loop - confirm against the full method).
break;