CSVReader的readNext()函数不循环遍历csv的所有行[编辑:如何处理错误的CSV(删除未转义的引号)]

时间:2015-12-17 05:56:16

标签: java csv error-handling opencsv

// Reader configured with ',' as separator and '"' as the quote character.
FileReader fr = new FileReader(inp);
CSVReader reader = new CSVReader(fr, ',', '"');

// writer
File writtenFromWhile = new File(dliRootPath + writtenFromWhilePath);
writtenFromWhile.createNewFile();
CSVWriter writeFromWhile = new CSVWriter(new FileWriter(writtenFromWhile), ',', '"');

// NOTE(review): readNext() returns one parsed CSV *record*, while getLinesRead()
// counts the physical lines consumed from the file. A quoted field containing an
// embedded newline — or an unescaped quote that glues several lines into one
// field — makes one record span many physical lines, so the two counters below
// can legitimately differ. TODO confirm against the opencsv CSVReader javadoc.
int insideWhile = 0;
String[] currRow = null;
while ((currRow = reader.readNext()) != null) {
    insideWhile++;                     // counts parsed records
    writeFromWhile.writeNext(currRow); // echo each record to the output CSV
}
System.out.println("inside While: " + insideWhile);
System.out.println("lines read (acc.to CSV reader): " + reader.getLinesRead());

输出结果为:

inside While: 162199
lines read (acc.to CSV reader): 256865

即使所有行都确实写入了输出CSV(在文本编辑器中查看是完整的;Excel显示的行数要少得多),while循环的迭代次数仍与输入CSV中的行数不同。我的主要目标是在这个逐行处理的while循环中实现一些其他逻辑。我已经(在更大的那份代码上)调试了两天,没有任何结果。

请解释为什么while循环没有迭代256865次,以及我要如何才能让它循环256865次。

参考数据,完整图片

这是我在上面的代码段中读取的CSV。

我的完整程序尝试根据"标题"和"作者"两个字段对这个CSV与旧CSV做差集:如果某条记录的作者和标题与旧CSV中的某条记录相同,即使其他字段不同,也视为重复,不应写入输出文件;旧CSV中不存在的记录则应写入输出文件。这是我的完整代码(差集应约有300000条记录,但我的代码的输出文件只有约210000条):

//TODO ask id
/*(*
 * id also there in fields getting matched (thisRow[0] is id)
 * u can replace it by thisRow[fielAnd Column.get(0)] to eliminate  id
 */

package mainOne;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import com.opencsv.CSVReader;
import com.opencsv.CSVWriter;

/**
 * Computes the difference between a "new" DLI CSV and a set of "old" (splitted)
 * CSV files: every record of the new file whose id is absent from the old set,
 * or whose configured fields (author/title) differ from the old record with the
 * same id, is written to the unique-output CSV.
 *
 * <p>Thread-safety: none — all state is static and mutated freely; run single-threaded.
 */
public class Diff_V3 {
    // All file names below are resolved against this root directory.
    static String dliRootPath = "/home/gurnoor/Incoming/Untitled Folder 2/";
    static String dli = "new-dli-IITG.csv";
    static String oldDli = "dli-iisc.csv";
    static String newFile = "newSampleFile.csv";// not used
    static String unqFile = "UniqueFileFinal.csv";
    static String log = "Diff_V3_log.txt";
    static String splittedNewDliDir = "/home/gurnoor/Incoming/Untitled Folder 2/splitted new file";
    static String splittedOldDliDir = "/home/gurnoor/Incoming/Untitled Folder 2/splitted old file";

    // debug
    static String testFilePath = "testFile.csv";
    static int insidepopulateMapFromSplittedCSV = 0;

    public static void main(String[] args) throws IOException, CustomException {
        File logFile = new File(dliRootPath + log);
        logFile.createNewFile();
        new File(dliRootPath + testFilePath).createNewFile();

        // Headings of the columns that decide whether two records are duplicates.
        List<String> fieldsToBeMatched = new ArrayList<>();
        fieldsToBeMatched.add("dc.contributor.author[]");
        fieldsToBeMatched.add("dc.title[]");
        filterUniqueFileds(new File(splittedNewDliDir), new File(splittedOldDliDir), fieldsToBeMatched);
    }

    /**
     * Streams the new CSV ({@code dliRootPath + dli}, '|'-separated) and writes to
     * {@code unqFile} every record that is not a duplicate of an old record.
     *
     * <p>NOTE(review): {@code newDir} is currently unused — the new file is read
     * from {@code dliRootPath + dli} directly; confirm which source is intended.
     *
     * @param newDir            directory of splitted new files (unused, see note)
     * @param oldDir            directory of splitted old files
     * @param fieldsToBeMatched column headings compared to decide duplication
     * @throws CustomException  when a field to be matched is missing from a heading row
     */
    private static void filterUniqueFileds(File newDir, File oldDir, List<String> fieldsToBeMatched)
            throws IOException, CustomException {

        CSVReader reader = new CSVReader(new FileReader(new File(dliRootPath + dli)), '|');

        // writer
        File unqFileOp = new File(dliRootPath + unqFile);
        unqFileOp.createNewFile();
        CSVWriter writer = new CSVWriter(new FileWriter(unqFileOp), '|');

        // logWriter
        BufferedWriter logWriter = new BufferedWriter(new FileWriter(new File(dliRootPath + log)));

        String[] headingRow = reader.readNext();
        writer.writeNext(headingRow);
        int headingLen = headingRow.length;

        // old list
        System.out.println("[INFO] reading old list...");
        Map<String, List<String>> oldMap = new HashMap<>();
        oldMap = populateMapFromSplittedCSV(oldMap, oldDir);
        System.out.println("[INFO] Read old List. Size = " + oldMap.size());
        printMapToCSV(oldMap, dliRootPath + testFilePath);

        // The old heading row ends up in the map keyed by its own id cell ("id").
        List<String> oldHeadingList = oldMap.get("id");
        if (oldHeadingList == null) {
            reader.close();
            writer.close();
            throw new CustomException("heading row (key \"id\") not found in old files");
        }
        // BUG FIX: List.toArray() returns Object[]; casting that to String[] throws
        // ClassCastException at runtime. Use the typed overload instead.
        String[] oldHeadingRow = oldHeadingList.toArray(new String[0]);

        // map of fieldName -> column index, per file
        Map<String, Integer> fieldAndColumnNoInNew = new HashMap<>(getColumnNo(fieldsToBeMatched, headingRow));
        Map<String, Integer> fieldAndColumnNoInOld = new HashMap<>(getColumnNo(fieldsToBeMatched, oldHeadingRow));
        // error check: did columnNo get populated?
        if (fieldAndColumnNoInNew.isEmpty()) {
            reader.close();
            writer.close();
            throw new CustomException("field to be matched not present in input CSV");
        }

        int noOfRecordsInOldList = 0, noOfRecordsWritten = 0, checkManually = 0;
        String[] thisRow;
        while ((thisRow = reader.readNext()) != null) {

            // Ragged record (unescaped quotes etc.) — log and skip.
            if (thisRow.length != headingLen) {
                String error = "Line no: " + reader.getLinesRead() + " in file: " + dliRootPath + dli
                        + " not read. Check manually";
                System.err.println(error);
                logWriter.append(error + "\n");
                logWriter.flush();
                checkManually++;
                continue;
            }

            // Write if the id is not present in the old map at all.
            if (!oldMap.containsKey(thisRow[0])) {
                writer.writeNext(thisRow);
                writer.flush();
                noOfRecordsWritten++;
            } else {
                // Same id exists in the old set: compare the configured fields.
                List<String> twinRow = oldMap.get(thisRow[0]);
                boolean writtenToOp = false;
                // BUG FIX: the old code paired the m-th key of two separate HashMap
                // keySets, which has no ordering guarantee. Iterate the requested
                // fields and look each one up in BOTH maps so the same field is
                // always compared against itself.
                for (String field : fieldsToBeMatched) {
                    Integer columnInNew = fieldAndColumnNoInNew.get(field);
                    Integer columnInOld = fieldAndColumnNoInOld.get(field);
                    if (columnInNew == null || columnInOld == null) {
                        reader.close();
                        writer.close();
                        throw new CustomException("something is really wrong");
                    }
                    String currFieldTwin = twinRow.get(columnInOld.intValue());
                    String currField = thisRow[columnInNew.intValue()];
                    if (!areEqual(currField, currFieldTwin)) {
                        // Any matched field differs -> not a duplicate, keep it.
                        writer.writeNext(thisRow);
                        writer.flush();
                        writtenToOp = true;
                        noOfRecordsWritten++;
                        System.out.println(noOfRecordsWritten);
                        break;
                    }
                }
                if (!writtenToOp) {
                    noOfRecordsInOldList++;
                }
            }
        }
        System.out.println("--------------------------------------------------------\nDebug info");
        System.out.println("old File: " + oldMap.size());
        System.out.println("new File:" + reader.getLinesRead());

        System.out.println("no of records in old list (present in both old and new) = " + noOfRecordsInOldList);
        System.out.println("checkManually: " + checkManually);
        System.out.println("noOfRecordsInOldList+checkManually = " + (noOfRecordsInOldList + checkManually));
        System.out.println("no of records written = " + noOfRecordsWritten);
        System.out.println();
        System.out.println("inside populateMapFromSplittedCSV() " + insidepopulateMapFromSplittedCSV + "times");

        logWriter.close();
        reader.close();
        writer.close();
    }

    /**
     * Dumps every row of the map to a '|'-separated CSV (debug aid).
     *
     * @param oldMap        id -> full row
     * @param testFilePath2 absolute path of the output file
     */
    private static void printMapToCSV(Map<String, List<String>> oldMap, String testFilePath2) throws IOException {
        int i = 0;
        CSVWriter writer = new CSVWriter(new FileWriter(new File(testFilePath2)), '|');
        for (List<String> row : oldMap.values()) {
            writer.writeNext(row.toArray(new String[0]));
            writer.flush();
            i++;
        }
        writer.close();
        System.out.println("[hello from line 210 ( inside printMapToCSV() ) of ur code] wrote " + i + " lines");
    }

    /**
     * Reads every ','-separated CSV file in {@code oldDir} into {@code oldMap}
     * (id -> full row).
     *
     * @return a fresh copy of the populated map
     * @throws IOException when {@code oldDir} is not a readable directory
     */
    private static Map<String, List<String>> populateMapFromSplittedCSV(Map<String, List<String>> oldMap, File oldDir)
            throws IOException {

        File defective = new File(dliRootPath + "defectiveOldFiles.csv");
        defective.createNewFile();
        CSVWriter defectWriter = new CSVWriter(new FileWriter(defective));

        // listFiles() returns null for a non-directory / unreadable path — fail fast
        // instead of NPE-ing in the for loop.
        File[] oldFiles = oldDir.listFiles();
        if (oldFiles == null) {
            defectWriter.close();
            throw new IOException("not a readable directory: " + oldDir);
        }
        for (File oldFile : oldFiles) {
            insidepopulateMapFromSplittedCSV++;
            CSVReader reader = new CSVReader(new FileReader(oldFile), ',', '"');
            oldMap = populateMapFromCSV(oldMap, reader, defectWriter);
            System.out.println(oldMap.size());
            reader.close();
        }
        defectWriter.close();
        System.out.println("inside populateMapFromSplittedCSV() " + insidepopulateMapFromSplittedCSV + "times");
        return new HashMap<String, List<String>>(oldMap);
    }

    /**
     * Maps each requested field heading to its column index in {@code headingRow}.
     * Fields that do not occur in the heading row are simply absent from the result.
     */
    private static Map<String, Integer> getColumnNo(List<String> fieldsToBeMatched, String[] headingRow) {
        Map<String, Integer> fieldAndColumnNo = new HashMap<>();
        for (String field : fieldsToBeMatched) {
            for (int i = 0; i < headingRow.length; i++) {
                if (areEqual(field, headingRow[i])) {
                    fieldAndColumnNo.put(field, Integer.valueOf(i));
                    break;
                }
            }
        }
        return fieldAndColumnNo;
    }

    /**
     * Adds every record of {@code oldReader} to {@code oldMap} keyed by its first
     * cell (the id). Records whose id is already present are reported as duplicates
     * when any shared column differs.
     *
     * @return a fresh copy of the populated map
     */
    private static Map<String, List<String>> populateMapFromCSV(Map<String, List<String>> oldMap, CSVReader oldReader,
            CSVWriter defectWriter) throws IOException {
        // BUG FIX: the old guard `oldReader.getLinesRead() > 1` was always false for a
        // freshly opened reader, so headingRow stayed null and the duplicate branch
        // NPE'd. Fetch the heading (stored under key "id" once the first file has been
        // read) unconditionally and null-check where it is used.
        List<String> headingRow = oldMap.get("id");

        String[] thisRow;
        int insideWhile = 0, addedInMap = 0, doesNotContainKey = 0, containsKey = 0;
        while ((thisRow = oldReader.readNext()) != null) {

            insideWhile++;
            if (!oldMap.containsKey(thisRow[0])) {
                doesNotContainKey++;
                if (oldMap.put(thisRow[0], Arrays.asList(thisRow)) == null) {
                    addedInMap++;
                }
            } else {
                List<String> twinRow = oldMap.get(thisRow[0]);
                // Compare only columns that exist in every row involved, so ragged
                // rows cannot cause index-out-of-bounds.
                int cols = Math.min(twinRow.size(), thisRow.length);
                if (headingRow != null) {
                    cols = Math.min(cols, headingRow.size());
                }
                for (int m = 0; m < cols; m++) {
                    String currFieldTwin = twinRow.get(m);
                    String currField = thisRow[m];
                    if (!areEqual(currField, currFieldTwin)) {
                        System.err.println("do something!!!!!!  DUPLICATE ID in old file");
                        containsKey++;
                        // BUG FIX: open the log in append mode — the old code
                        // truncated the whole log file on every duplicate record.
                        FileWriter logWriter = new FileWriter(new File(dliRootPath + log), true);
                        System.err.println("[Skipped record] in old file. Row no: " + oldReader.getLinesRead()
                                + "\nRecord: " + Arrays.toString(thisRow));
                        logWriter.append("[Skipped record] in old file. Row no: " + oldReader.getLinesRead()
                                + "\nRecord: " + Arrays.toString(thisRow));
                        logWriter.close();
                        break;
                    }
                }
            }
        }
        System.out.println("inside while:      " + insideWhile);
        System.out.println("oldMap size =      " + oldMap.size());
        System.out.println("addedInMap:        " + addedInMap);
        System.out.println("doesNotContainKey: " + doesNotContainKey);
        System.out.println("containsKey:       " + containsKey);

        return new HashMap<String, List<String>>(oldMap);
    }

    /** Null-safe, whitespace-trimmed equality (the old version NPE'd on null). */
    private static boolean areEqual(String field, String heading) {
        if (field == null || heading == null) {
            return field == heading; // equal only if both are null
        }
        return field.trim().equals(heading.trim());
    }

    /**
     * Returns the first missing/empty/duplicate ID, OR the string "unique", OR
     * (rarely) "totalLinesInCSV != totaluniqueIDs".
     *
     * @param inpCSV          path of the '|'-separated CSV to scan
     * @param totalLinesInCSV expected number of lines (count externally, e.g. in an editor)
     */
    private static String areIDsunique(String inpCSV, int totalLinesInCSV) throws IOException {
        // BUG FIX: use the inpCSV argument — the old code ignored it and always read
        // dliRootPath + dli.
        CSVReader reader = new CSVReader(new FileReader(new File(inpCSV)), '|');
        List<String[]> allRows = new ArrayList<>(reader.readAll());
        reader.close();
        Set<String> id = new HashSet<>();
        for (String[] thisRow : allRows) {
            // BUG FIX: the condition was inverted (`!= null || !isEmpty() ||
            // id.add(...)`), so it returned the very first id on every call. A row is
            // defective when its id is null, empty, or already seen (add() == false).
            if (thisRow[0] == null || thisRow[0].isEmpty() || !id.add(thisRow[0])) {
                return thisRow[0];
            }
        }
        if (id.size() == totalLinesInCSV) {
            return "unique";
        }
        return "totalLinesInCSV != totaluniqueIDs";
    }

    /**
     * Writes the first 20 records of the input CSV into the output file.
     *
     * <p>NOTE(review): both parameters are currently ignored — the method reads
     * {@code dliRootPath + dli} and writes {@code dliRootPath + newFile}.
     */
    public static void _readSample(String input, String output) throws IOException {
        File opFile = new File(dliRootPath + newFile);
        opFile.createNewFile();
        CSVWriter writer = new CSVWriter(new FileWriter(opFile));

        CSVReader reader = new CSVReader(new FileReader(new File(dliRootPath + dli)), '|');
        for (int i = 0; i < 20; i++) {
            String[] row = reader.readNext();
            if (row == null) { // fewer than 20 records — stop instead of NPE-ing
                break;
            }
            writer.writeNext(row);
        }
        reader.close();
        writer.flush();
        writer.close();
    }

}

2 个答案:

答案 0 :(得分:0)

CSV单元格内未转义的引号可能会弄乱您的整个数据。如果您手动创建了正在使用的数据,则可能会在CSV中发生这种情况。下面是我为这种情况写了一段时间的函数。如果这不是分享它的正确位置,请告诉我。

/**
 * removes quotes inside a cell/column puts curated data in
 * "../CuratedFiles"
 * 
 * @param curateDir
 * @param del Csv column delimiter
 * @throws IOException
 */
/**
 * Removes stray (unescaped) quotes inside a cell/column and puts the curated
 * copies of all files in "../CuratedFiles". The heading line of each file is
 * copied through unchanged.
 *
 * <p>BUG FIXES vs. the original: lines that contain no {@code "} + delimiter
 * boundary used to crash with StringIndexOutOfBoundsException (substring(0, -1));
 * the log writer was re-created (truncating CurationLogs.txt) once per input file
 * and never closed; an empty input file used to write the literal text "null" as
 * its heading.
 *
 * @param curateDir directory whose files are to be curated
 * @param del       CSV column delimiter
 * @throws IOException on any read/write failure
 */
public static void curateCsvRowQuotes(File curateDir, String del) throws IOException {
    File parent = curateDir.getParentFile();
    File curatedDir = new File(parent.getAbsolutePath() + "/CuratedFiles");
    curatedDir.mkdir();

    // One log file for the whole run (was re-created and truncated per input file).
    File logFile = new File(curatedDir.getAbsolutePath() + "/CurationLogs.txt");
    logFile.createNewFile();
    BufferedWriter logWriter = new BufferedWriter(new FileWriter(logFile));

    for (File file : curateDir.listFiles()) {
        BufferedReader bufRead = new BufferedReader(new FileReader(file));

        // output
        File fOp = new File(curatedDir.getAbsolutePath() + "/" + file.getName());
        fOp.createNewFile();
        BufferedWriter bufW = new BufferedWriter(new FileWriter(fOp));

        // heading — guard against an empty file (readLine() == null)
        String heading = bufRead.readLine();
        if (heading != null) {
            bufW.append(heading + "\n");
        }

        String thisLine = null;
        int lineCount = 0;
        while ((thisLine = bufRead.readLine()) != null) {

            int endIndex = thisLine.indexOf("\"" + del);
            if (endIndex == -1) {
                // No quoted-cell boundary at all: pass the line through untouched
                // (the original crashed here on substring(0, -1)).
                bufW.append(thisLine + "\n");
                bufW.flush();
                lineCount++;
                continue;
            }

            String opLine = "";
            String str = thisLine.substring(0, endIndex);
            opLine += str + "\"" + del;
            while (endIndex != (-1)) {
                // leave out first " in a cell
                int tempIndex = thisLine.indexOf("\"" + del, endIndex + 2);
                if (tempIndex == (-1)) {
                    break;
                }
                str = thisLine.substring(endIndex + 2, tempIndex);
                int indexOfQuote = str.indexOf("\"");
                opLine += str.substring(0, indexOfQuote + 1);

                // remove all " inside the cell body
                str = str.substring(indexOfQuote + 1);
                str = str.replace("\"", "");
                opLine += str + "\"" + del;
                endIndex = thisLine.indexOf("\"" + del, endIndex + 2);
            }
            // trailing empty cells (a run of delimiters) after the last quoted cell
            str = thisLine.substring(thisLine.lastIndexOf("\"" + del) + 2);
            if ((str != null) && str.matches("[" + del + "]+")) {
                opLine += str;
            }

            System.out.println(opLine);
            bufW.append(opLine + "\n");
            bufW.flush();
            lineCount++;

        }
        System.out.println(lineCount + " no of lines  in " + file.getName());
        bufRead.close();
        bufW.close();
    }
    logWriter.close();
}

答案 1 :(得分:0)

就我而言,我在readNext()之前使用过csvReader.readAll()。

例如:

List<String[]> myData = csvReader.readAll();

所以我的csvReader.readNext()返回null。由于myData已经读取了所有值。

请注意使用readNext()和readAll()函数。