在 Java 中按 2 列比较两个已排序的大型 CSV 文件

时间:2014-09-30 09:02:21

标签: java csv

我有两个大型 CSV 文件(每个约 5GB),假设为 FileA 和 FileB。我想以高效的方式按其中 2 列来比较这两个文件。每个文件有 63 列。我希望得到如下所述的几个输出文件:

FileA - FileB = File(Unmatched records in FileA)
FileB - FileA = File(Unmatched records in FileB)
FileA ∩ FileB = File(Matched records in both files)

我采用了下面的方法来解决这个问题,但不确定它是否高效。如果能提供代码片段,将会非常有帮助。

编辑:这是我目前采用的流程。首先,在两个不同的线程中将这两个文件各自拆分成 75 个块。

/**
 * Splits one source file into {@value #NUM_SPLITS} equally sized byte chunks,
 * plus one extra chunk holding the leftover bytes when the file size is not
 * an exact multiple of the chunk count.
 *
 * <p>Chunk {@code i} (1-based) is written to {@code destPath + i}; the
 * leftover chunk, if any, is written to {@code destPath + (NUM_SPLITS + 1)}.
 *
 * <p>NOTE(review): chunks are cut at byte offsets, not at line boundaries, so
 * a CSV record can be split across two consecutive chunks. Consumers that
 * parse chunks line by line need to account for that.
 */
public class SplitFiles implements Runnable {

    /** Number of equally sized chunks to produce; could be taken from user input. */
    private static final long NUM_SPLITS = 75;

    /** Upper bound for the in-memory copy buffer, in bytes (64 KiB). */
    private static final int MAX_READ_BUFFER_SIZE = 64 * 1024;

    private final String sourcePath;
    private final String destPath;

    /**
     * @param sourcePath path of the file to split
     * @param destPath   prefix of the chunk files; the chunk number is appended to it
     */
    SplitFiles(String sourcePath, String destPath) {
        this.sourcePath = sourcePath;
        this.destPath = destPath;
    }

    /**
     * Copies up to {@code numBytes} bytes from the current position of
     * {@code raf} to {@code bw}, stopping early at end of file.
     *
     * @param raf      source file, read from its current file pointer
     * @param bw       destination stream; it is not closed by this method
     * @param numBytes number of bytes to copy; values {@code <= 0} copy nothing
     * @throws IOException if reading or writing fails
     */
    static void readWrite(RandomAccessFile raf, BufferedOutputStream bw,
            long numBytes) throws IOException {
        if (numBytes <= 0) {
            return;
        }
        // Bounded buffer: the requested size may exceed what fits in an int.
        byte[] buf = new byte[(int) Math.min(numBytes, MAX_READ_BUFFER_SIZE)];
        long remaining = numBytes;
        while (remaining > 0) {
            int wanted = (int) Math.min(buf.length, remaining);
            int read = raf.read(buf, 0, wanted);
            if (read == -1) {
                break; // end of file reached before numBytes were available
            }
            // read() may return fewer bytes than requested; write only those.
            bw.write(buf, 0, read);
            remaining -= read;
        }
    }

    /** Splits the source file; I/O failures are reported on standard error. */
    @Override
    public void run() {
        try (RandomAccessFile raf = new RandomAccessFile(sourcePath, "r")) {
            long sourceSize = raf.length();
            long bytesPerSplit = sourceSize / NUM_SPLITS;
            long remainingBytes = sourceSize % NUM_SPLITS;

            for (long destIx = 1; destIx <= NUM_SPLITS; destIx++) {
                copyChunk(raf, destPath + destIx, bytesPerSplit);
            }
            if (remainingBytes > 0) {
                // Parenthesised so the chunk number is computed arithmetically
                // instead of being concatenated digit by digit.
                copyChunk(raf, destPath + (NUM_SPLITS + 1), remainingBytes);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Writes the next {@code numBytes} bytes of {@code raf} to a new file at {@code chunkPath}. */
    private static void copyChunk(RandomAccessFile raf, String chunkPath,
            long numBytes) throws IOException {
        try (BufferedOutputStream out = new BufferedOutputStream(
                new FileOutputStream(chunkPath))) {
            readWrite(raf, out, numBytes);
        }
    }
}
  1. 然后在 75 个线程中分别读取这两个文件对应的分块并进行比较。

    /**
     * Compares one chunk of file A with the same-numbered chunk of file B and
     * reports every pair of rows whose two key columns (indices 11 and 13 of
     * the '|'-separated row) are equal.
     *
     * <p>Chunk B is read once and reduced to a count per key; chunk A is then
     * streamed against that index, so neither chunk is read more than once.
     * Rows that have too few columns to contain both keys are skipped.
     *
     * <p>java.util / java.io names are fully qualified below because this
     * snippet carries no import list that could be extended.
     */
    public class CompareFiles implements Runnable {

        // NOTE(review): path prefixes restored from a garbled listing; confirm
        // they match the prefixes the splitter writes to.
        private static final String SOURCE_A_FILE_NAME = "SplitA/split.";
        private static final String SOURCE_B_FILE_NAME = "SplitB/split.";

        /** Zero-based indices of the two columns that form the comparison key. */
        private static final int FIRST_KEY_COLUMN = 11;
        private static final int SECOND_KEY_COLUMN = 13;

        /** Regex for the literal '|' column separator. */
        private static final String COLUMN_SEPARATOR_REGEX = "\\|";

        /** Number of the chunk pair this task compares. */
        private final int countUntil;

        /**
         * @param countUntil chunk number appended to both file-name prefixes
         */
        CompareFiles(int countUntil) {
            this.countUntil = countUntil;
        }

        /** Runs the comparison; I/O failures are reported on standard error. */
        @Override
        public void run() {
            try {
                java.util.Map<String, Integer> keyCountsInB = indexChunkB();
                try (BufferedReader inA = Files.newBufferedReader(
                        Paths.get(SOURCE_A_FILE_NAME + countUntil),
                        Charset.forName("UTF-8"))) {
                    for (String lineA; (lineA = inA.readLine()) != null;) {
                        String key = compareKey(lineA);
                        if (key == null) {
                            continue;
                        }
                        Integer matches = keyCountsInB.get(key);
                        if (matches == null) {
                            continue;
                        }
                        // One report per matching (row of A, row of B) pair.
                        for (int i = 0; i < matches; i++) {
                            writeMatchedRecords();
                        }
                    }
                }
            } catch (java.io.IOException e) {
                e.printStackTrace();
            }
        }

        /** Reads chunk B once and counts how many rows carry each comparison key. */
        private java.util.Map<String, Integer> indexChunkB() throws java.io.IOException {
            java.util.Map<String, Integer> counts = new java.util.HashMap<String, Integer>();
            try (BufferedReader inB = Files.newBufferedReader(
                    Paths.get(SOURCE_B_FILE_NAME + countUntil),
                    Charset.forName("UTF-8"))) {
                for (String lineB; (lineB = inB.readLine()) != null;) {
                    String key = compareKey(lineB);
                    if (key == null) {
                        continue;
                    }
                    Integer seen = counts.get(key);
                    counts.put(key, seen == null ? 1 : seen + 1);
                }
            }
            return counts;
        }

        /**
         * Builds the comparison key of a row, or returns {@code null} when the
         * row has too few columns to contain both key columns.
         */
        private static String compareKey(String line) {
            // Limit -1 keeps trailing empty columns, so a row ending in empty
            // fields still has its full column count.
            String[] columns = line.split(COLUMN_SEPARATOR_REGEX, -1);
            if (columns.length <= SECOND_KEY_COLUMN) {
                return null;
            }
            // Columns cannot contain '|' (it is the separator), so joining
            // the two key columns with it yields an unambiguous key.
            return columns[FIRST_KEY_COLUMN] + '|' + columns[SECOND_KEY_COLUMN];
        }

        /** Reports one matched pair of rows. */
        public synchronized void writeMatchedRecords() {
            System.out.println("Write files for matched records");
        }
    }

0 个答案:

没有答案