我有2个5GB的大型CSV文件,假设为FileA和FileB。我想根据其中2列,以高效的方式比较这两个文件。每个文件有63列。我希望得到如下所述的几个输出文件:
FileA - FileB = File(Unmatched records in FileA)
FileB - FileA = File(Unmatched records in FileB)
FileA ∩ FileB = File(Matched records in both files)
下面是我目前用来解决这个问题的方法,但我不确定它是否高效。如果能提供代码片段,将会非常有帮助。
编辑:这是我正在关注的流程。首先在两个不同的线程中将这两个文件分成75个块。
/**
 * Splits one source file into consecutive byte chunks written to numbered files.
 *
 * <p>Chunk {@code i} (1-based) is written to {@code destPath + i}. The source size is divided
 * evenly by the number of splits; any leftover bytes go into one extra file numbered
 * {@code numSplits + 1}.
 *
 * <p>NOTE(review): chunks are cut at raw byte offsets with no regard for line terminators, so a
 * chunk boundary can fall in the middle of a text line. Consumers that read the chunks line by
 * line will see a partial line at the end of one chunk and the rest at the start of the next.
 */
public class SplitFiles implements Runnable {

    /** Number of equally sized chunks produced when the caller does not specify one. */
    private static final long DEFAULT_NUM_SPLITS = 75;

    /** Upper bound on the in-memory copy buffer: 64 KiB. */
    private static final int MAX_READ_BUFFER_SIZE = 64 * 1024;

    private final String sourcePath;
    private final String destPath;
    private final long numSplits;

    /**
     * Creates a splitter that produces {@value #DEFAULT_NUM_SPLITS} equally sized chunks.
     *
     * @param sourcePath path of the file to split
     * @param destPath   prefix of the chunk files; the 1-based chunk number is appended to it
     */
    SplitFiles(String sourcePath, String destPath) {
        this(sourcePath, destPath, DEFAULT_NUM_SPLITS);
    }

    /**
     * Creates a splitter that produces {@code numSplits} equally sized chunks.
     *
     * @param sourcePath path of the file to split
     * @param destPath   prefix of the chunk files; the 1-based chunk number is appended to it
     * @param numSplits  number of equally sized chunks; must be positive
     * @throws IllegalArgumentException if {@code numSplits} is not positive
     */
    SplitFiles(String sourcePath, String destPath, long numSplits) {
        if (numSplits <= 0) {
            throw new IllegalArgumentException("numSplits must be positive: " + numSplits);
        }
        this.sourcePath = sourcePath;
        this.destPath = destPath;
        this.numSplits = numSplits;
    }

    /**
     * Copies up to {@code numBytes} bytes from the current position of {@code raf} to {@code bw}.
     *
     * <p>Only the bytes actually returned by each read are written, so a short read never emits
     * stale buffer contents. Copying stops early if the end of the source is reached.
     *
     * @param raf      source, read from its current file pointer
     * @param bw       destination stream; not closed by this method
     * @param numBytes maximum number of bytes to copy; values {@code <= 0} copy nothing
     * @throws IOException if reading or writing fails
     */
    static void readWrite(RandomAccessFile raf, BufferedOutputStream bw,
            long numBytes) throws IOException {
        if (numBytes <= 0) {
            return;
        }
        byte[] buf = new byte[(int) Math.min(numBytes, MAX_READ_BUFFER_SIZE)];
        long remaining = numBytes;
        while (remaining > 0) {
            int wanted = (int) Math.min(remaining, buf.length);
            int read = raf.read(buf, 0, wanted);
            if (read == -1) {
                break; // source ended before numBytes were available
            }
            bw.write(buf, 0, read);
            remaining -= read;
        }
    }

    /**
     * Splits the source file. I/O failures are reported on standard error and end the run;
     * chunk files written before the failure are left in place.
     */
    @Override
    public void run() {
        try (RandomAccessFile raf = new RandomAccessFile(sourcePath, "r")) {
            long sourceSize = raf.length();
            long bytesPerSplit = sourceSize / numSplits;
            long remainingBytes = sourceSize % numSplits;

            for (long destIx = 1; destIx <= numSplits; destIx++) {
                copyChunk(raf, destPath + destIx, bytesPerSplit);
            }
            if (remainingBytes > 0) {
                // Parenthesised on purpose: "prefix" + numSplits + 1 would concatenate the
                // digits (e.g. "...751") instead of producing the next chunk number.
                copyChunk(raf, destPath + (numSplits + 1), remainingBytes);
            }
        } catch (IOException e) {
            System.err.println("Failed to split " + sourcePath);
            e.printStackTrace();
        }
    }

    /** Writes the next {@code numBytes} bytes of {@code raf} to a new file at {@code target}. */
    private static void copyChunk(RandomAccessFile raf, String target, long numBytes)
            throws IOException {
        try (BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(target))) {
            readWrite(raf, out, numBytes);
        }
    }
}
然后用75个线程分别读取两个文件对应的分块并进行比较。
公共类CompareFiles实现Runnable { private static String SOURCE_A_FILE_NAME =“SplitA / split。”; private static String SOURCE_B_FILE_NAME =“SplitB / split。”;
private final int countUntil;
CompareFiles(int countUntil) {
this.countUntil = countUntil;
}
@Override
public void run() {
try (BufferedReader inA = Files.newBufferedReader(
Paths.get(SOURCE_A_FILE_NAME + countUntil),
Charset.forName("UTF-8"))) {
for (String lineA; (lineA = inA.readLine()) != null;) {
try (BufferedReader inB = Files.newBufferedReader(
Paths.get(SOURCE_B_FILE_NAME + countUntil),
Charset.forName("UTF-8"))) {
String[] arrayA = lineA.split("\\|");
for (String lineB; (lineB = inB.readLine()) != null;) {
String[] arrayB = lineB.split("\\|");
if(arrayA[11].equals(arrayB[11]) && arrayA[13].equals(arrayB[13])){
writeMatchedRecords();
}
}
} catch (Exception e) {
e.printStackTrace();
}
}
} catch (Exception e) {
e.printStackTrace();
}
}
public synchronized void writeMatchedRecords(){
System.out.println("Write files for matched records");
}
}