用Java分割和合并大文件(以GB为单位)

时间:2015-07-02 08:24:36

标签: java file io merge split

假设,

  • 我将30个部分中的 2590400 KB (约2.5 GB)文件拆分。

  • 它将生成30个文件,大小为 86347 KB 这似乎是正确的,2590400/30 = 86346.66666667

  • 现在,如果我再次合并所有部分(30),则会生成 3453873 KB 文件,该文件应为 2590410 KB

任何人都可以帮助我为什么会出现这种差异?我在下面的代码中使用合并和拆分文件。

SplitFile.java

import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.RandomAccessFile;

/**
 * @author vishal.zanzrukia
 * 
 */
public class SplitFile {

    public static final String INPUT_FILE = "D:\\me\\projects\\input\\file\\path.txt";
    public static final int NUMBER_OF_OUTPUT_FILES = 30;
    public static final String FILE_SUFFIX = ".txt";

    /**
     * split file
     * 
     * @throws Exception
     */
    static void splitFile() throws Exception{

        File inputFile = new File(INPUT_FILE + "_Splits");
        inputFile.mkdir();

        RandomAccessFile raf = new RandomAccessFile(INPUT_FILE, "r");

        long sourceSize = raf.length();
        long bytesPerSplit = sourceSize / NUMBER_OF_OUTPUT_FILES;
        long remainingBytes = sourceSize % NUMBER_OF_OUTPUT_FILES;

        int maxReadBufferSize = 8 * 1024; // 8KB
        for (int destIx = 1; destIx <= NUMBER_OF_OUTPUT_FILES; destIx++) {
            BufferedOutputStream bw = new BufferedOutputStream(new FileOutputStream(INPUT_FILE + "_Splits\\split." + destIx + FILE_SUFFIX));
            if (bytesPerSplit > maxReadBufferSize) {
                long numReads = bytesPerSplit / maxReadBufferSize;
                long numRemainingRead = bytesPerSplit % maxReadBufferSize;
                for (int i = 0; i < numReads; i++) {
                    readWrite(raf, bw, maxReadBufferSize);
                }
                if (numRemainingRead > 0) {
                    readWrite(raf, bw, numRemainingRead);
                }
            } else {
                readWrite(raf, bw, bytesPerSplit);
            }
            bw.close();
        }
        if (remainingBytes > 0) {
            BufferedOutputStream bw = new BufferedOutputStream(new FileOutputStream("split." + NUMBER_OF_OUTPUT_FILES + 1));
            readWrite(raf, bw, remainingBytes);
            bw.close();
        }
        raf.close();
    }

    /**
     * join file
     * 
     * @throws Exception
     */
    static void joinFiles() throws Exception{
        int maxReadBufferSize = 8 * 1024; 

        BufferedOutputStream bw = new BufferedOutputStream(new FileOutputStream(INPUT_FILE + "_Splits\\fullJoin" + FILE_SUFFIX));
        File inputFileDir = new File(INPUT_FILE + "_Splits");
        RandomAccessFile raf = null;
        if(inputFileDir.isDirectory()){
            for(File file : inputFileDir.listFiles()){
                raf = new RandomAccessFile(file, "r");
                long numReads = raf.length() / maxReadBufferSize;
                long numRemainingRead = raf.length()  % maxReadBufferSize;
                for (int i = 0; i < numReads; i++) {
                    readWrite(raf, bw, maxReadBufferSize);
                }
                if (numRemainingRead > 0) {
                    readWrite(raf, bw, numRemainingRead);
                }
                raf.close();
            }
        }
        bw.close();
    }

    public static void mergeFiles() {

        File[] files = new File[NUMBER_OF_OUTPUT_FILES];
        for(int i=1;i<=NUMBER_OF_OUTPUT_FILES;i++){
            files[i-1] = new File(INPUT_FILE + "_Splits\\split."+i+FILE_SUFFIX);
        }

        String mergedFilePath = INPUT_FILE + "_Splits\\fullJoin" + FILE_SUFFIX;


        File mergedFile = new File(mergedFilePath);

        mergeFiles(files, mergedFile);
    }

    public static void mergeFiles(File[] files, File mergedFile) {

        FileWriter fstream = null;
        BufferedWriter out = null;
        try {
            fstream = new FileWriter(mergedFile, true);
             out = new BufferedWriter(fstream);
        } catch (IOException e1) {
            e1.printStackTrace();
        }

        for (File f : files) {
            System.out.println("merging: " + f.getName());
            FileInputStream fis;
            try {
                fis = new FileInputStream(f);
                BufferedReader in = new BufferedReader(new InputStreamReader(fis));

                String aLine;
                while ((aLine = in.readLine()) != null) {
                    out.write(aLine);
                    out.newLine();
                }

                in.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

        try {
            out.close();
        } catch (IOException e) {
            e.printStackTrace();
        }

    }

    public static void main(String[] args) throws Exception {
//      splitFile();
        mergeFiles();
    }

    static void readWrite(RandomAccessFile raf, BufferedOutputStream bw, long numBytes) throws IOException {
        byte[] buf = new byte[(int) numBytes];
        int val = raf.read(buf);
        if (val != -1) {
            bw.write(buf);
        }
    }
}

2 个答案:

答案 0 :(得分:4)

使用您的joinFiles方法:如果您希望保持原样,请不要尝试使用Reader逐行读取文件,因为行结尾可能会有所不同平台。

而是使用InputStreamRandomAccessFile将其作为二进制文件读取,并使用OutputStream进行写入。

joinFiles方法中唯一的问题是它使用了File.listFiles(),它无法保证文件的返回顺序。

我将您的mergeFiles()代码与joinFiles()合并以完成此工作(请记住从joinFiles()方法调用mergeFiles()而不是main

static void joinFiles(File[] files) throws Exception {
    int maxReadBufferSize = 8 * 1024;

    BufferedOutputStream bw = new BufferedOutputStream(new FileOutputStream(INPUT_FILE + "_Splits\\fullJoin"
            + FILE_SUFFIX));

    RandomAccessFile raf = null;
    for (File file : files) {
        raf = new RandomAccessFile(file, "r");
        long numReads = raf.length() / maxReadBufferSize;
        long numRemainingRead = raf.length() % maxReadBufferSize;
        for (int i = 0; i < numReads; i++) {
            readWrite(raf, bw, maxReadBufferSize);
        }
        if (numRemainingRead > 0) {
            readWrite(raf, bw, numRemainingRead);
        }
        raf.close();

    }
    bw.close();
}

public static void joinFiles() throws Exception {

    File[] files = new File[NUMBER_OF_OUTPUT_FILES];
    for (int i = 1; i <= NUMBER_OF_OUTPUT_FILES; i++) {
        files[i - 1] = new File(INPUT_FILE + "_Splits\\split." + i + FILE_SUFFIX);
    }

    joinFiles(files);
}

答案 1 :(得分:1)

问题是最后一行代码:

static void readWrite(RandomAccessFile raf, BufferedOutputStream bw, long numBytes) throws IOException {
    byte[] buf = new byte[(int) numBytes];
    int val = raf.read(buf);
    if (val != -1) {
        bw.write(buf);
    }
}

写信时,您回写numBytes数据,但read函数有usefully returned

  

读入缓冲区的总字节数,如果由于已到达此文件末尾而没有其他数据,则返回-1。

因此,您的修正是use a different write

bw.write(buf, 0 val);