假设我将一个 2590400 KB(约 2.5 GB)的文件拆分为 30 个部分。
这会生成 30 个文件,每个文件大小为 86347 KB,这似乎是正确的,因为 2590400 / 30 = 86346.66666667。
现在,如果我把全部 30 个部分重新合并,会生成一个 3453873 KB 的文件,而它本应是 2590410 KB。
有人能帮我解释为什么会出现这种差异吗?我使用下面的代码来拆分和合并文件。
SplitFile.java
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.RandomAccessFile;
import java.io.UncheckedIOException;
/**
* @author vishal.zanzrukia
*
*/
public class SplitFile {
public static final String INPUT_FILE = "D:\\me\\projects\\input\\file\\path.txt";
public static final int NUMBER_OF_OUTPUT_FILES = 30;
public static final String FILE_SUFFIX = ".txt";
/**
* split file
*
* @throws Exception
*/
static void splitFile() throws Exception{
File inputFile = new File(INPUT_FILE + "_Splits");
inputFile.mkdir();
RandomAccessFile raf = new RandomAccessFile(INPUT_FILE, "r");
long sourceSize = raf.length();
long bytesPerSplit = sourceSize / NUMBER_OF_OUTPUT_FILES;
long remainingBytes = sourceSize % NUMBER_OF_OUTPUT_FILES;
int maxReadBufferSize = 8 * 1024; // 8KB
for (int destIx = 1; destIx <= NUMBER_OF_OUTPUT_FILES; destIx++) {
BufferedOutputStream bw = new BufferedOutputStream(new FileOutputStream(INPUT_FILE + "_Splits\\split." + destIx + FILE_SUFFIX));
if (bytesPerSplit > maxReadBufferSize) {
long numReads = bytesPerSplit / maxReadBufferSize;
long numRemainingRead = bytesPerSplit % maxReadBufferSize;
for (int i = 0; i < numReads; i++) {
readWrite(raf, bw, maxReadBufferSize);
}
if (numRemainingRead > 0) {
readWrite(raf, bw, numRemainingRead);
}
} else {
readWrite(raf, bw, bytesPerSplit);
}
bw.close();
}
if (remainingBytes > 0) {
BufferedOutputStream bw = new BufferedOutputStream(new FileOutputStream("split." + NUMBER_OF_OUTPUT_FILES + 1));
readWrite(raf, bw, remainingBytes);
bw.close();
}
raf.close();
}
/**
* join file
*
* @throws Exception
*/
static void joinFiles() throws Exception{
int maxReadBufferSize = 8 * 1024;
BufferedOutputStream bw = new BufferedOutputStream(new FileOutputStream(INPUT_FILE + "_Splits\\fullJoin" + FILE_SUFFIX));
File inputFileDir = new File(INPUT_FILE + "_Splits");
RandomAccessFile raf = null;
if(inputFileDir.isDirectory()){
for(File file : inputFileDir.listFiles()){
raf = new RandomAccessFile(file, "r");
long numReads = raf.length() / maxReadBufferSize;
long numRemainingRead = raf.length() % maxReadBufferSize;
for (int i = 0; i < numReads; i++) {
readWrite(raf, bw, maxReadBufferSize);
}
if (numRemainingRead > 0) {
readWrite(raf, bw, numRemainingRead);
}
raf.close();
}
}
bw.close();
}
public static void mergeFiles() {
File[] files = new File[NUMBER_OF_OUTPUT_FILES];
for(int i=1;i<=NUMBER_OF_OUTPUT_FILES;i++){
files[i-1] = new File(INPUT_FILE + "_Splits\\split."+i+FILE_SUFFIX);
}
String mergedFilePath = INPUT_FILE + "_Splits\\fullJoin" + FILE_SUFFIX;
File mergedFile = new File(mergedFilePath);
mergeFiles(files, mergedFile);
}
public static void mergeFiles(File[] files, File mergedFile) {
FileWriter fstream = null;
BufferedWriter out = null;
try {
fstream = new FileWriter(mergedFile, true);
out = new BufferedWriter(fstream);
} catch (IOException e1) {
e1.printStackTrace();
}
for (File f : files) {
System.out.println("merging: " + f.getName());
FileInputStream fis;
try {
fis = new FileInputStream(f);
BufferedReader in = new BufferedReader(new InputStreamReader(fis));
String aLine;
while ((aLine = in.readLine()) != null) {
out.write(aLine);
out.newLine();
}
in.close();
} catch (IOException e) {
e.printStackTrace();
}
}
try {
out.close();
} catch (IOException e) {
e.printStackTrace();
}
}
public static void main(String[] args) throws Exception {
// splitFile();
mergeFiles();
}
static void readWrite(RandomAccessFile raf, BufferedOutputStream bw, long numBytes) throws IOException {
byte[] buf = new byte[(int) numBytes];
int val = raf.read(buf);
if (val != -1) {
bw.write(buf);
}
}
}
答案 0 :(得分:4)
请使用您的 joinFiles
方法:如果您希望文件内容保持原样,就不要用 Reader
逐行读取文件,因为不同平台的行结束符可能不同。
而是使用InputStream
或RandomAccessFile
将其作为二进制文件读取,并使用OutputStream
进行写入。
joinFiles
方法中唯一的问题是它使用了File.listFiles()
,它无法保证文件的返回顺序。
我将您的 mergeFiles()
代码与 joinFiles()
合并,使其能够正常工作(请记住在 main
方法中调用 joinFiles()
而不是 mergeFiles()
)
/**
 * Concatenates the given part files, in array order, into {@code fullJoin.txt} inside the
 * splits directory.
 *
 * <p>Bytes are copied verbatim, and only the bytes each read actually returned are written.
 * Both streams are closed even if the copy fails part-way.
 *
 * @param files the parts to join, in the order they must appear in the result
 * @throws Exception if a part cannot be read or the joined file cannot be written
 */
static void joinFiles(File[] files) throws Exception {
    byte[] buffer = new byte[8 * 1024];
    try (BufferedOutputStream bw = new BufferedOutputStream(new FileOutputStream(INPUT_FILE + "_Splits\\fullJoin"
            + FILE_SUFFIX))) {
        for (File file : files) {
            try (RandomAccessFile raf = new RandomAccessFile(file, "r")) {
                int bytesRead;
                // read() may return fewer bytes than the buffer holds; write exactly that many.
                while ((bytesRead = raf.read(buffer)) != -1) {
                    bw.write(buffer, 0, bytesRead);
                }
            }
        }
    }
}
/**
 * Builds the list of part files {@code split.1} through {@code split.N} in numeric order
 * and joins them.
 *
 * @throws Exception if joining the parts fails
 */
public static void joinFiles() throws Exception {
    File[] parts = new File[NUMBER_OF_OUTPUT_FILES];
    for (int slot = 0; slot < parts.length; slot++) {
        // Part files are numbered from 1, array slots from 0.
        int partNumber = slot + 1;
        String partPath = INPUT_FILE + "_Splits\\split." + partNumber + FILE_SUFFIX;
        parts[slot] = new File(partPath);
    }
    joinFiles(parts);
}
答案 1 :(得分:1)
问题是最后一行代码:
/**
 * Reads one chunk from {@code raf} into a buffer of {@code numBytes} bytes and writes the
 * buffer to {@code bw}.
 *
 * @param raf the file to read from, starting at its current position
 * @param bw the stream to write to
 * @param numBytes the size of the buffer to allocate (narrowed to {@code int})
 * @throws IOException if reading or writing fails
 */
static void readWrite(RandomAccessFile raf, BufferedOutputStream bw, long numBytes) throws IOException {
byte[] buf = new byte[(int) numBytes];
// read() returns how many bytes it actually placed in buf, or -1 at end of file.
int val = raf.read(buf);
if (val != -1) {
// NOTE(review): this writes the whole buffer, not just the first val bytes, so a
// short read pads the output with bytes that were never read from the file.
bw.write(buf);
}
}
写入时,您回写了 numBytes
字节的数据,但 read
函数已经返回了一个有用的值:
读入缓冲区的总字节数,如果由于已到达此文件末尾而没有其他数据,则返回-1。
因此,您的修正方法是使用另一个 write 重载
:
bw.write(buf, 0, val);