My requirement is to split a large file (containing millions of records) into 100K-record files and distribute them across 3 folders. I use Java to split the file and Spring Integration to distribute the pieces. In this process, distributing the files to the folders takes a long time. The configuration file is below. Is a Spring Integration bridge the right tool for this purpose, or is there a more efficient way? I also need to extend this code in the future to read and write files to an S3 bucket instead of local directories. Do the S3 inbound/outbound adapters support the same purpose?
<?xml version="1.0" encoding="UTF-8"?>
<beans xmlns="http://www.springframework.org/schema/beans"
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xmlns:integration="http://www.springframework.org/schema/integration"
    xmlns:file="http://www.springframework.org/schema/integration/file"
    xsi:schemaLocation="http://www.springframework.org/schema/beans
        http://www.springframework.org/schema/beans/spring-beans.xsd
        http://www.springframework.org/schema/integration
        http://www.springframework.org/schema/integration/spring-integration.xsd
        http://www.springframework.org/schema/integration/file
        http://www.springframework.org/schema/integration/file/spring-integration-file.xsd">

    <!-- Poll /tmp/test for the large input file and pass it to the splitter. -->
    <file:inbound-channel-adapter id="filesIn1" directory="/tmp/test">
        <integration:poller id="poller" fixed-delay="5000" />
    </file:inbound-channel-adapter>
    <file:outbound-channel-adapter id="filesOut1"
        directory="/tmp/output" delete-source-files="true" />
    <integration:service-activator
        input-channel="filesIn1" output-channel="filesOut1" ref="handler" />
    <bean id="handler" class="com.test.Handler" />

    <!-- Move the stack1* pieces from /tmp/output to /tmp/stack1. -->
    <file:inbound-channel-adapter id="filesIn2"
        directory="/tmp/output" filename-pattern="stack1*">
        <integration:poller id="poller1" fixed-delay="5000" />
    </file:inbound-channel-adapter>
    <file:outbound-channel-adapter id="filesOut2"
        directory="/tmp/stack1" delete-source-files="true" />
    <integration:bridge input-channel="filesIn2" output-channel="filesOut2" />

    <!-- Move the stack2* pieces from /tmp/output to /tmp/stack2. -->
    <file:inbound-channel-adapter id="filesIn3"
        directory="/tmp/output" filename-pattern="stack2*">
        <integration:poller id="poller2" fixed-delay="5000" />
    </file:inbound-channel-adapter>
    <file:outbound-channel-adapter id="filesOut3"
        directory="/tmp/stack2" delete-source-files="true" />
    <integration:bridge input-channel="filesIn3" output-channel="filesOut3" />

    <!-- Move the stack3* pieces from /tmp/output to /tmp/stack3. -->
    <file:inbound-channel-adapter id="filesIn4"
        directory="/tmp/output" filename-pattern="stack3*">
        <integration:poller id="poller3" fixed-delay="5000" />
    </file:inbound-channel-adapter>
    <file:outbound-channel-adapter id="filesOut4"
        directory="/tmp/stack3" delete-source-files="true" />
    <integration:bridge input-channel="filesIn4" output-channel="filesOut4" />
</beans>
Handler.java
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.InputStreamReader;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;

public class Handler {

    public void handleFile(File input) {
        System.out.println("Copying file: " + input.getAbsolutePath());
        try {
            Path p = Paths.get(input.getAbsolutePath());
            List<String> lines = Files.readAllLines(p);
            int count = lines.size();
            System.out.println("Lines in the file: " + count);

            long nol = 100000L; // number of lines to be saved in each split file
            int stackcount = 0;
            // Number of files to generate, rounding up for a partial last batch.
            long nof = (count % nol == 0) ? (count / nol) : (count / nol) + 1;
            System.out.println("No. of files to be generated: " + nof);

            // Actual splitting of the file into smaller files.
            FileInputStream fstream = new FileInputStream(input.getAbsolutePath());
            BufferedReader br = new BufferedReader(new InputStreamReader(fstream));
            String strLine;
            for (int j = 1; j <= nof; j++) {
                // Cycle the destination stack: 1, 2, 3, 1, 2, 3, ...
                if (stackcount < 3) {
                    stackcount = stackcount + 1;
                } else {
                    stackcount = 1;
                }
                // Destination file location.
                FileWriter fstream1 = new FileWriter("/tmp/output/stack" + stackcount + "-" + j + ".dat");
                BufferedWriter out = new BufferedWriter(fstream1);
                for (int i = 1; i <= nol; i++) {
                    strLine = br.readLine();
                    if (strLine != null) {
                        out.write(strLine);
                        if (i != nol) {
                            out.newLine();
                        }
                    }
                }
                out.close();
            }
            br.close();
        } catch (Exception e) {
            System.err.println("Error: " + e.getMessage());
        }
    }
}
Answer 0 (score: 0)
As Andremoniy mentioned in the comments, using Files.readAllLines is not what you really want here; presumably, your memory cannot hold that many lines at once. Instead, why not try this:
try (Stream<String> allLines = Files.lines(path)) {
    Iterator<String> reader = allLines.iterator();
    int splitBatch = 10000; // however many lines you need per file
    int lineCount = 0;
    int batchNumber = 1;
    FileWriter out = getBatchOut(batchNumber); // replace this with what you need
    while (reader.hasNext()) {
        // Start a new batch file once the current one is full.
        if (lineCount == splitBatch) {
            out.flush();
            out.close();
            out = getBatchOut(++batchNumber); // next batch
            lineCount = 0;
        }
        out.write(reader.next());
        out.write(System.lineSeparator()); // keep line breaks between records
        lineCount++;
    }
    out.close(); // close the last batch
}
Note that I have not included any exception handling in my sample code. You should always remember to release all resources when an exception occurs; in my example, the out writer must always be closed so that you do not leak resources. I will leave it to you to do that correctly.
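For instance, a minimal sketch of that cleanup, reusing the variables and the hypothetical getBatchOut factory from the snippet above, with a try/finally so the current writer is released even if a write fails:

FileWriter out = getBatchOut(batchNumber); // hypothetical factory, as above
try {
    while (reader.hasNext()) {
        if (lineCount == splitBatch) {
            out.flush();
            out.close();
            out = getBatchOut(++batchNumber);
            lineCount = 0;
        }
        out.write(reader.next());
        out.write(System.lineSeparator());
        lineCount++;
    }
} finally {
    out.close(); // runs even if a write throws, so the handle is released
}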
Answer 1 (score: 0)
Is a Spring Integration bridge the right tool for this purpose, or is there a more efficient way?
You can go that way; it's OK. However, you don't need the <bridge> at all if you switch to the channel attribute on both the inbound and outbound channel adapters, as sketched below.
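For example, a sketch of the stack1 flow with direct channel wiring; the toStack1 channel name is just illustrative:

<!-- Both adapters point at the same channel, so no <bridge> is needed. -->
<integration:channel id="toStack1" />
<file:inbound-channel-adapter id="filesIn2" channel="toStack1"
        directory="/tmp/output" filename-pattern="stack1*">
    <integration:poller id="poller1" fixed-delay="5000" />
</file:inbound-channel-adapter>
<file:outbound-channel-adapter id="filesOut2" channel="toStack1"
        directory="/tmp/stack1" delete-source-files="true" />

Each adapter pair that shares a channel behaves like the bridged version, minus one hop.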
Do the S3 inbound/outbound adapters support the same purpose?
Correct. They provide similar functionality, but for the AWS S3 protocol.
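For illustration, a minimal Java-config sketch of the outbound side, assuming the spring-integration-aws module and an AmazonS3 client bean are available; the toS3 channel and my-bucket names are illustrative, and the exact API can vary by version:

import com.amazonaws.services.s3.AmazonS3;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.integration.annotation.ServiceActivator;
import org.springframework.integration.aws.outbound.S3MessageHandler;
import org.springframework.messaging.MessageHandler;

@Configuration
public class S3FlowConfig {

    // Sketch only: replaces a local <file:outbound-channel-adapter> with an
    // S3 upload. Messages arriving on "toS3" are written to "my-bucket".
    @Bean
    @ServiceActivator(inputChannel = "toS3")
    public MessageHandler s3OutboundAdapter(AmazonS3 amazonS3) {
        return new S3MessageHandler(amazonS3, "my-bucket");
    }
}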
Note: your configuration isn't clear and doesn't look related to the question...