My requirement is to split a large file (containing millions of records) into 100K-record files and distribute them across 3 folders. I use Java to split the file and Spring Integration to distribute the pieces. In this process, distributing the files to the folders takes a long time. The configuration file is below. Is a Spring Integration bridge the right tool for this purpose, or is there a more efficient way? I also need to extend this code in the future to read and write files to an S3 bucket instead of local directories. Do the S3 inbound/outbound adapters support the same purpose?
<?xml version="1.0" encoding="UTF-8"?>
<beans xmlns="http://www.springframework.org/schema/beans"
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xmlns:integration="http://www.springframework.org/schema/integration"
    xmlns:file="http://www.springframework.org/schema/integration/file"
    xsi:schemaLocation="http://www.springframework.org/schema/beans
        http://www.springframework.org/schema/beans/spring-beans.xsd
        http://www.springframework.org/schema/integration
        http://www.springframework.org/schema/integration/spring-integration.xsd
        http://www.springframework.org/schema/integration/file
        http://www.springframework.org/schema/integration/file/spring-integration-file.xsd">

    <!-- Poll /tmp/test for the large input file and pass it to the splitter. -->
    <file:inbound-channel-adapter id="filesIn1" directory="/tmp/test">
        <integration:poller id="poller" fixed-delay="5000" />
    </file:inbound-channel-adapter>
    <file:outbound-channel-adapter id="filesOut1"
        directory="/tmp/output" delete-source-files="true" />
    <integration:service-activator
        input-channel="filesIn1" output-channel="filesOut1" ref="handler" />
    <bean id="handler" class="com.test.Handler" />

    <!-- Move the stack1* pieces from /tmp/output to /tmp/stack1. -->
    <file:inbound-channel-adapter id="filesIn2"
        directory="/tmp/output" filename-pattern="stack1*">
        <integration:poller id="poller1" fixed-delay="5000" />
    </file:inbound-channel-adapter>
    <file:outbound-channel-adapter id="filesOut2"
        directory="/tmp/stack1" delete-source-files="true" />
    <integration:bridge input-channel="filesIn2" output-channel="filesOut2" />

    <!-- Move the stack2* pieces from /tmp/output to /tmp/stack2. -->
    <file:inbound-channel-adapter id="filesIn3"
        directory="/tmp/output" filename-pattern="stack2*">
        <integration:poller id="poller2" fixed-delay="5000" />
    </file:inbound-channel-adapter>
    <file:outbound-channel-adapter id="filesOut3"
        directory="/tmp/stack2" delete-source-files="true" />
    <integration:bridge input-channel="filesIn3" output-channel="filesOut3" />

    <!-- Move the stack3* pieces from /tmp/output to /tmp/stack3. -->
    <file:inbound-channel-adapter id="filesIn4"
        directory="/tmp/output" filename-pattern="stack3*">
        <integration:poller id="poller3" fixed-delay="5000" />
    </file:inbound-channel-adapter>
    <file:outbound-channel-adapter id="filesOut4"
        directory="/tmp/stack3" delete-source-files="true" />
    <integration:bridge input-channel="filesIn4" output-channel="filesOut4" />
</beans>
Handler.java
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.InputStreamReader;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;

public class Handler {

    public void handleFile(File input) {
        System.out.println("Copying file: " + input.getAbsolutePath());
        try {
            Path p = Paths.get(input.getAbsolutePath());
            List<String> lines = Files.readAllLines(p);
            int count = lines.size();
            System.out.println("Lines in the file: " + count);

            long nol = 100000L; // number of lines to be saved in each split file
            int stackcount = 0;
            // Number of files to generate, rounding up for a partial last batch.
            long nof = (count % nol == 0) ? (count / nol) : (count / nol) + 1;
            System.out.println("No. of files to be generated: " + nof);

            // Actual splitting of the file into smaller files.
            FileInputStream fstream = new FileInputStream(input.getAbsolutePath());
            BufferedReader br = new BufferedReader(new InputStreamReader(fstream));
            String strLine;
            for (int j = 1; j <= nof; j++) {
                // Cycle the destination stack: 1, 2, 3, 1, 2, 3, ...
                if (stackcount < 3) {
                    stackcount = stackcount + 1;
                } else {
                    stackcount = 1;
                }
                // Destination file location.
                FileWriter fstream1 = new FileWriter("/tmp/output/stack" + stackcount + "-" + j + ".dat");
                BufferedWriter out = new BufferedWriter(fstream1);
                for (int i = 1; i <= nol; i++) {
                    strLine = br.readLine();
                    if (strLine != null) {
                        out.write(strLine);
                        if (i != nol) {
                            out.newLine();
                        }
                    }
                }
                out.close();
            }
            br.close();
        } catch (Exception e) {
            System.err.println("Error: " + e.getMessage());
        }
    }
}
Answer 0 (score: 0)
As Andremoniy mentioned in the comments, using Files.readAllLines is not what you really want here; presumably, your memory cannot hold that many lines at once. Instead, why not try this:
try (Stream<String> allLines = Files.lines(path)) {
    Iterator<String> reader = allLines.iterator();
    int splitBatch = 10000; // however many lines you need per file
    int lineCount = 0;
    int batchNumber = 1;
    FileWriter out = getBatchOut(batchNumber); // replace this with what you need
    while (reader.hasNext()) {
        // Start a new batch file once the current one is full.
        if (lineCount == splitBatch) {
            out.flush();
            out.close();
            out = getBatchOut(++batchNumber); // next batch
            lineCount = 0;
        }
        out.write(reader.next());
        out.write(System.lineSeparator()); // keep line breaks between records
        lineCount++;
    }
    out.close(); // close the last batch
}
Note that I have not included any exception handling in my sample code. You should always remember to release all resources when an exception occurs; in my example, the out writer must always be closed so that you do not leak resources. I will leave it to you to do that correctly.
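For instance, a minimal sketch of that cleanup, reusing the variables and the hypothetical getBatchOut factory from the snippet above, with a try/finally so the current writer is released even if a write fails:

FileWriter out = getBatchOut(batchNumber); // hypothetical factory, as above
try {
    while (reader.hasNext()) {
        if (lineCount == splitBatch) {
            out.flush();
            out.close();
            out = getBatchOut(++batchNumber);
            lineCount = 0;
        }
        out.write(reader.next());
        out.write(System.lineSeparator());
        lineCount++;
    }
} finally {
    out.close(); // runs even if a write throws, so the handle is released
}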
Answer 1 (score: 0)
Is a Spring Integration bridge the right tool for this purpose, or is there a more efficient way?
You can go that way; it's OK. However, you don't need the <bridge> at all if you switch to the channel attribute on both the inbound and outbound channel adapters, as sketched below.
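For example, a sketch of the stack1 flow with direct channel wiring; the toStack1 channel name is just illustrative:

<!-- Both adapters point at the same channel, so no <bridge> is needed. -->
<integration:channel id="toStack1" />
<file:inbound-channel-adapter id="filesIn2" channel="toStack1"
        directory="/tmp/output" filename-pattern="stack1*">
    <integration:poller id="poller1" fixed-delay="5000" />
</file:inbound-channel-adapter>
<file:outbound-channel-adapter id="filesOut2" channel="toStack1"
        directory="/tmp/stack1" delete-source-files="true" />

Each adapter pair that shares a channel behaves like the bridged version, minus one hop.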
Do the S3 inbound/outbound adapters support the same purpose?
Correct. They provide similar functionality, but for the AWS S3 protocol.
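For illustration, a minimal Java-config sketch of the outbound side, assuming the spring-integration-aws module and an AmazonS3 client bean are available; the toS3 channel and my-bucket names are illustrative, and the exact API can vary by version:

import com.amazonaws.services.s3.AmazonS3;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.integration.annotation.ServiceActivator;
import org.springframework.integration.aws.outbound.S3MessageHandler;
import org.springframework.messaging.MessageHandler;

@Configuration
public class S3FlowConfig {

    // Sketch only: replaces a local <file:outbound-channel-adapter> with an
    // S3 upload. Messages arriving on "toS3" are written to "my-bucket".
    @Bean
    @ServiceActivator(inputChannel = "toS3")
    public MessageHandler s3OutboundAdapter(AmazonS3 amazonS3) {
        return new S3MessageHandler(amazonS3, "my-bucket");
    }
}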
Note: your configuration isn't clear and doesn't look related to the question...