使用Java 8流分块文本文件

时间:2016-11-18 06:31:39

标签: java-8 java-stream

我正在尝试对一个文本文件(比如说一个日志文件)进行分块,一次只选取一定数量的行进行处理(即把日志文件拆分成较小的块)。我是用命令式的方式编写这段代码的:

package utils;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.function.Consumer;

public class FileUtils {

    public static void main(String[] args) {
        readFileInChunks("D:\\demo.txt", 10000, System.out::println);
    }

    /**
     * Reads the file at {@code filePath} and hands it to {@code processor} in
     * chunks of at most {@code chunkSize} lines. Every chunk after the first
     * is seeded with a repeat of the file's first line, so consumers always
     * see the "header" line.
     *
     * @param filePath  path of the text file to read
     * @param chunkSize maximum number of lines per chunk (counting the header)
     * @param processor callback invoked once per chunk
     */
    public static void readFileInChunks(String filePath, int chunkSize, Consumer<StringBuilder> processor) {
        try (BufferedReader br = new BufferedReader(new FileReader(filePath))) {
            StringBuilder chunk = new StringBuilder();
            String header = null; // first line of the file + '\n', reused to seed later chunks
            String line;
            int count = 0;        // total number of lines read so far
            while ((line = br.readLine()) != null) {
                if (header == null) {
                    header = line + "\n";
                }
                chunk.append(line).append('\n');
                if (++count % chunkSize == 0) {
                    processor.accept(chunk);
                    chunk = new StringBuilder(header); // next chunk starts with the header
                }
            }
            // Emit the trailing partial chunk, if any. The original compared
            // lines.toString() != "" — a reference comparison that is always
            // true — and therefore also emitted a pointless header-only chunk
            // whenever the line count was an exact multiple of chunkSize,
            // and an empty chunk for an empty file.
            if (count % chunkSize != 0 && chunk.length() > 0) {
                processor.accept(chunk);
            }
            // No explicit br.close(): try-with-resources already closes it.
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

}

这些年来,我一直是以命令式、迭代的方式编写代码的,因此想不出如何用基于 Java 8 流的函数式风格来实现这个方法。

是否可以让 readFileInChunks 方法返回一个由块组成的 Stream&lt;String&gt;?或者,如何以函数式风格实现 readFileInChunks?

4 个答案:

答案 0 :(得分:2)

首先,为工作选择合适的工具。如果你想以块的形式处理文本文件,那么以块的形式读取文件要简单得多,而不是以行的形式读取它,只是稍后(重新)组合行。如果您希望将块剪切到行边界,则搜索最接近块边界的换行符仍然更简单,而不是处理所有换行符。

/**
 * Reads the file in chunks of roughly {@code chunkSize} characters, preferring
 * to cut each chunk just after the last line break it contains, and passes
 * each chunk to {@code processor}.
 *
 * @param filePath  path of the text file to read
 * @param chunkSize capacity of the reusable character buffer (max chunk length)
 * @param processor receives each chunk; NOTE(review): it is handed the live
 *                  buffer, so it must not keep a reference past the call —
 *                  use processor.accept(buf.toString()) to hand out copies
 */
public static void readFileInChunks(
    String filePath, int chunkSize, Consumer<? super CharSequence> processor) {

    CharBuffer buf=CharBuffer.allocate(chunkSize);
    try(FileReader r = new FileReader(filePath)) {
        readMore: for(;;) {
            // fill the buffer completely; a negative read means end of file
            while(buf.hasRemaining()) if(r.read(buf)<0) break readMore;
            buf.flip();
            int oldLimit=buf.limit();
            // pull the limit back to just after the last '\n' or '\r', so the
            // chunk ends on a line boundary; if the chunk contains no line
            // break at all, the whole buffer is passed unchanged
            for(int p=oldLimit-1; p>0; p--)
                if(buf.charAt(p)=='\n' || buf.charAt(p)=='\r') {
                    buf.limit(p+1);
                    break;
                }
            processor.accept(buf);
            // keep the characters after the cut point and resume filling
            buf.position(buf.limit()).limit(oldLimit);
            buf.compact();
        }
        if(buf.position()>0) {
            // final partial chunk: data read before EOF that was never emitted
            buf.flip();
            processor.accept(buf);
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}

这个代码乍一看可能更复杂,但它是零复制(copy-free)的。如果要允许消费者保留对所接收对象的引用,或进行并发处理,只需把 processor.accept(buf); 这一行改为 processor.accept(buf.toString());,这样就不会把实际的缓冲区传给消费者。如果要提供返回流的版本,就必须这样做。对于流,上面的循环必须转换成一个能按需提供下一个元素的函数:

/**
 * Returns a lazy stream of chunks of roughly {@code chunkSize} characters,
 * each cut just after the last line break inside the chunk when possible.
 *
 * The returned stream should be closed (e.g. via try-with-resources on the
 * Stream): closing it closes the underlying channel. The original version
 * leaked the FileChannel/Reader because nothing ever closed them.
 *
 * @param filePath  path of the text file to read
 * @param chunkSize buffer capacity, i.e. the maximum chunk length in chars
 * @return ordered stream of chunk strings
 * @throws IOException if the file cannot be opened
 */
public static Stream<String> fileInChunks(
        String filePath, int chunkSize) throws IOException {

    FileChannel ch=FileChannel.open(Paths.get(filePath), StandardOpenOption.READ);
    CharsetDecoder dec = Charset.defaultCharset().newDecoder();
    // rough estimate of the char count — used only for the spliterator's size hint
    long size = (long)(ch.size()*dec.averageCharsPerByte());
    Reader r = Channels.newReader(ch, dec, chunkSize);
    return StreamSupport.stream(new Spliterators.AbstractSpliterator<String>(
            (size+chunkSize-1)/chunkSize, Spliterator.ORDERED|Spliterator.NONNULL) {
        CharBuffer buf=CharBuffer.allocate(chunkSize); // set to null once the EOF chunk is delivered
        public boolean tryAdvance(Consumer<? super String> processor) {
            CharBuffer buf=this.buf;
            if(buf==null) return false; // already exhausted
            boolean more=true;
            // fill the buffer; read()<0 signals end of file
            while(buf.hasRemaining() && more) try {
                if(r.read(buf)<0) more=false;
            } catch(IOException ex) { throw new UncheckedIOException(ex); }
            if(more) {
                buf.flip();
                int oldLimit=buf.limit();
                // cut the chunk at the last line break, if there is one
                for(int p=oldLimit-1; p>0; p--)
                    if(buf.charAt(p)=='\n' || buf.charAt(p)=='\r') {
                        buf.limit(p+1);
                        break;
                    }
                processor.accept(buf.toString()); // copy, so the buffer can be reused
                buf.position(buf.limit()).limit(oldLimit);
                buf.compact();
                return true;
            }
            // EOF reached: deliver whatever is left as the final chunk
            this.buf=null;
            if(buf.position()>0) {
                buf.flip();
                processor.accept(buf.toString());
                return true;
            }
            return false;
        }
    }, false)
    // FIX: release the file handle when the stream is closed; previously the
    // FileChannel was never closed by anyone.
    .onClose(() -> {
        try {
            r.close();
        } catch(IOException ex) {
            throw new UncheckedIOException(ex);
        }
    });
}

答案 1 :(得分:1)

您可以定义自定义迭代器并根据它构建流:

/**
 * Splits the file's lines into chunks of {@code chunkSize} lines, each chunk
 * joined with '\n'. Uses protonpack's StreamUtils.zipWithIndex to pair every
 * line with its index, groups lines by index/chunkSize, and copies the
 * resulting map into a TreeMap so the chunks come out in file order.
 *
 * NOTE(review): the whole file is materialized in the intermediate map, so
 * this is unsuitable for very large files (as the answer itself points out).
 * NOTE(review): the stream from Files.lines is never closed explicitly;
 * wrap it in try-with-resources to release the file handle deterministically.
 */
public static Stream<String> readFileInChunks(String filePath, int chunkSize) throws IOException {
    return new TreeMap<>(StreamUtils.zipWithIndex(Files.lines(Paths.get(filePath)))
            .collect(Collectors.groupingBy(el -> el.getIndex() / chunkSize)))
            .values().stream()
            .map(list -> list.stream()
                    .map(el -> el.getValue())
                    .collect(Collectors.joining("\n")));
}

上面的写法借助了 protonpack 库,它提供了 zipWithIndex 方法(原帖此处本应展示另一段基于自定义迭代器的代码,但在转载渲染时丢失,只剩下占位符)。

第二个解决方案更紧凑,但它在分组时会把所有行收集到一个 Map 中(然后再复制到 TreeMap 中以保证顺序),因此不适合处理非常大的文件。

答案 2 :(得分:1)

我使用Java 8创建并测试了一个解决方案,如下所示:

  package com.grs.stackOverFlow.pack01;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;
import java.util.Optional;
import java.util.function.Consumer;

    public class FileUtils {

        public static void main(String[] args) throws IOException {
            readFileInChunks("src/com/grs/stackOverFlow/pack01/demo.txt", 3, System.out::println);
        }

        /**
         * Reads the whole file, then emits it in chunks of at most
         * {@code chunkSize} lines (not counting the repeated header line).
         * Each chunk is prefixed with "chunk no. = i" and a repeat of the
         * file's first line.
         *
         * Fixes over the original: the static mutable {@code processed}
         * counter is gone (it made a second invocation of this method produce
         * garbage), and the chunk count uses ceiling division so a trailing
         * partial chunk is no longer silently dropped.
         *
         * @param filePath  path of the text file to read
         * @param chunkSize maximum number of data lines per chunk
         * @param processor callback invoked once per chunk
         * @throws IOException if the file cannot be read
         */
        public static void readFileInChunks(String filePath, int chunkSize, Consumer<StringBuilder> processor) throws IOException {

            List<String> lines = Files.readAllLines(Paths.get(filePath));
            if (lines.isEmpty()) {
                return; // empty file: the original threw IndexOutOfBoundsException here
            }
            String firstLine = lines.get(0);

            // number of data lines following the header line
            int dataLines = lines.size() - 1;
            // ceiling division; at least one chunk is always emitted, matching
            // the original's behavior for files smaller than one chunk
            long splitCount = Math.max(1, (dataLines + (long) chunkSize - 1) / chunkSize);

            for (int i = 1; i <= splitCount; i++) {
                long offset = 1L + (long) (i - 1) * chunkSize; // skip header + earlier chunks
                Optional<String> result = lines.stream()
                        .skip(offset)
                        .limit(chunkSize)
                        .reduce((a, b) -> a + "\n" + b); // pure reduce, no side effects
                processor.accept(new StringBuilder(
                        "chunk no. = " + i + "\n" + firstLine + "\n" + result.orElse("")));
            }

        }

    }

答案 3 :(得分:0)

可以做的一件事就是有一个自定义收集器来构建这些块,然后将它们发送给消费者,例如这样(未编译,只是一个样本):

    /**
     * Collector that accumulates stream elements (lines) into a list of
     * StringBuilder chunks of {@code chunkSize} lines each. When a chunk
     * fills up, a new chunk is started, seeded with the previous chunk's
     * last line (one line of overlap between consecutive chunks).
     *
     * Fixes over the original sample: when a chunk was full, the incoming
     * line was never appended anywhere — one line was silently dropped at
     * every chunk boundary. Also, counting lines via toString().split("\n")
     * reported 1 for an empty builder, which broke chunkSize == 1; lines are
     * now counted by their trailing '\n' characters.
     */
    private static final class ToChunksCollector<T> implements Collector<T, List<StringBuilder>, List<StringBuilder>> {

        private final int chunkSize;

        public ToChunksCollector(int chunkSize) {
            this.chunkSize = chunkSize;
        }

        @Override
        public Supplier<List<StringBuilder>> supplier() {
            return ArrayList::new;
        }

        @Override
        public BiConsumer<List<StringBuilder>, T> accumulator() {
            return (list, line) -> {
                if (list.isEmpty()) {
                    list.add(new StringBuilder());
                }
                StringBuilder last = list.get(list.size() - 1);
                // every stored line ends with '\n', so counting '\n' chars
                // yields the number of complete lines in the current chunk
                int lineCount = 0;
                for (int i = 0; i < last.length(); i++) {
                    if (last.charAt(i) == '\n') {
                        lineCount++;
                    }
                }
                if (lineCount == chunkSize) {
                    // chunk full: start a new one seeded with the previous
                    // chunk's last line, THEN append the incoming line (the
                    // original forgot this append and lost the line)
                    int end = last.length() - 1; // index of the trailing '\n'
                    int start = last.lastIndexOf("\n", end - 1) + 1;
                    StringBuilder next = new StringBuilder();
                    next.append(last, start, end).append("\n");
                    next.append(line).append("\n");
                    list.add(next);
                } else {
                    last.append(line).append("\n");
                }
            };
        }

        @Override
        public BinaryOperator<List<StringBuilder>> combiner() {
            // NOTE(review): naive concatenation ignores chunk boundaries across
            // partial results; fine here because no CONCURRENT characteristic
            // is declared and the usage below is sequential.
            return (list1, list2) -> {
                list1.addAll(list2);
                return list1;
            };
        }

        @Override
        public Function<List<StringBuilder>, List<StringBuilder>> finisher() {
            return Function.identity();
        }

        @Override
        public Set<java.util.stream.Collector.Characteristics> characteristics() {
            // IDENTITY_FINISH would be accurate; kept empty as in the original
            return EnumSet.noneOf(Characteristics.class);
        }

    }

然后用法:

/**
 * Collects the file's lines into chunk-sized StringBuilders via
 * ToChunksCollector, then hands each chunk to the consumer in order.
 */
public static void readFileInChunks(String filePath, int chunkSize, Consumer<StringBuilder> processor) {
    try (BufferedReader reader = new BufferedReader(new FileReader(filePath))) {
        reader.lines()
              .collect(new ToChunksCollector<>(chunkSize))
              .stream()
              .forEachOrdered(processor);
    } catch (IOException e) {
        e.printStackTrace();
    }
}