加快在Java中读取CSV的速度

时间:2019-10-25 03:19:03

标签: java csv

我的CSVReader代码效率较低,请参见下文。读取30000多行需要30秒以上。如何才能加快读取速度?

/**
 * Loads a comma-separated text file into memory as a list of rows, where each
 * row is the list of fields found on one line.
 *
 * <p>The first line of the file is treated as a header: it is echoed to
 * standard output and not stored. Rows accumulate across successive calls to
 * {@link #ReadFromCSV(String)} on the same instance.
 */
public class DataReader {

    /** Field delimiter, used as the regex argument of {@link String#split(String, int)}. */
    private static final String CSV_SPLIT_BY = ",";

    /**
     * Parsed data rows in file order. The raw element type is kept on purpose so
     * that existing callers of {@link #getMaster()} continue to compile.
     */
    private final List<List> master = new ArrayList<List>();

    /**
     * Reads every data line of the given file and appends one row per line to
     * the master list. The header line and the complete master list are printed
     * to standard output.
     *
     * <p>An {@link IOException} is reported via its stack trace and not
     * rethrown; rows read before the failure remain in the master list.
     *
     * @param csvFile path of the file to read
     */
    public void ReadFromCSV(String csvFile) {
        try (BufferedReader br = new BufferedReader(new FileReader(csvFile))) {
            System.out.println("Header " + br.readLine());
            String line;
            while ((line = br.readLine()) != null) {
                master.add(parseLine(line));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }

        System.out.println(master);
    }

    /**
     * Returns the list of rows read so far. This is the internal list itself,
     * not a copy.
     *
     * @return the accumulated rows
     */
    public List<List> getMaster() {
        return master;
    }

    /** Splits one line into a new, independent, growable list of its fields. */
    private static List<String> parseLine(String line) {
        // A negative limit keeps trailing empty fields, so "a,b,," yields four
        // columns instead of two and every row keeps its full width.
        String[] fields = line.split(CSV_SPLIT_BY, -1);
        List<String> row = new ArrayList<String>(fields.length);
        for (String field : fields) {
            row.add(field);
        }
        return row;
    }

}

更新:我发现单独运行这段代码时,实际上不到1秒就能完成读取。这个DataReader是我的仿真模型中用来初始化相关属性的一部分。接下来的部分负责使用导入的数据,这部分需要40秒才能完成!有人能通过查看下面这段代码的大致内容来提供帮助吗?

// Add route network: for every row returned by CSV_reader_route.getMaster(),
// add an edge to the "IntraCity Network" projection unless one already exists.
        Network<Object> net = (Network<Object>)context.getProjection("IntraCity Network");
        // local_hubs is not referenced again in this snippet.
        IndexedIterable<Object> local_hubs = context.getObjects(LocalHub.class);
        for (int i = 0; i <= CSV_reader_route.getMaster().size() - 1; i++) {
            // Columns read from each row: 0 = source code, 3 = target code,
            // 6 = distance, 7 = time. dist and time are parsed but only used by
            // the commented-out setters further down.
            String source = (String) CSV_reader_route.getMaster().get(i).get(0);
            String target = (String) CSV_reader_route.getMaster().get(i).get(3);
            double dist = Double.parseDouble((String) CSV_reader_route.getMaster().get(i).get(6));
            double time = Double.parseDouble((String) CSV_reader_route.getMaster().get(i).get(7));

            Object source_hub = null;
            Object target_hub = null;
            // Look up context objects whose "hub_code" equals the source code; the
            // last LocalHub or GatewayHub seen in the result is kept.
            Query<Object> source_query = new PropertyEquals<Object>(context, "hub_code", source);
            for (Object o : source_query.query()) {
                if (o instanceof LocalHub) {
                    source_hub = (LocalHub) o;
                }
                if (o instanceof GatewayHub) {
                    source_hub = (GatewayHub) o;
                }
            }

            // Same lookup for the target code.
            Query<Object> target_query = new PropertyEquals<Object>(context, "hub_code", target);
            for (Object o : target_query.query()) {
                if (o instanceof LocalHub) {
                    target_hub = (LocalHub) o;
                }
                if (o instanceof GatewayHub) {
                    target_hub = (GatewayHub) o;
                }
            }

//          System.out.println(target_hub.getClass() + " " + time);
//          Route this_route = (Route) net.addEdge(source_hub, target_hub);
//          context.add(this_route);
//          System.out.println(net.getEdge(source_hub, target_hub));
            // NOTE(review): the edge check and addEdge below use the String codes
            // (source, target), not source_hub / target_hub, so the results of both
            // queries above are never used here. Running two queries per row may be
            // what makes this loop slow -- confirm with a profiler, and confirm
            // whether the edge was meant to connect the hub objects instead.
            if (net.getEdge(source, target) == null) {
                Route this_route = (Route) net.addEdge(source, target);
                context.add(this_route);
//              this_route.setDist(dist);
//              this_route.setTime(time); }
            }



        } 

3 个答案:

答案 0 :(得分:2)

我没有那么大的CSV,但是您可以尝试以下操作:

/**
 * Reads the CSV file at a fixed path into a list of rows, skipping the first
 * (header) line. Each row is the list of comma-separated fields of one line.
 *
 * @param args unused
 * @throws IOException if the file cannot be opened
 */
public static void main(String[] args) throws IOException {
    Path csvPath = Paths.get("path/to/file.csv");
    List<List<String>> master;
    // Files.lines keeps the file open until the stream is closed, so the stream
    // is scoped with try-with-resources instead of being left to the GC.
    try (java.util.stream.Stream<String> lines = Files.lines(csvPath)) {
        master = lines
                .skip(1)
                .map(line -> Arrays.asList(line.split(",")))
                .collect(Collectors.toList());
    }
}

编辑:我使用了{50}条目的CSV sample进行了尝试,代码运行时间不到一秒钟。

答案 1 :(得分:2)

在您的代码中,仅仅为了把当前行的值列表添加到主列表,就执行了许多不必要的写操作。您可以使用下面给出的简单代码替换现有代码。

现有代码:

// Split the line and copy every field into the shared scratch list.
String[] list = line.split(cvsSplitBy);
//                System.out.println("the size is " + country[1]);
for (int i = 0; i < list.length; i++) {
    sub.add(list[i]);
}

// Store a copy of the scratch list as the row, then empty the scratch list
// so it can be reused for the next line.
List<String> temp = (List<String>) ((ArrayList<String>) sub).clone();
//                master.add(new ArrayList<String>(sub));
master.add(temp);
sub.removeAll(sub);

建议的代码:

master.add(Arrays.asList(line.split(cvsSplitBy)));

答案 2 :(得分:1)

在@Alex R的答案基础上进一步扩展,您可以像这样并行处理它:

def stitch(arrs, dtype=None):
    """Join a sequence of 1-D arrays end to end, merging each seam by addition.

    For every pair of neighbouring arrays, the last element of the left array
    and the first element of the right array are summed and stored in a single
    output slot; all other elements are copied through in order.

    Args:
        arrs: indexable sequence of at least two arrays exposing ``.size``.
        dtype: dtype handed to ``np.empty`` for the result buffer.

    Returns:
        A new array of length ``sum(a.size - 1 for a in arrs) + 1``.

    Raises:
        ValueError: if fewer than two arrays are supplied.
    """
    count = len(arrs)
    if count < 2:
        raise ValueError("Not supported")

    total = 1 + sum(arr.size - 1 for arr in arrs)
    out = np.empty(total, dtype=dtype)

    # The very first element has no left neighbour, so it is copied as is.
    out[0] = arrs[0][0]
    pos = 0
    for k in range(1, count):
        left = arrs[k - 1]
        right = arrs[k]
        seam = pos + left.size - 1
        # Interior of the left array, then the merged seam slot.
        out[pos + 1:seam] = left[1:-1]
        out[seam] = left[-1] + right[0]
        pos = seam

    # Everything after the final seam comes from the tail of the last array.
    out[pos + 1:] = arrs[-1][1:]
    return out