For my bachelor thesis I am running tests with DataStax Enterprise 4.8. I am loading data into the cluster (about 33 million rows). The data looks like the following:
//id;unix timestamp; validity; station info; temp in °C; humidity in %
3;1950040101;5;24; 5.7000;83.0000
3;1950040102;5;24; 5.6000;83.0000
3;1950040103;5;24; 5.5000;83.0000
I know my data model is not very clean (I use decimal for the timestamp, but I just wanted to try it that way):
CREATE TABLE temp (
id int,
timestamp decimal,
validity decimal,
structure decimal,
temperature float,
humidity float,
PRIMARY KEY ((id), timestamp)
);
I based it loosely on an article on the DataStax website: https://academy.datastax.com/resources/getting-started-time-series-data-modeling. The insert is done along the lines of the often-mentioned article on lostechies. This is my insert code:
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.math.BigDecimal;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import com.datastax.driver.core.BoundStatement;
import com.datastax.driver.core.Cluster;
import com.datastax.driver.core.ConsistencyLevel;
import com.datastax.driver.core.PreparedStatement;
import com.datastax.driver.core.ResultSet;
import com.datastax.driver.core.ResultSetFuture;
import com.datastax.driver.core.Session;
import com.datastax.driver.extras.codecs.jdk8.InstantCodec;
import com.google.common.base.Stopwatch;
import com.google.common.util.concurrent.FutureCallback;
import com.google.common.util.concurrent.Futures;
import com.google.common.util.concurrent.MoreExecutors;
public class BulkLoader {
private final int threads;
private final String[] contactHosts;
private final Cluster cluster;
private final Session session;
private final ExecutorService executor;
public BulkLoader(int threads, String... contactHosts) {
this.threads = threads;
this.contactHosts = contactHosts;
this.cluster = Cluster.builder().addContactPoints(contactHosts).build();
cluster.getConfiguration().getCodecRegistry()
.register(InstantCodec.instance);
session = cluster.newSession();
// fixed thread pool that closes on app exit
executor = MoreExecutors
.getExitingExecutorService((ThreadPoolExecutor) Executors
.newFixedThreadPool(threads));
}
public static class IngestCallback implements FutureCallback<ResultSet> {
public void onSuccess(ResultSet result) {
}
public void onFailure(Throwable t) {
throw new RuntimeException(t);
}
}
public void ingest(Iterator<Object[]> boundItemsIterator, String insertCQL)
throws InterruptedException {
final PreparedStatement statement = session.prepare(insertCQL);
while (boundItemsIterator.hasNext()) {
BoundStatement boundStatement = statement.bind(boundItemsIterator
.next());
boundStatement.setConsistencyLevel(ConsistencyLevel.QUORUM);
ResultSetFuture future = session.executeAsync(boundStatement);
Futures.addCallback(future, new IngestCallback(), executor);
}
}
public void stop() {
session.close();
cluster.close();
}
public static List<Object[]> readCSV(File csv) {
BufferedReader fileReader = null;
List<Object[]> result = new LinkedList<Object[]>();
try {
fileReader = new BufferedReader(new FileReader(csv));
String line = "";
while ((line = fileReader.readLine()) != null) {
String[] tokens = line.split(";");
if (tokens.length < 6) {
System.out.println("Incomplete line: " + line);
continue;
}
// column order: id; timestamp; validity; station info; temperature; humidity
Object[] tmp = new Object[6];
tmp[0] = Integer.parseInt(tokens[0]);
tmp[1] = new BigDecimal(tokens[1]);
tmp[2] = new BigDecimal(tokens[2]);
tmp[3] = new BigDecimal(tokens[3]); // station info; was tokens[2], a copy-paste bug
tmp[4] = Float.parseFloat(tokens[4]);
tmp[5] = Float.parseFloat(tokens[5]);
result.add(tmp);
}
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} finally {
try {
// guard against a NullPointerException when the file never opened
if (fileReader != null) {
fileReader.close();
}
} catch (IOException e) {
e.printStackTrace();
}
}
return result;
}
public static void main(String[] args) {
Stopwatch watch = Stopwatch.createStarted();
File folder = new File(
"C:/VirtualMachines/Kiosk/BachelorarbeitStraubinger/workspace/bulk/src/main/resources");
List<Object[]> data = new LinkedList<Object[]>();
BulkLoader loader = new BulkLoader(16, "10.2.57.38", "10.2.57.37",
"10.2.57.36", "10.2.57.35", "10.2.57.34", "10.2.57.33");
int cnt = 0;
File[] listOfFiles = folder.listFiles();
for (File file : listOfFiles) {
if (file.isFile() && file.getName().contains(".th")) {
data = readCSV(file);
cnt += data.size();
try {
loader.ingest(
data.iterator(),
"INSERT INTO wheather.temp (id, timestamp, validity, structure, temperature, humidity) VALUES(?,?,?,?,?,?)");
} catch (InterruptedException e) {
e.printStackTrace();
} finally {
System.out.println(file.getName()
+ " -> Datasets imported: " + cnt);
}
}
}
System.out.println("total time seconds = "
+ watch.elapsed(TimeUnit.SECONDS));
watch.stop();
loader.stop();
}
}
The replication factor is 3, and I run the tests on either 6 or 3 nodes. vNodes are enabled with num_tokens = 256. I get roughly the same insert times on either cluster. Any idea why that is?
Answer 0 (score: 2)
You may be maxing out your client application / client server. If you are reading the data from one static file, you could split it into pieces and run several of them concurrently, or even look at Brian Hess's loader (https://github.com/brianmhess/cassandra-loader) or the true Cassandra bulk loader (http://www.datastax.com/dev/blog/using-the-cassandra-bulk-loader-updated), which converts the data into a series of SSTables and streams them in directly. Both will likely be faster than the existing code.
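If you want to try the SSTable route from that second link, a minimal sketch of the CQLSSTableWriter API could look like the following. The output directory, contact point, and the single hard-coded row are placeholder assumptions for illustration, and builder details vary slightly between Cassandra versions:

import java.math.BigDecimal;
import org.apache.cassandra.dht.Murmur3Partitioner;
import org.apache.cassandra.io.sstable.CQLSSTableWriter;

public class SSTableSketch {
    public static void main(String[] args) throws Exception {
        // Schema and insert statement must match the target table
        // (keyspace name spelled "wheather" as in the question).
        String schema = "CREATE TABLE wheather.temp ("
                + "id int, timestamp decimal, validity decimal, structure decimal, "
                + "temperature float, humidity float, "
                + "PRIMARY KEY ((id), timestamp))";
        String insert = "INSERT INTO wheather.temp "
                + "(id, timestamp, validity, structure, temperature, humidity) "
                + "VALUES (?, ?, ?, ?, ?, ?)";

        // The directory must exist and follow the <keyspace>/<table> layout
        // that sstableloader expects.
        CQLSSTableWriter writer = CQLSSTableWriter.builder()
                .inDirectory("/tmp/wheather/temp")   // assumed output path
                .forTable(schema)
                .using(insert)
                .withPartitioner(new Murmur3Partitioner())
                .build();

        // One addRow call per CSV line; these values are the first sample row.
        writer.addRow(3, new BigDecimal("1950040101"), new BigDecimal("5"),
                new BigDecimal("24"), 5.7f, 83.0f);
        writer.close();

        // Then stream the generated SSTables into the cluster, e.g.:
        //   sstableloader -d 10.2.57.38 /tmp/wheather/temp
    }
}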
Answer 1 (score: 1)
Physics.
You have probably maxed out your app's throughput. The usual answer is to have multiple clients / app servers, but it looks like you are reading from a CSV. I would suggest either cutting the file into pieces and running multiple instances of the app (a sketch of sharding the input files across instances follows below), or generating fake data and running multiple instances against that.
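As a hypothetical illustration of the "cut it into pieces" idea, each copy of the app could be started with a shard index and a shard count and only load its own slice of the input files. The ShardedMain class, its arguments, and the two contact points are assumptions layered on top of the question's BulkLoader, not code from either post:

import java.io.File;
import java.util.Arrays;

public class ShardedMain {
    // Start one copy per shard, e.g. on four machines:
    //   java ShardedMain 0 4, java ShardedMain 1 4, ... java ShardedMain 3 4
    public static void main(String[] args) throws InterruptedException {
        int instanceIndex = Integer.parseInt(args[0]); // which shard this process owns
        int instanceCount = Integer.parseInt(args[1]); // total number of processes

        BulkLoader loader = new BulkLoader(16, "10.2.57.38", "10.2.57.37");
        File[] files = new File("src/main/resources").listFiles();
        Arrays.sort(files); // fixed order so every instance sees the same numbering

        for (int i = 0; i < files.length; i++) {
            // Round-robin split: each file belongs to exactly one instance.
            if (i % instanceCount != instanceIndex) {
                continue;
            }
            if (files[i].isFile() && files[i].getName().contains(".th")) {
                loader.ingest(BulkLoader.readCSV(files[i]).iterator(),
                        "INSERT INTO wheather.temp (id, timestamp, validity, "
                        + "structure, temperature, humidity) VALUES (?,?,?,?,?,?)");
            }
        }
        loader.stop();
    }
}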
Edit: I also think it is worth noting that, with a data model like this, small payload sizes, and appropriate hardware, I would imagine each node could probably do 15-20K inserts/second (not accounting for node density / compaction).
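On the subject of client throughput: the question's ingest() calls executeAsync without any limit, so it is hard to tell whether the client or the cluster saturates first. A minimal sketch of a throttled variant, meant to drop into the BulkLoader class (it reuses the class's session and executor fields; the 1024-permit cap is an arbitrary starting value to tune, not a figure from the answer):

// requires: import java.util.concurrent.Semaphore;
public void ingestThrottled(Iterator<Object[]> rows, String insertCQL)
        throws InterruptedException {
    final PreparedStatement statement = session.prepare(insertCQL);
    final Semaphore inFlight = new Semaphore(1024); // assumed cap on concurrent writes

    while (rows.hasNext()) {
        inFlight.acquire(); // blocks once 1024 inserts are outstanding
        BoundStatement bound = statement.bind(rows.next());
        bound.setConsistencyLevel(ConsistencyLevel.QUORUM);
        ResultSetFuture future = session.executeAsync(bound);
        Futures.addCallback(future, new FutureCallback<ResultSet>() {
            public void onSuccess(ResultSet result) {
                inFlight.release();
            }
            public void onFailure(Throwable t) {
                inFlight.release();
                t.printStackTrace(); // don't throw from the callback thread
            }
        }, executor);
    }

    // Drain: wait until every outstanding insert has completed.
    inFlight.acquire(1024);
    inFlight.release(1024);
}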