使用多个(python)客户端并行加载cassandra中的所有行

时间:2013-04-22 01:03:27

标签: python parallel-processing cassandra pycassa cql3

使用Cassandra推荐的RandomPartitioner(或Murmur3Partitioner)时,无法对键进行有意义的范围查询,因为行是根据键的md5哈希值分布在整个集群中的。这些哈希值称为“令牌”(token)。

尽管如此,通过为每个计算工作者分配一个令牌范围来分割大表是非常有用的。使用CQL3,似乎可以直接针对令牌发起查询,但是下面的python代码不起作用……编辑:在改用最新版本的cassandra数据库(doh!)进行测试之后,代码可以工作了,并且还根据下面的注释更新了语法:

## Demonstrates splitting a table between workers by token range:
## builds a small test keyspace/table, inserts four rows, then selects
## only the rows whose token falls inside one worker's sub-range.

## use python cql module
import cql

## If running against an old version of Cassandra, this raises:
## TApplicationException: Invalid method name: 'set_cql_version'
conn = cql.connect('localhost', cql_version='3.0.2')

cursor = conn.cursor()

## everything that talks to the server sits inside try/finally so the
## cursor and connection are released even when a statement fails
try:
    try:
        ## remove the previous attempt to make this work
        cursor.execute('DROP KEYSPACE test;')
    except Exception as exc:
        ## best effort: the keyspace may simply not exist yet
        print(exc)

    ## make a keyspace and a simple table
    cursor.execute("CREATE KEYSPACE test WITH strategy_class = 'SimpleStrategy' AND strategy_options:replication_factor = 1;")
    cursor.execute("USE test;")
    cursor.execute('CREATE TABLE data (k int PRIMARY KEY, v varchar);')

    ## put some data in the table -- must use single quotes around literals, not double quotes
    cursor.execute("INSERT INTO data (k, v) VALUES (0, 'a');")
    cursor.execute("INSERT INTO data (k, v) VALUES (1, 'b');")
    cursor.execute("INSERT INTO data (k, v) VALUES (2, 'c');")
    cursor.execute("INSERT INTO data (k, v) VALUES (3, 'd');")

    ## split up the full range of tokens.
    ## Suppose there are 2**k workers:
    k = 3 # --> eight workers
    ## NOTE(review): 2**127 presumably matches RandomPartitioner's token
    ## space; confirm before using with another partitioner
    token_sub_range = 2**(127 - k)
    worker_num = 2 # for example
    start_token = worker_num * token_sub_range
    end_token = (1 + worker_num) * token_sub_range

    ## put single quotes around the token strings
    cql3_command = "SELECT k, v FROM data WHERE token(k) >= '%d' AND token(k) < '%d';" % (start_token, end_token)
    print(cql3_command)

    ## against an old Cassandra version this failed with
    ## "ProgrammingError: Bad Request: line 1:28 no viable alternative at input 'token'"
    cursor.execute(cql3_command)

    for row in cursor:
        print(row)
finally:
    cursor.close()
    conn.close()

我希望能与pycassa一起使用,因为我更喜欢它更加pythonic的界面。

有更好的方法吗?

2 个答案:

答案 0 :(得分:1)

我已更新问题以包含答案。

答案 1 :(得分:0)

这不是CQL3,但这是一个简单的程序,可以直接使用Thrift接口读取localhost拥有的所有(经过pickle序列化的)数据。这可用于构建一个简单的map / reduce引擎,使用Cassandra作为后端。每个节点都会像这样针对属于自己的数据运行map(),从而不会产生数据检索的网络开销。然后,结果将被发送回单独节点上的reduce()阶段。

显然,这对于Cassandra 1.2+中的vnodes效果不佳。我现在正在使用一种索引方法,它允许在较小的本地数据子集上使用map(),并且支持vnodes。

#!/usr/bin/env python2.7

import sys
import socket
import cPickle as pickle
from thrift import Thrift
from thrift.transport import TTransport
from thrift.transport import TSocket
from pycassa.cassandra import Cassandra
from pycassa.cassandra.ttypes import *
import time
import pprint

def main():
    """
    Read and unpickle the columns of one locally-owned token range.

    Connects to Cassandra on localhost over Thrift, finds the first ring
    entry whose first endpoint is this host's IP address, reads the rows
    in that token range and unpickles every column value. Prints the
    number of rows read, the total read time and the share of that time
    spent unpickling.

    One command-line argument (a job name) is required; it is read but
    not otherwise used here.

    Raises:
        IndexError: if no command-line argument is given.
        RuntimeError: if no ring entry lists this host as first endpoint.
    """
    jobname = sys.argv[1]  # required CLI argument; not used further here

    # Determine local IP address
    ip = socket.gethostbyname(socket.gethostname())

    # Set up query
    keyspace = "data"
    # Placeholder column family name -- replace with the real one.
    column_family = "foo"
    column_parent = ColumnParent(column_family=column_family)

    ptime = 0
    keycount = 0
    # Fallback start time so the summary below still works when a Thrift
    # error occurs before the query starts; reset right before the query.
    t0 = time.time()

    # Connect last, so nothing between connect() and the try block can
    # raise and leave the transport open.
    (client, transport) = connect("localhost")
    try:
        # Find range of tokens for which this node is first replica
        for tokenrange in client.describe_ring(keyspace):
            if tokenrange.endpoints[0] == ip:
                start_token = tokenrange.start_token
                end_token = tokenrange.end_token
                break
        else:
            raise RuntimeError(
                "no token range has %s as its first endpoint" % ip)

        # Set keyspace
        client.set_keyspace(keyspace)

        # Query for all data owned by this node
        slice_range = SliceRange(start="", finish="")
        predicate = SlicePredicate(slice_range=slice_range)
        # NOTE(review): count=10000 looks like a cap on the rows returned
        # by a single call, with no paging beyond it -- confirm.
        keyrange = KeyRange(start_token=start_token, end_token=end_token, count=10000)
        t0 = time.time()
        for keyslice in client.get_range_slices(column_parent, predicate, keyrange, ConsistencyLevel.ONE):
            keycount += 1
            for col in keyslice.columns:
                pt0 = time.time()
                # SECURITY: pickle.loads can execute arbitrary code; only
                # safe if everything stored in this column family is trusted.
                pickle.loads(col.column.value)
                ptime += time.time() - pt0
    except Thrift.TException as tx:
        print('Thrift: %s' % tx.message)
    finally:
        disconnect(transport)

    t1 = time.time() - t0
    print("Read data for %d tasks in: %.2gs" % (keycount, t1))
    print("Job unpickling time: %.2gs" % ptime)
    # Guard against a zero elapsed time (e.g. coarse clock resolution).
    percentage = ptime / t1 * 100 if t1 else 0.0
    print("Unpickling percentage: %.2f%%" % percentage)

def connect(host):
    """
    Connect to cassandra instance on given host.

    Opens a framed Thrift transport to port 9160 on `host` and wraps it
    in the accelerated binary protocol.

    Returns: (client, transport) tuple -- the Cassandra.Client and the
    opened transport. The caller is responsible for closing the
    transport (see disconnect()).
    """
    # Imported explicitly here because the file has no direct import of
    # this module.
    # NOTE(review): presumably it was only reachable through
    # `from pycassa.cassandra.ttypes import *` -- confirm.
    from thrift.protocol import TBinaryProtocol

    # Named `sock` so it does not shadow the stdlib `socket` module.
    sock = TSocket.TSocket(host, 9160)
    transport = TTransport.TFramedTransport(sock)
    protocol = TBinaryProtocol.TBinaryProtocolAccelerated(transport)
    transport.open()
    client = Cassandra.Client(protocol)
    return (client, transport)

def disconnect(transport):
    """
    Close the transport to the cassandra instance.

    `transport` is any object exposing a close() method; nothing is
    returned.
    """
    close_transport = transport.close
    close_transport()

# Run main() only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()