-- Table atim_cloud.messages.
-- Partition key : (deviceid, channelname)
-- Clustering key: time, stored newest-first within each partition.
CREATE TABLE atim_cloud.messages (
    deviceid    text,
    channelname text,
    time        timestamp,
    avgsignal   float,
    latitude    float,
    longitude   float,
    rssi        float,
    snr         float,
    stationid   text,
    value       blob,
    valuetype   text,
    PRIMARY KEY ((deviceid, channelname), time)
)
WITH CLUSTERING ORDER BY (time DESC)
    -- Storage and read-path options
    AND bloom_filter_fp_chance = 0.01
    AND caching = '{"keys":"ALL", "rows_per_partition":"NONE"}'
    AND comment = ''
    AND compaction = {
        'class': 'org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy',
        'min_threshold': '4',
        'max_threshold': '32'
    }
    AND compression = {
        'sstable_compression': 'org.apache.cassandra.io.compress.LZ4Compressor'
    }
    AND dclocal_read_repair_chance = 0.1
    -- default_time_to_live = 0: no table-level default TTL is set
    AND default_time_to_live = 0
    AND gc_grace_seconds = 864000
    AND max_index_interval = 2048
    AND memtable_flush_period_in_ms = 0
    AND min_index_interval = 128
    AND read_repair_chance = 0.0
    AND speculative_retry = '99.0PERCENTILE';
-- Secondary indexes on atim_cloud.messages. deviceid and channelname are the
-- two components of the composite partition key; time is the clustering column.
CREATE INDEX messages_deviceid_idx ON atim_cloud.messages (deviceid);
CREATE INDEX messages_channelname_idx ON atim_cloud.messages (channelname);
-- NOTE(review): time presumably has very high cardinality (one value per
-- reading); secondary indexes on such columns are generally discouraged in
-- Cassandra -- confirm this index is really needed.
CREATE INDEX messages_time_idx ON atim_cloud.messages (time);
此表用于存储大量数据(数百万行)。简单的查询可以正常工作:
SELECT * FROM messages WHERE deviceid ='1DB8D';
我得到了:
deviceid | channelname | time | avgsignal | latitude | longitude | rssi | snr | stationid | value | valuetype
----------+-------------+--------------------------+-----------+----------+-----------+--------+-------+-----------+------------+-----------
1DB8D | INDEX1 | 2015-07-26 22:21:59+0200 | 9.9 | 45 | 6 | -125.5 | 9.66 | 0E00 | 0x00000000 | int
1DB8D | INDEX1 | 2015-07-26 22:11:58+0200 | 9.89 | 45 | 6 | -125.5 | 9.85 | 0E00 | 0x00000000 | int
1DB8D | INDEX1 | 2015-07-26 22:01:59+0200 | 9.87 | 45 | 6 | -123.5 | 10.08 | 0E00 | 0x00000000 | int
1DB8D | INDEX1 | 2015-07-26 21:51:59+0200 | 9.83 | 45 | 6 | -125.5 | 9.8 | 0E00 | 0x00000000 | int
1DB8D | INDEX1 | 2015-07-26 21:41:59+0200 | 9.83 | 45 | 6 | -124.5 | 10.02 | 0E00 | 0x00000000 | int
1DB8D | INDEX1 | 2015-07-26 21:31:58+0200 | 9.8 | 45 | 6 | -126.5 | 10.35 | 0E00 | 0x00000000 | int
1DB8D | INDEX1 | 2015-07-26 21:21:59+0200 | 9.78 | 45 | 6 | -122.5 | 9.91 | 0E00 | 0x00000000 | int
1DB8D | INDEX1 | 2015-07-26 21:11:59+0200 | 9.82 | 45 | 6 | -130.5 | 8.85 | 0E00 | 0x00000000 | int
1DB8D | INDEX1 | 2015-07-26 21:01:59+0200 | 9.79 | 45 | 6 | -129.5 | 10.11 | 0E00 | 0x00000000 | int
1DB8D | INDEX1 | 2015-07-26 20:51:58+0200 | 9.77 | 45 | 6 | -124.5 | 10.06 | 0E00 | 0x00000000 | int
1DB8D | INDEX1 | 2015-07-26 20:41:59+0200 | 9.78 | 45 | 6 | -123.5 | 9.52 | 0E00 | 0x00000000 | int
但是,当我执行更复杂的查询(例如带有时间戳条件的查询)时,就会出错(这种情况出现过几次,但并非每次都会发生):
SELECT * FROM messages WHERE deviceid = '1DB8D' AND time >= 1437981692831 LIMIT 500 ALLOW FILTERING ;
或简单地说:
SELECT COUNT(*) FROM messages ;
过了一段时间之后(我想是超时了),我得到:
errors={}, last_host=127.0.0.1
您有什么建议可以解决我的问题吗?我查找过关于索引或主键的建议,但没有找到任何有用的内容。
如果您对提升这张表的查询性能有任何提示,我很乐意听取。另外,使用多节点集群会有帮助吗?我对此一无所知。
谢谢;)
答案 0 :(得分:3)
二级索引仅建议用于基数较低的字段。对于基数很高的字段(例如您的 time 字段),它们的效率非常低。这就是在查询中使用 time 字段时会出现超时错误的原因。
在Cassandra中,您应该专注于设计一个好的主键,而不是通过创建二级索引来弥补架构上的问题。
答案 1 :(得分:1)
呼应一下Jim所说的:二级索引并不是用来弥补数据模型缺陷的"灵丹妙药"。您应该做的是围绕您的查询来构建数据模型。
我很好奇,`channelname` 是做什么用的?您把它作为分区键的一部分,但看起来您并不总是想通过它来查询。您是把它当作"桶"来使用,以便让每个分区的列数保持在20亿以下吗?还是只是为了增加唯一性?
假设您不需要把它当作"桶",如果我像这样重构您的PRIMARY KEY:
PRIMARY KEY (deviceid, time, channelname)
然后,此查询有效:
aploetz@cqlsh:stackoverflow2> SELECT * FROM messages WHERE deviceid ='1DB8D';
deviceid | time | channelname | avgsignal | latitude | longitude | rssi | snr | stationid | value | valuetype
----------+--------------------------+-------------+-----------+----------+-----------+--------+-------+-----------+------------+-----------
1DB8D | 2015-07-26 15:21:59-0500 | INDEX1 | 9.9 | 45 | 6 | -125.5 | 9.66 | 0E00 | 0x00000000 | int
1DB8D | 2015-07-26 15:11:58-0500 | INDEX1 | 9.89 | 45 | 5 | -125.5 | 9.85 | 0E00 | 0x00000000 | int
1DB8D | 2015-07-26 15:01:59-0500 | INDEX1 | 9.87 | 45 | 5 | -123.5 | 10.08 | 0E00 | 0x00000000 | int
1DB8D | 2015-07-26 14:51:59-0500 | INDEX1 | 9.83 | 45 | 5 | -125.5 | 9.8 | 0E00 | 0x00000000 | int
1DB8D | 2015-07-26 14:41:59-0500 | INDEX1 | 9.83 | 45 | 5 | -124.5 | 10.02 | 0E00 | 0x00000000 | int
1DB8D | 2015-07-26 14:31:58-0500 | INDEX1 | 9.8 | 45 | 5 | -126.5 | 10.35 | 0E00 | 0x00000000 | int
1DB8D | 2015-07-26 14:21:59-0500 | INDEX1 | 9.78 | 45 | 5 | -122.5 | 9.91 | 0E00 | 0x00000000 | int
1DB8D | 2015-07-26 14:11:59-0500 | INDEX1 | 9.82 | 45 | 5 | -130.5 | 8.85 | 0E00 | 0x00000000 | int
1DB8D | 2015-07-26 14:01:59-0500 | INDEX1 | 9.79 | 45 | 5 | -129.5 | 10.11 | 0E00 | 0x00000000 | int
1DB8D | 2015-07-26 13:51:58-0500 | INDEX1 | 9.77 | 45 | 5 | -124.5 | 10.06 | 0E00 | 0x00000000 | int
1DB8D | 2015-07-26 13:41:59-0500 | INDEX1 | 9.78 | 45 | 5 | -123.5 | 9.52 | 0E00 | 0x00000000 | int
(11 rows)
下面这个查询也同样有效:
aploetz@cqlsh:stackoverflow2> SELECT * FROM messages WHERE deviceid ='1DB8D' AND time >='2015-07-26 14:00:00-0500';
deviceid | time | channelname | avgsignal | latitude | longitude | rssi | snr | stationid | value | valuetype
----------+--------------------------+-------------+-----------+----------+-----------+--------+-------+-----------+------------+-----------
1DB8D | 2015-07-26 15:21:59-0500 | INDEX1 | 9.9 | 45 | 6 | -125.5 | 9.66 | 0E00 | 0x00000000 | int
1DB8D | 2015-07-26 15:11:58-0500 | INDEX1 | 9.89 | 45 | 5 | -125.5 | 9.85 | 0E00 | 0x00000000 | int
1DB8D | 2015-07-26 15:01:59-0500 | INDEX1 | 9.87 | 45 | 5 | -123.5 | 10.08 | 0E00 | 0x00000000 | int
1DB8D | 2015-07-26 14:51:59-0500 | INDEX1 | 9.83 | 45 | 5 | -125.5 | 9.8 | 0E00 | 0x00000000 | int
1DB8D | 2015-07-26 14:41:59-0500 | INDEX1 | 9.83 | 45 | 5 | -124.5 | 10.02 | 0E00 | 0x00000000 | int
1DB8D | 2015-07-26 14:31:58-0500 | INDEX1 | 9.8 | 45 | 5 | -126.5 | 10.35 | 0E00 | 0x00000000 | int
1DB8D | 2015-07-26 14:21:59-0500 | INDEX1 | 9.78 | 45 | 5 | -122.5 | 9.91 | 0E00 | 0x00000000 | int
1DB8D | 2015-07-26 14:11:59-0500 | INDEX1 | 9.82 | 45 | 5 | -130.5 | 8.85 | 0E00 | 0x00000000 | int
1DB8D | 2015-07-26 14:01:59-0500 | INDEX1 | 9.79 | 45 | 5 | -129.5 | 10.11 | 0E00 | 0x00000000 | int
(9 rows)
更重要的是,它们在没有二级索引或 `ALLOW FILTERING` 指令的情况下就能工作。此PRIMARY KEY还支持按完整的分区键(`deviceid`)查询,因此这些查询应该运行良好。
SELECT COUNT(*)FROM messages;
是的,Cassandra / CQL 中 `COUNT` 的实现,其性能远不如它在 RDBMS / SQL 中的"表亲"。Apple的Richard Low(同时也是DataStax MVP)撰写了一篇文章("Counting Keys In Cassandra"),介绍 `COUNT` 在"幕后"是如何工作的,绝对值得一读。从本质上讲,`COUNT` 必须遍历每一行才能给出行数。因此,如果您对一个大型列族(表)执行该操作,它就会超时。
注意:如果您发现确实需要一个桶(以便让每个分区的列数保持在20亿以下),可以选择一个对您的数据有意义的时间/日期增量,并在该时间窗口内查询。例如,对于上面的表,可以增加一个日期桶(`text` 类型的字段)作为分区键的一部分,把主键定义为 `PRIMARY KEY ((deviceid, datebucket), time, channelname)`,然后像这样查询:
SELECT * FROM messages WHERE deviceid ='1DB8D' AND datebucket='20150726';
也许看看这样的事情对你有意义。
编辑:DataStax的Patrick McFadin撰写了一篇精彩的文章"Getting Started With Time Series Data Modeling"。他在文中介绍了如何使用"时间桶"对时间序列应用中的表做进一步分区。这样既可以帮助您避免触及每个分区20亿列的限制,也可以让Cassandra不必读取那些您当时并不真正关心的(旧)数据。
答案 2 :(得分:0)
我抽时间尝试了你们给我的解决方案。为此我另外创建了两个表来测试不同的查询(我至少需要下面这3个查询):
SELECT * FROM messages WHERE deviceid = '1DB8D' AND time >= 1438853606718 ALLOW FILTERING ;
SELECT * FROM messages WHERE deviceid = '1DB8D' AND channelname = 'brutMessage';
SELECT * FROM messages WHERE deviceid = '1DB8D' AND channelname = 'brutMessage' AND time >= 1438853606718;
所以这是我的结果:
PRIMARY KEY ((deviceid, channelname), time) ) WITH CLUSTERING ORDER BY (time DESC)
SELECT * FROM messages WHERE deviceid = '1DB8D' AND time >= 1438853606718 ALLOW FILTERING ; <==== errors={}, last_host=127.0.0.1
SELECT * FROM messages WHERE deviceid = '1DB8D' AND channelname = 'brutMessage'; <=== OK
SELECT * FROM messages WHERE deviceid = '1DB8D' AND channelname = 'brutMessage' AND time >= 1438853606718; <=== OK
PRIMARY KEY (deviceid, time, channelname)) WITH CLUSTERING ORDER BY (time DESC, channelname ASC)
SELECT * FROM messages_test WHERE deviceid = '1DB8D' AND time >= 1438853606718 ; <==== OK
SELECT * FROM messages_test WHERE deviceid = '1DB8D' AND channelname='brutMessage' AND time >= 1438853606718 ; <==== code=2200 [Invalid query] message="PRIMARY KEY column "channelname" cannot be restricted
(preceding column "ColumnDefinition{name=time, type=org.apache.cassandra.db.marshal.ReversedType
(org.apache.cassandra.db.marshal.TimestampType), kind=CLUSTERING_COLUMN, componentIndex=0,
indexName=null, indexType=null}" is either not restricted or by a non-EQ relation)"
SELECT * FROM messages_test WHERE deviceid = '1DB8D' AND channelname='brutMessage'; <==== code=2200 [Invalid query] message="PRIMARY KEY column "channelname" cannot be restricted
(preceding column "ColumnDefinition{name=time, type=org.apache.cassandra.db.marshal.ReversedType
(org.apache.cassandra.db.marshal.TimestampType), kind=CLUSTERING_COLUMN, componentIndex=0,
indexName=null, indexType=null}" is either not restricted or by a non-EQ relation)"
PRIMARY KEY (deviceid, channelname, time)) WITH CLUSTERING ORDER BY (channelname ASC, time DESC)
SELECT * FROM messages_test2 WHERE deviceid = '1DB8D' AND time >= 1438853606718 ; <==== code=2200 [Invalid query] message="PRIMARY KEY column "time" cannot be restricted
(preceding column "ColumnDefinition{name=channelname, type=org.apache.cassandra.db.marshal.UTF8Type,
kind=CLUSTERING_COLUMN, componentIndex=0, indexName=null, indexType=null}"
is either not restricted or by a non-EQ relation)"
SELECT * FROM messages_test2 WHERE deviceid = '1DB8D' AND channelname = 'brutMessage' AND time >= 1438853606718 ; <==== OK
SELECT * FROM messages_test2 WHERE deviceid = '1DB8D' AND channelname = 'brutMessage'; <====== OK
PS:感谢 @BryceAtNetwork23,他向我解释了如何使用 COPY TO / FROM 方法复制表。