创建用于计算直方图数据的物化视图

时间:2019-12-18 18:20:16

标签: sql clickhouse

我创建了一个表:

// Join each user with the documents in "votes" whose voterId equals the
// user's _id; the matches are attached to the user as an array field "votes".
db.users.aggregate([
  { $lookup: { from: "votes", localField: "_id", foreignField: "voterId", as: "votes" } }
])

我想创建一个物化视图,用来存储 value 列按小时统计的直方图数据。例如:

我期望这样的输出:

-- Source table with the raw rows; the hourly histogram is computed over `value`.
CREATE TABLE results (
    id        UUID,
    date_time DateTime,
    item_id   UInt32,
    value     UInt16
)
ENGINE = MergeTree()
-- One partition per calendar day of date_time.
PARTITION BY toYYYYMMDD(date_time)
ORDER BY (date_time, item_id);

toStartOfHour        item_id  value  count
2019-12-18 00:00:00  1        0      4      /* number of rows with value between 0 and 100 and date_time between 2019-12-18 00:00:00 and 2019-12-18 01:00:00 */
2019-12-18 00:00:00  1        100    7      /* number of rows with value between 100 and 200 and date_time between 2019-12-18 00:00:00 and 2019-12-18 01:00:00 */

其中 count 是 value 在 0 到 100 之间、且 date_time 在 2019-12-18 00:00:00 与 2019-12-18 01:00:00 之间的行数。我已经尝试过这样的事情:

这个物化视图定义在初次填充(POPULATE)时结果是正确的。但是随着时间推移、有新行写入之后,结果就会出错。我不知道问题出在哪里,也找不出其中的规律。

我不确定是我发现了 ClickHouse 的一个 bug,还是我自己哪里做错了。

我的物化视图定义正确吗?

2 个答案:

答案 0 :(得分:2)

AggregatingMergeTree 把 ORDER BY(主键)中的列当作维度(Dimensions),其余所有列都当作度量(Metrics)。如果某个度量列不是用 “State” 聚合函数定义的,它就会按 ANY 的方式被计算/折叠:
-- Hourly histogram table used to demonstrate the collapsing behaviour.
-- AggregatingMergeTree merges rows that share the same sorting key (ORDER BY).
-- `value` is NOT part of the key here, so rows that differ only in `value`
-- collapse into a single row and only one of the values survives.
CREATE TABLE results_histogram_by_hour
(
    date_time DateTime,
    item_id   UInt32,
    value     UInt16,
    -- Partial aggregation state produced by countState().
    count     AggregateFunction(count)
)
ENGINE = AggregatingMergeTree()
PARTITION BY toYYYYMMDD(date_time)
ORDER BY (date_time, item_id);

-- Two separate inserts for the same hour and item that differ only in `value`
-- (1 vs 99). They are issued as separate statements so that the later
-- OPTIMIZE has more than one set of rows to merge.
-- The explicit column list keeps the positional mapping visible.
INSERT INTO results_histogram_by_hour (date_time, item_id, value, count)
SELECT
    toStartOfHour(now()) AS date_time,
    1 AS item_id,
    1 AS value,
    countState()
GROUP BY date_time, item_id, value;

INSERT INTO results_histogram_by_hour (date_time, item_id, value, count)
SELECT
    toStartOfHour(now()) AS date_time,
    1 AS item_id,
    99 AS value,
    countState()
GROUP BY date_time, item_id, value;

-- Force the merge now so the collapsing by sorting key is visible immediately.
OPTIMIZE TABLE results_histogram_by_hour FINAL;

-- `count` is an aggregate-function state, so it shows as blank in the sample output.
SELECT * FROM results_histogram_by_hour;

┌───────────date_time─┬─item_id─┬─value─┬─count─┐
│ 2019-12-18 21:00:00 │       1 │     1 │       │
└─────────────────────┴─────────┴───────┴───────┘

如果把排序键改为 ORDER BY (date_time, item_id, value),两行都会保留:
┌───────────date_time─┬─item_id─┬─value─┬─count─┐
│ 2019-12-18 21:00:00 │       1 │     1 │       │
│ 2019-12-18 21:00:00 │       1 │    99 │       │
└─────────────────────┴─────────┴───────┴───────┘

如果不希望索引(PRIMARY KEY)变得过长/过宽,可以为 ORDER BY 和 PRIMARY KEY 指定不同的列集。所有引擎都是按 ORDER BY 的列集进行合并/折叠的。

答案 1 :(得分:1)

另一种方法

https://clickhouse.yandex/docs/en/operations/table_engines/summingmergetree/#nested-structures

SummingMergeTree 能够对 K/V 数组中的值求和(嵌套列的名称应以 Map 结尾,例如 valueMap)。

-- Source table with the raw rows that feed the hourly histogram view.
CREATE TABLE results (
    id        UInt64,
    date_time DateTime,
    item_id   UInt32,
    value     UInt16
)
ENGINE = MergeTree()
-- One partition per calendar day of date_time.
PARTITION BY toYYYYMMDD(date_time)
ORDER BY (date_time, item_id);


-- Seed 10,000 synthetic rows: item_id cycles through 0..6 and value through
-- 0..9956. The explicit column list documents the positional mapping.
INSERT INTO results (id, date_time, item_id, value)
SELECT
    number AS id,
    now() AS date_time,
    number % 7 AS item_id,
    number % 9957 AS value
FROM numbers(10000);


-- Hourly histogram of `value` per item, stored as a SummingMergeTree "Map"
-- nested structure: `valueMap.bin` holds the bucket keys and `valueMap.cnt`
-- the matching counts. Rows that share (date_time, item_id) have their counts
-- summed per bucket when parts are merged.
CREATE MATERIALIZED VIEW results_histogram_by_hour
ENGINE = SummingMergeTree()
PARTITION BY toYYYYMMDD(date_time)
ORDER BY (date_time, item_id)
POPULATE
AS
SELECT
    date_time,
    item_id,
    groupArray(value) AS `valueMap.bin`,
    groupArray(cnt) AS `valueMap.cnt`
FROM
(
    -- One row per (hour, item, bucket); buckets are 1000 wide.
    SELECT
        toStartOfHour(date_time) AS date_time,
        item_id,
        intDiv(value, 1000) AS value,
        count() AS cnt
    FROM results
    GROUP BY
        date_time,
        item_id,
        value
)
GROUP BY
    date_time,
    item_id;


-- Insert another batch of 10,000 synthetic rows (item_id 0..6, value 0..9956)
-- so the histogram receives a second set of rows for the same keys.
-- The explicit column list documents the positional mapping.
INSERT INTO results (id, date_time, item_id, value)
SELECT
    number AS id,
    now() AS date_time,
    number % 7 AS item_id,
    number % 9957 AS value
FROM numbers(10000);

-- Raw view contents for one item; before a merge the sample output shows
-- one row per inserted batch.
SELECT * FROM results_histogram_by_hour WHERE item_id = 4

─item_id─┬─valueMap.bin──────────┬─valueMap.cnt──────────────────────────────┐
       4 │ [0,7,6,1,5,2,3,4,8,9] │ [149,143,143,143,143,142,143,143,143,136] │
─────────┴───────────────────────┴───────────────────────────────────────────┘
─item_id─┬─valueMap.bin──────────┬─valueMap.cnt──────────────────────────────┐
       4 │ [0,7,6,1,5,2,3,4,8,9] │ [149,143,143,143,143,142,143,143,143,136] │
─────────┴───────────────────────┴───────────────────────────────────────────┘    

-- Combine the per-row (bin, cnt) arrays at query time, so the result does not
-- depend on whether the background merge has already happened.
SELECT date_time, item_id, sumMap(valueMap.bin, valueMap.cnt)
FROM results_histogram_by_hour
WHERE item_id = 4
GROUP BY date_time, item_id

─item_id─┬─sumMap(valueMap.bin, valueMap.cnt)────────────────────────────────┐
       4 │ ([0,1,2,3,4,5,6,7,8,9],[298,286,284,286,286,286,286,286,286,272]) │
─────────┴───────────────────────────────────────────────────────────────────┘

-- Force the merge so the per-bucket counts are summed into one row right away.
OPTIMIZE TABLE results_histogram_by_hour FINAL;

-- Same query as before the merge; the sample output now shows a single row.
SELECT * FROM results_histogram_by_hour WHERE item_id = 4

─item_id─┬─valueMap.bin──────────┬─valueMap.cnt──────────────────────────────┐
       4 │ [0,1,2,3,4,5,6,7,8,9] │ [298,286,284,286,286,286,286,286,286,272] │
─────────┴───────────────────────┴───────────────────────────────────────────┘