我正在将合法SQL查询转换为Bigquery中的标准SQL查询,以计算谷歌分析跳出率。但是在转换查询时,输出结果会有一些差异。
旧版SQL查询
SELECT session_bounceRate,
t1.source as source,t1.medium as medium,
total_session
FROM (
SELECT
IFNULL(t1.session_bounceSessionCount, 0) / t2.session_sessionId_distinct_count AS session_bounceRate,
t1.source,t1.medium,
t2.total_session_distinct_count AS total_session
FROM (
SELECT
INTEGER(session_bounceSession_distinct_count) AS session_bounceSessionCount,
source,
medium
FROM (
SELECT
COUNT(DISTINCT session_sessionId, 10000000) AS session_bounceSession_distinct_count, --Changes done (Count(x) in legacy sql stands for approx ,
source, --,so replaced it with APPROX_COUNT_DISTINCT to get approx count
medium from (
SELECT
SUM(IF(session_hitsType = 'event'
AND session_isInteraction_first = 1, 1, 0)) AS session_isEventInteraction_sum,
session_sessionId AS session_sessionId,
SUM(session_pageViews_sum) AS session_pageViews_sum_sum,
source,
medium
FROM (
SELECT
hits.type AS session_hitsType,
sessionId AS session_sessionId,
SUM(totals.pageviews) AS session_pageViews_sum,
FIRST(hits.isInteraction) AS session_isInteraction_first,
trafficSource.source AS source,
trafficSource.medium AS medium,
FROM
TABLE_DATE_RANGE([[test:test.session_streaming_], TIMESTAMP('2018-04-01'), TIMESTAMP('2018-04-30')) AS session_streaming
GROUP BY
source,
medium,
session_hitsType,
session_sessionId )
GROUP BY
source,
medium,
session_sessionId )
WHERE
(session_isEventInteraction_sum = 0
AND session_pageViews_sum_sum = 1)
GROUP BY
source,
medium ) ) AS t1
JOIN EACH (
SELECT
COUNT(DISTINCT sessionId, 10000000) AS session_sessionId_distinct_count,
trafficSource.source AS source,
trafficSource.medium AS medium,
COUNT(DISTINCT sessionId, 10000000) AS total_session_distinct_count
FROM
TABLE_DATE_RANGE([test:Test.session_streaming_], TIMESTAMP('2018-04-01'), TIMESTAMP('2018-04-30')) AS session_streaming
GROUP BY
source,
medium ) AS t2
ON
t1.source = t2.source
and t1.medium=t2.medium)
where t1.medium='zadv_display'
ORDER BY
total_session DESC
标准SQL查询
SELECT
IFNULL(t1.session_bounceSessionCount, 0) / t2.session_sessionId_distinct_count AS session_bounceRate,
t1.source,
t1.medium,
t2.total_session_distinct_count AS total_session
FROM (
SELECT
CAST(session_bounceSession_distinct_count AS INT64) AS session_bounceSessionCount,
source,
medium
FROM (
SELECT
APPROX_COUNT_DISTINCT(DISTINCT session_sessionId) AS session_bounceSession_distinct_count,
source,
medium
FROM (
SELECT
SUM(IF(session_hitsType = 'event'
AND session_isInteraction_first = 1, 1, 0)) AS session_isEventInteraction_sum,
session_sessionId AS session_sessionId,
SUM(session_pageViews_sum) AS session_pageViews_sum_sum,
source,
medium
FROM (
SELECT
session_hitsType,
session_sessionId,
source,
medium,
CASE
WHEN session_isInteraction_first = TRUE THEN 1
ELSE 0
END AS session_isInteraction_first,
SUM(session_pageViews_sum) AS session_pageViews_sum
FROM (
SELECT
hits.Type AS session_hitsType,
sessionId AS session_sessionId,
trafficSource.source AS source,
trafficSource.medium AS medium,
totals.pageviews AS session_pageViews_sum,
FIRST_VALUE(hits.isInteraction) OVER(PARTITION BY sessionId ORDER BY TIMESTAMP_SECONDS(hits.Time)) AS session_isInteraction_first
FROM
`test.Test.session_streaming_*`,
unNEST(hits) hits
WHERE
_table_suffix BETWEEN '20180401'
AND '20180430' )
GROUP BY
session_hitsType,
session_sessionId,
source,
medium,
session_isInteraction_first )
GROUP BY
source,
medium,
session_sessionId)
WHERE
(session_isEventInteraction_sum = 0
AND session_pageViews_sum_sum = 1)
GROUP BY
source,
medium)) AS t1
JOIN (
SELECT
APPROX_COUNT_DISTINCT(DISTINCT sessionId) AS session_sessionId_distinct_count,
trafficSource.source AS source,
trafficSource.medium AS medium,
APPROX_COUNT_DISTINCT(DISTINCT sessionId) AS total_session_distinct_count
FROM
`test.Test.session_streaming_*`
WHERE _table_suffix BETWEEN '20180401' AND '20180430'
GROUP BY
source,
medium
) AS t2
ON
t1.source = t2.source
AND t1.medium=t2.medium
where t1.medium='zadv_display'
order by total_session desc
我们已将标准sql中的First函数替换为First_value,这是查询中发生的显着变化。
有人可以指导我是否存在转换问题,因为标准的sql查询查询输出应与旧版输出匹配?
答案 0 :(得分:0)
我不确定您是否真的需要所有这些嵌套查询。 但是您应该充分利用数组上的子查询。例如。会话中的第一个事件交互信息如下:
SELECT
date,
visitStartTime,
(SELECT isInteraction FROM t.hits WHERE type='EVENT' ORDER BY hitNumber ASC LIMIT 1) AS isInteraction
FROM
`project.dataset.ga_sessions_20180624` AS t
LIMIT
1000
因此,只需将(struct-)数组视为较大表中的较小表即可。 如果仅聚合到会话级别,则根本不应该使表变平。 这也是解释向标准sql迁移的文档:https://cloud.google.com/bigquery/docs/reference/standard-sql/migrating-from-legacy-sql