我的数据分散在 3 个字符串列中:col1、col2、col3。
数据集大约有 50 万行,并且每月都会新增数据。
Row setting acceptance_rate undergrads
1 City N/A 2773
2 198 Town 77%
3 133 Suburban 56%
4 55% 254 Suburban
5 54% Rural 46
6 63% City 247
7 100% 210 Rural
我想新建 2 个列,按特定条件把数字归类。我希望新列 acceptance_rate_new
包含所有介于 0 和 1 之间的数字,而 population
包含大于 1 的数字。我原以为下面的 CASE ... WHEN
足以完成此任务——它在字符串到字符串的情况下是有效的,但这次不起作用。我想我需要每个月运行一次这个查询。
-- Asker's attempt: for each row, return the first of col1, col2, col3 whose
-- INTEGER(...) value is greater than 1, exposed as population_new.
-- NOTE(review): INTEGER(expr) looks like a BigQuery legacy-SQL cast — confirm dialect.
-- NOTE(review): the FROM clause has no table name in the original post.
SELECT _name, COALESCE(
CASE WHEN INTEGER(col1) > 1 THEN INTEGER(col1) ELSE NULL END,
CASE WHEN INTEGER(col2) > 1 THEN INTEGER(col2) ELSE NULL END,
CASE WHEN INTEGER(col3) > 1 THEN INTEGER(col3) ELSE NULL END
) AS population_new
FROM
答案 0 :(得分:0)
下面的查询适用于 BigQuery 标准 SQL,它把各个值"重新分配"到各自正确的列中(而不是保持目前在各列之间随机分布的状态)。
在此基础上,你可以再叠加任何你需要的逻辑。
#standardSQL
-- Inline sample rows standing in for the real table; each value may sit in any
-- of the three string columns.
WITH `yourproject.yourdataset.yourtable` AS (
  SELECT 'City' AS setting, 'N/A' AS acceptance_rate, '2773' AS undergrads UNION ALL
  SELECT '198', 'Town', '77%' UNION ALL
  SELECT '133', 'Suburban', '56%' UNION ALL
  SELECT '55%', '254', 'Suburban' UNION ALL
  SELECT '54%', 'Rural', '46' UNION ALL
  SELECT '63%', 'City', '247' UNION ALL
  SELECT '100%', '210', 'Rural'
),
-- Step 1: per row, locate the integer-looking value and the "<integer>%" value,
-- whichever column they landed in.
classified AS (
  SELECT
    setting,
    acceptance_rate,
    undergrads,
    -- SAFE_CAST returns NULL (rather than raising) for text that is not an
    -- integer, so IS NOT NULL is the direct "parses as INT64" test.
    COALESCE(
      IF(SAFE_CAST(setting AS INT64) IS NOT NULL, setting, NULL),
      IF(SAFE_CAST(acceptance_rate AS INT64) IS NOT NULL, acceptance_rate, NULL),
      IF(SAFE_CAST(undergrads AS INT64) IS NOT NULL, undergrads, NULL)
    ) AS undergrads_correct,
    -- A rate must end in '%' and be an integer once the '%' is stripped.
    COALESCE(
      IF(REGEXP_CONTAINS(setting, '%$') AND SAFE_CAST(REPLACE(setting, '%', '') AS INT64) IS NOT NULL, setting, NULL),
      IF(REGEXP_CONTAINS(acceptance_rate, '%$') AND SAFE_CAST(REPLACE(acceptance_rate, '%', '') AS INT64) IS NOT NULL, acceptance_rate, NULL),
      IF(REGEXP_CONTAINS(undergrads, '%$') AND SAFE_CAST(REPLACE(undergrads, '%', '') AS INT64) IS NOT NULL, undergrads, NULL)
    ) AS acceptance_rate_correct
  FROM `yourproject.yourdataset.yourtable`
)
-- Step 2: the setting is the first value that is neither the detected integer,
-- nor the detected rate, nor the 'N/A' placeholder.
SELECT
  setting,
  acceptance_rate,
  undergrads,
  -- Keep the positive IN test: when one of the *_correct values is NULL and
  -- nothing matches, IN yields NULL and IF falls through to the else branch,
  -- which keeps the candidate value. Rewriting with NOT IN would drop it.
  COALESCE(
    IF(setting IN (undergrads_correct, acceptance_rate_correct, 'N/A'), NULL, setting),
    IF(acceptance_rate IN (undergrads_correct, acceptance_rate_correct, 'N/A'), NULL, acceptance_rate),
    IF(undergrads IN (undergrads_correct, acceptance_rate_correct, 'N/A'), NULL, undergrads)
  ) AS setting_correct,
  CAST(undergrads_correct AS INT64) AS undergrads_correct,
  acceptance_rate_correct
FROM classified
结果是
setting acceptance_rate undergrads setting_correct undergrads_correct acceptance_rate_correct
City N/A 2773 City 2773 null
63% City 247 City 247 63%
198 Town 77% Town 198 77%
54% Rural 46 Rural 46 54%
100% 210 Rural Rural 210 100%
133 Suburban 56% Suburban 133 56%
55% 254 Suburban Suburban 254 55%
以下是使用SQL UDF的版本(当然具有相同的输出)
#standardSQL
-- Returns val unchanged when it parses as an INT64, otherwise NULL.
-- SAFE_CAST yields NULL (instead of raising) for unparseable text, so
-- IS NOT NULL is the direct test; an INT64 value can never be NaN.
CREATE TEMP FUNCTION check_int(val STRING) AS (
  IF(SAFE_CAST(val AS INT64) IS NOT NULL, val, NULL)
);
-- Returns val unchanged when it looks like a percentage, otherwise NULL.
-- "Looks like a percentage" means: ends in '%' and is an integer once every
-- '%' is removed. SAFE_CAST yields NULL for non-integer text, so IS NOT NULL
-- is the direct test; an INT64 value can never be NaN.
CREATE TEMP FUNCTION check_rate(val STRING) AS (
  IF(
    REGEXP_CONTAINS(val, '%$')
      AND SAFE_CAST(REPLACE(val, '%', '') AS INT64) IS NOT NULL,
    val,
    NULL
  )
);
-- Picks the setting text for one row: the first of setting, acceptance_rate,
-- undergrads that is not the detected integer (undergrads_correct), not the
-- detected rate (acceptance_rate_correct) and not the 'N/A' placeholder.
-- The IN test is kept positive on purpose: when a *_correct field is NULL and
-- nothing matches, IN yields NULL, the WHEN does not fire, and the ELSE branch
-- keeps the candidate value.
CREATE TEMP FUNCTION check_city(val STRUCT<setting STRING, acceptance_rate STRING, undergrads STRING, undergrads_correct STRING, acceptance_rate_correct STRING>) AS (
  COALESCE(
    CASE
      WHEN val.setting IN (val.undergrads_correct, val.acceptance_rate_correct, 'N/A') THEN NULL
      ELSE val.setting
    END,
    CASE
      WHEN val.acceptance_rate IN (val.undergrads_correct, val.acceptance_rate_correct, 'N/A') THEN NULL
      ELSE val.acceptance_rate
    END,
    CASE
      WHEN val.undergrads IN (val.undergrads_correct, val.acceptance_rate_correct, 'N/A') THEN NULL
      ELSE val.undergrads
    END
  )
);
-- Inline sample rows standing in for the real table; each value may sit in any
-- of the three string columns.
WITH `yourproject.yourdataset.yourtable` AS (
  SELECT 'City' AS setting, 'N/A' AS acceptance_rate, '2773' AS undergrads UNION ALL
  SELECT '198', 'Town', '77%' UNION ALL
  SELECT '133', 'Suburban', '56%' UNION ALL
  SELECT '55%', '254', 'Suburban' UNION ALL
  SELECT '54%', 'Rural', '46' UNION ALL
  SELECT '63%', 'City', '247' UNION ALL
  SELECT '100%', '210', 'Rural'
),
-- Step 1: per row, take the first column accepted by check_int and the first
-- column accepted by check_rate.
classified AS (
  SELECT
    *,
    COALESCE(
      check_int(setting),
      check_int(acceptance_rate),
      check_int(undergrads)
    ) AS undergrads_correct,
    COALESCE(
      check_rate(setting),
      check_rate(acceptance_rate),
      check_rate(undergrads)
    ) AS acceptance_rate_correct
  FROM `yourproject.yourdataset.yourtable`
)
-- Step 2: let check_city pick the remaining value as the setting, and cast the
-- detected integer text to INT64.
SELECT
  setting,
  acceptance_rate,
  undergrads,
  check_city(STRUCT(setting, acceptance_rate, undergrads, undergrads_correct, acceptance_rate_correct)) AS setting_correct,
  CAST(undergrads_correct AS INT64) AS undergrads_correct,
  acceptance_rate_correct
FROM classified
更新 -
would I manually need to enter values for each row one by one?
您不需要手动输入所有数据——只需删除包含全部示例数据的 WITH 部分,并把 `yourproject.yourdataset.yourtable` 替换为您自己的项目、数据集和表名即可,类似于以下查询:
#standardSQL
-- Re-sorts three mixed-up string columns of `myproject.mydataset.mytable`:
-- the integer-looking value becomes undergrads_correct, the "<integer>%" value
-- becomes acceptance_rate_correct, and the remaining text becomes setting_correct.
WITH classified AS (
  SELECT
    -- Only the three columns the query actually uses are read from the table.
    setting,
    acceptance_rate,
    undergrads,
    -- SAFE_CAST returns NULL (rather than raising) for text that is not an
    -- integer, so IS NOT NULL is the direct "parses as INT64" test.
    COALESCE(
      IF(SAFE_CAST(setting AS INT64) IS NOT NULL, setting, NULL),
      IF(SAFE_CAST(acceptance_rate AS INT64) IS NOT NULL, acceptance_rate, NULL),
      IF(SAFE_CAST(undergrads AS INT64) IS NOT NULL, undergrads, NULL)
    ) AS undergrads_correct,
    -- A rate must end in '%' and be an integer once the '%' is stripped.
    COALESCE(
      IF(REGEXP_CONTAINS(setting, '%$') AND SAFE_CAST(REPLACE(setting, '%', '') AS INT64) IS NOT NULL, setting, NULL),
      IF(REGEXP_CONTAINS(acceptance_rate, '%$') AND SAFE_CAST(REPLACE(acceptance_rate, '%', '') AS INT64) IS NOT NULL, acceptance_rate, NULL),
      IF(REGEXP_CONTAINS(undergrads, '%$') AND SAFE_CAST(REPLACE(undergrads, '%', '') AS INT64) IS NOT NULL, undergrads, NULL)
    ) AS acceptance_rate_correct
  FROM `myproject.mydataset.mytable`
)
SELECT
  setting,
  acceptance_rate,
  undergrads,
  -- Keep the positive IN test: when one of the *_correct values is NULL and
  -- nothing matches, IN yields NULL and IF falls through to the else branch,
  -- which keeps the candidate value. Rewriting with NOT IN would drop it.
  COALESCE(
    IF(setting IN (undergrads_correct, acceptance_rate_correct, 'N/A'), NULL, setting),
    IF(acceptance_rate IN (undergrads_correct, acceptance_rate_correct, 'N/A'), NULL, acceptance_rate),
    IF(undergrads IN (undergrads_correct, acceptance_rate_correct, 'N/A'), NULL, undergrads)
  ) AS setting_correct,
  CAST(undergrads_correct AS INT64) AS undergrads_correct,
  acceptance_rate_correct
FROM classified