假设我们有下表
-- Sample rows: three distinct (column_1, column_2, column_3) combinations,
-- each with exactly one NULL column and two different prices.
INSERT INTO my_table
    (column_1, column_2, column_3, price)
VALUES
    (1,    NULL, 1,    54.99),
    (1,    NULL, 1,    69.50),
    (NULL, 2,    2,    54.99),
    (NULL, 2,    2,    69.50),
    (3,    3,    NULL, 54.99),
    (3,    3,    NULL, 69.50);
使用以下数据
-- One row per distinct (column_1, column_2, column_3) combination found in
-- my_table, numbered with a sequential aggregation_id. The window has no
-- ORDER BY, so which combination receives which id is unspecified.
CREATE TABLE my_table_aggregations AS
SELECT
    ROW_NUMBER() OVER () AS aggregation_id,
    combos.column_1,
    combos.column_2,
    combos.column_3
FROM (
    SELECT DISTINCT
        column_1,
        column_2,
        column_3
    FROM my_table
) AS combos;
现在我们做类似上面的事情(由my_table生成my_table_aggregations)。
我现在要做的是为my_table中的每个record_id分配一个aggregation_id。但因为列中有NULL值,我不能简单地按t1.column_1 = t2.column_1(NULL = NULL的结果为NULL)
加入,因为
-- NULL-safe join: a column pair matches when both values are equal or when
-- both sides are NULL.
SELECT
    t.record_id,
    agg.aggregation_id
FROM my_table AS t
INNER JOIN my_table_aggregations AS agg
    ON  (t.column_1 = agg.column_1 OR (t.column_1 IS NULL AND agg.column_1 IS NULL))
    AND (t.column_2 = agg.column_2 OR (t.column_2 IS NULL AND agg.column_2 IS NULL))
    AND (t.column_3 = agg.column_3 OR (t.column_3 IS NULL AND agg.column_3 IS NULL));
直接用等号比较时,NULL = NULL的结果为NULL,因此连接将排除这些记录;上面的查询用OR显式处理了NULL。
除此之外,我知道还可以使用这样的东西
-- Join by mapping NULL to the sentinel -1 on both sides so that NULLs compare
-- as equal. A real -1 would also match NULL, so this is only correct when -1
-- never occurs in these columns.
SELECT
    t.record_id,
    agg.aggregation_id
FROM my_table AS t
INNER JOIN my_table_aggregations AS agg
    ON  COALESCE(t.column_1, -1) = COALESCE(agg.column_1, -1)
    AND COALESCE(t.column_2, -1) = COALESCE(agg.column_2, -1)
    AND COALESCE(t.column_3, -1) = COALESCE(agg.column_3, -1);
使用OR的问题是我正在处理数以亿计的记录,而连接条件中的OR似乎永远也运行不完。
另一种选择就是上面使用COALESCE的查询,
但它的问题是我假设了这些列中没有任何值为-1。
请注意,这只是一个示例,我很清楚可以使用DENSE_RANK
来获得相同的结果。所以我们假装这不是一个选择。
是否有某种巧妙的方法,既不必使用OR,
又能在结果正确的前提下保持COALESCE
的性能?我运行了测试:与OR相比,COALESCE
的速度比
// SoapClient constructor options.
$options = [
    'trace'      => true,
    'cache_wsdl' => WSDL_CACHE_NONE,
    // Make failed calls return a SoapFault instead of throwing it; the
    // is_soap_fault() check below relies on this.
    'exceptions' => false,
];

// Credentials sent in the SOAP header.
$credentials = [
    'username' => 'username',
    'password' => 'password',
];

$header = new SoapHeader($NAMESPACE, 'AuthentificationInfo', $credentials);
$client = new SoapClient($WSDL, $options); // null for non-wsdl mode
$client->__setSoapHeaders($header);

$params = [
    // Your parameters
];

// 'GetResult' being the name of the soap method
$result = $client->GetResult($params);

// Log the fault code and message when the call failed.
if (is_soap_fault($result)) {
    error_log("SOAP Fault: (faultcode: {$result->faultcode}, faultstring: {$result->faultstring})");
}
快10倍。
我在Greenplum数据库上运行它,所以我不确定在标准的Postgres数据库上这种性能差异是否相同。
答案 0 :(得分:1)
由于我的NULLIF解决方案存在性能问题,而且您对COALESCE的使用速度要快得多,我想知道您是否可以尝试调整该解决方案来解决-1的问题。为此,您可以尝试强制转换以避免错误匹配。我不确定性能会受到什么影响,但它看起来像是:
-- Compare the columns as text and map NULL to the placeholder 'NA' on both
-- sides; the text form of a number can never equal 'NA'.
SELECT
    t.record_id,
    agg.aggregation_id
FROM my_table AS t
INNER JOIN my_table_aggregations AS agg
    ON  COALESCE(CAST(t.column_1 AS varchar), 'NA') = COALESCE(CAST(agg.column_1 AS varchar), 'NA')
    AND COALESCE(CAST(t.column_2 AS varchar), 'NA') = COALESCE(CAST(agg.column_2 AS varchar), 'NA')
    AND COALESCE(CAST(t.column_3 AS varchar), 'NA') = COALESCE(CAST(agg.column_3 AS varchar), 'NA');
答案 1 :(得分:1)
经过一番思考,我认为最好的方法是为每个列动态查找一个值,用作连接中COALESCE
的第二个参数。这个函数相当长,但它可以满足我的需求,更重要的是,这种方式可以保持COALESCE
的性能,唯一的缺点是获取MIN
值需要额外的时间,不过也就是一分钟左右。
这是函数:
-- Computes, for each pair of join columns, a negative integer that occurs in
-- neither column, so it can be used as the second argument of COALESCE when
-- joining on nullable columns.
--
-- Parameters:
--   left_table_schema / right_table_schema / output_table_schema
--       Schema of the respective table; NULL means the table is temporary
--       and its name is used unqualified.
--   left_table_name / right_table_name / output_table_name
--       Table names; must not be NULL.
--   left_table_columns / right_table_columns
--       Non-empty arrays of column names, paired by position; both arrays
--       must have the same upper bound.
--
-- Effects:
--   (Re)creates the output table with one row and one column per entry of
--   left_table_columns (named after the left column), each holding
--   LEAST(MIN(left column) - 1, MIN(right column) - 1, -1).
--   Uses, and finally drops, the helper tables temp_null_join_left_table and
--   temp_null_join_right_table.
--
-- Returns: the SQL text that was executed.
-- Raises:  an exception when a required argument is NULL, a column array is
--          empty, or the two column arrays differ in length.
--
-- NOTE: schema, table and column names are concatenated into dynamic SQL
-- without escaping; pass trusted identifiers only.
CREATE OR REPLACE FUNCTION pg_temp.get_null_join_int_value
(
    left_table_schema TEXT,
    left_table_name TEXT,
    left_table_columns TEXT[],
    right_table_schema TEXT,
    right_table_name TEXT,
    right_table_columns TEXT[],
    output_table_schema TEXT,
    output_table_name TEXT
) RETURNS TEXT AS
$$
DECLARE
    col_name TEXT;
    sql TEXT;
    complete_sql TEXT;
    full_left_table TEXT;
    full_right_table TEXT;
    full_output_table TEXT;
BEGIN
    /*****************************
    VALIDATE PARAMS
    ******************************/
    -- required arguments must not be NULL, the column arrays must not be
    -- empty, and both arrays must have the same length
    IF (left_table_name IS NULL) THEN
        RAISE EXCEPTION 'left_table_name cannot be NULL';
    ELSIF (left_table_columns IS NULL) THEN
        RAISE EXCEPTION 'left_table_columns cannot be NULL';
    ELSIF (right_table_name IS NULL) THEN
        RAISE EXCEPTION 'right_table_name cannot be NULL';
    ELSIF (right_table_columns IS NULL) THEN
        RAISE EXCEPTION 'right_table_columns cannot be NULL';
    ELSIF (output_table_name IS NULL) THEN
        RAISE EXCEPTION 'output_table_name cannot be NULL';
    ELSIF (array_upper(left_table_columns, 1) IS NULL) THEN
        RAISE EXCEPTION 'left_table_columns cannot be an empty array';
    ELSIF (array_upper(right_table_columns, 1) IS NULL) THEN
        RAISE EXCEPTION 'right_table_columns cannot be an empty array';
    ELSIF (array_upper(left_table_columns, 1) <> array_upper(right_table_columns, 1)) THEN
        RAISE EXCEPTION 'left_table_columns and right_table_columns must have a matching array length';
    END IF;

    /************************
    TABLE NAMES
    *************************/
    -- a NULL schema means the table is temporary, so the name stays
    -- unqualified; (schema || '.') is NULL in that case and COALESCE turns
    -- it into an empty prefix
    full_left_table   := COALESCE(left_table_schema || '.', '') || left_table_name;
    full_right_table  := COALESCE(right_table_schema || '.', '') || right_table_name;
    full_output_table := COALESCE(output_table_schema || '.', '') || output_table_name;

    /**********************
    LEFT TABLE
    ***********************/
    -- helper table holding MIN(column) - 1 for every left column:
    -- a value known not to occur in that column
    sql :=
        'DROP TABLE IF EXISTS temp_null_join_left_table;' || E'\n' ||
        'CREATE TEMP TABLE temp_null_join_left_table AS' || E'\n' ||
        'SELECT';

    FOR col_name IN SELECT UNNEST(left_table_columns) LOOP
        sql := sql || E'\n\t' || 'MIN("' || col_name || '")-1 AS "' || col_name || '",';
    END LOOP;

    -- remove the trailing comma left by the loop
    sql := TRIM(TRAILING ',' FROM sql);

    sql := sql || E'\n' ||
        'FROM ' || full_left_table || ';';

    EXECUTE sql;

    -- collect every executed statement for the return value
    complete_sql := sql;

    /************************
    RIGHT TABLE
    *************************/
    -- same as above, for the right columns
    sql :=
        'DROP TABLE IF EXISTS temp_null_join_right_table;' || E'\n' ||
        'CREATE TEMP TABLE temp_null_join_right_table AS' || E'\n' ||
        'SELECT';

    FOR col_name IN SELECT UNNEST(right_table_columns) LOOP
        sql := sql || E'\n\t' || 'MIN("' || col_name || '")-1 AS "' || col_name || '",';
    END LOOP;

    -- remove the trailing comma left by the loop
    sql := TRIM(TRAILING ',' FROM sql);

    -- the right-hand minimums must be read from the RIGHT table
    sql := sql || E'\n' ||
        'FROM ' || full_right_table || ';';

    EXECUTE sql;

    complete_sql := complete_sql || E'\n\n' || sql;

    /************************
    OUTPUT TABLE
    *************************/
    -- the output table is TEMP only when no output schema was given; the
    -- ELSE '' keeps the concatenation from becoming NULL otherwise
    sql :=
        'DROP TABLE IF EXISTS ' || full_output_table || ';' || E'\n' ||
        'CREATE ' || (CASE WHEN output_table_schema IS NULL THEN 'TEMP ' ELSE '' END) || 'TABLE ' || full_output_table || ' AS' || E'\n' ||
        'SELECT';

    -- walk both column arrays by index so left and right columns stay paired
    FOR i IN coalesce(array_lower(left_table_columns, 1), 1)..coalesce(array_upper(left_table_columns, 1), 1) LOOP
        -- LEAST picks the smallest of both minimums; the hardcoded -1 makes
        -- sure the result is negative even when both minimums are positive
        sql := sql || E'\n\t' || 'LEAST(l."' || left_table_columns[i] || '", r."' || right_table_columns[i] || '", -1) AS "' || left_table_columns[i] || '",';
    END LOOP;

    -- remove the trailing comma left by the loop
    sql := TRIM(TRAILING ',' FROM sql);

    sql := sql || E'\n' ||
        'FROM temp_null_join_left_table l' || E'\n' ||
        'CROSS JOIN temp_null_join_right_table r' || ';';

    EXECUTE sql;

    complete_sql := complete_sql || E'\n\n' || sql;

    /************************
    CLEAN UP
    *************************/
    -- the helper tables are no longer needed
    sql :=
        'DROP TABLE IF EXISTS temp_null_join_left_table;' || E'\n' ||
        'DROP TABLE IF EXISTS temp_null_join_right_table;';

    EXECUTE sql;

    complete_sql := complete_sql || E'\n\n' || sql;

    -- return everything that was run, useful for debugging
    RETURN complete_sql;
END;
$$
LANGUAGE plpgsql;
以下是函数
的示例用法
-- Example call: the left table is public.my_table, the right table is
-- public.my_table_aggregations, and the result goes into the table
-- temp_null_join_values (no output schema is given).
SELECT pg_temp.get_null_join_int_value(
    'public',                                   -- left table schema
    'my_table',                                 -- left table name
    ARRAY['column_1', 'column_2', 'column_3'],  -- left table columns
    'public',                                   -- right table schema
    'my_table_aggregations',                    -- right table name
    ARRAY['column_1', 'column_2', 'column_3'],  -- right table columns
    NULL,                                       -- output table schema
    'temp_null_join_values'                     -- output table name
);
创建temp_null_join_values
表后,您可以在连接条件中用子查询作为COALESCE
的第二个参数。
-- Materialise the record_id -> aggregation_id mapping. The NULL placeholder
-- for each column is read from temp_null_join_values with a scalar subquery
-- and applied to both sides of the comparison.
DROP TABLE IF EXISTS temp_result_table;

CREATE TEMP TABLE temp_result_table AS
SELECT
    t.record_id,
    agg.aggregation_id
FROM public.my_table AS t
INNER JOIN my_table_aggregations AS agg
    ON  COALESCE(t.column_1,   (SELECT column_1 FROM temp_null_join_values))
      = COALESCE(agg.column_1, (SELECT column_1 FROM temp_null_join_values))
    AND COALESCE(t.column_2,   (SELECT column_2 FROM temp_null_join_values))
      = COALESCE(agg.column_2, (SELECT column_2 FROM temp_null_join_values))
    AND COALESCE(t.column_3,   (SELECT column_3 FROM temp_null_join_values))
      = COALESCE(agg.column_3, (SELECT column_3 FROM temp_null_join_values));
我希望这有助于某人
答案 2 :(得分:0)
怎么样:
-- NULL-safe join without OR or a sentinel value. NULLIF(x, y) is NULL when x
-- is NULL or x = y, so requiring it in both directions matches a column pair
-- only when the values are equal or both are NULL.
SELECT
    t.record_id,
    agg.aggregation_id  -- the aggregation table is aliased "agg"
FROM my_table AS t
INNER JOIN my_table_aggregations AS agg
    ON  NULLIF(t.column_1, agg.column_1) IS NULL
    AND NULLIF(agg.column_1, t.column_1) IS NULL
    AND NULLIF(t.column_2, agg.column_2) IS NULL
    AND NULLIF(agg.column_2, t.column_2) IS NULL
    AND NULLIF(t.column_3, agg.column_3) IS NULL
    AND NULLIF(agg.column_3, t.column_3) IS NULL;