我正在尝试将以下SQL Server查询转换为等价的Greenplum版本:
-- Stage every part together with its file id and location (file.edl_desc) in #TMP1.
-- Every source table is read with the NOLOCK table hint.
INSERT INTO #TMP1 (part_id, file_id, location, measure_date)
SELECT DISTINCT
    pt.part_id,
    qf.file_id,
    qf.edl_desc,
    pt.measure_date
FROM part pt WITH (NOLOCK)
INNER JOIN file_model qm WITH (NOLOCK)
    ON pt.file_model_id = qm.file_model_id
INNER JOIN file qf WITH (NOLOCK)
    ON qm.file_id = qf.file_id;
-- For every (file_id, location) pair that has a row measured within the last 30 days,
-- collect that pair's 300 most recently measured parts into @part_list.
INSERT INTO @part_list (file_id, part_id, measure_date)
SELECT DISTINCT
    t1.file_id,
    k.part_id,
    k.measure_date
FROM #TMP1 t1 WITH (NOLOCK)
CROSS APPLY (
    -- Newest 300 distinct parts sharing the outer row's file_id and location.
    SELECT DISTINCT TOP (300)
        t2.part_id,
        t2.measure_date
    FROM #TMP1 t2 WITH (NOLOCK)
    WHERE t1.file_id = t2.file_id
      AND t1.location = t2.location
    ORDER BY t2.measure_date DESC
) k
-- Negative offset: the cutoff lies 30 days in the past. The earlier +30 put the
-- cutoff 30 days in the future, so only future-dated rows could qualify.
WHERE t1.measure_date >= DATEADD(day, -30, GETDATE());
这里的思路是:对于过去30天内处于活动状态(即生产过零件)的每个零件程序,最终表中都包含该程序最近的300个零件。
根据另一个问题的回答,我知道LATERAL JOIN可以实现这一点;但我所在的组织使用的是不支持LATERAL的旧版Postgres,所以我只好改用下面这个函数来实现:
-- BuildActiveParts(p_day, p_n)
--   p_day: look-back window in days, subtracted from current_date.
--   p_n:   maximum number of rows emitted per (file_id, location) pair.
-- Returns SETOF RECORD with rows shaped (part_id, file_id, measure_date, location),
-- so the caller has to supply a column definition list.
-- NOTE(review): this function reads qf.location while the other snippets read
-- qf.edl_desc -- confirm which column the file table actually has.
CREATE FUNCTION BuildActiveParts(p_day INT, p_n INT)
RETURNS SETOF RECORD --TABLE (part_id bigint,file_id int, measure_date timestamp, location varchar(255))
AS $$
DECLARE
    active_prog RECORD;  -- one (file_id, location) pair seen inside the window
    recent_part RECORD;  -- one part row emitted for that pair
BEGIN
    -- Outer loop: every (file_id, location) pair with a part measured on or after
    -- current_date - p_day.
    FOR active_prog IN
        SELECT DISTINCT qf.file_id, qf.location
        FROM part pt
        INNER JOIN file_model qm ON pt.file_model_id = qm.file_model_id
        INNER JOIN file qf ON qm.file_id = qf.file_id
        WHERE pt.measure_date >= current_date - p_day
    LOOP
        -- Inner loop: the newest p_n distinct rows for that pair, with no date filter.
        FOR recent_part IN
            SELECT DISTINCT pt.part_id, qf.file_id, pt.measure_date, qf.location
            FROM part pt
            INNER JOIN file_model qm ON pt.file_model_id = qm.file_model_id
            INNER JOIN file qf ON qm.file_id = qf.file_id
            WHERE qf.file_id = active_prog.file_id
              AND qf.location = active_prog.location
            ORDER BY pt.measure_date DESC
            LIMIT p_n
        LOOP
            RETURN NEXT recent_part;
        END LOOP;
    END LOOP;
END
$$ LANGUAGE plpgsql;
-- Later used in:
-- Build list of all active programs in last p_day days. This temporary table is a
-- component of a larger function that produces a table based on this and other
-- calculations, called daily.
-- Note: this insert yields 'function cannot execute because it accesses relation'
INSERT INTO TMP_part_list (part_id, file_id, measure_date, location)
SELECT DISTINCT
    active_parts.part_id,
    active_parts.file_id,
    active_parts.measure_date,
    active_parts.location
FROM BuildActiveParts(p_day, p_n)
    AS active_parts (part_id int, file_id text, measure_date timestamp, location text);
不幸的是,这个函数的结果需要插入到另一个表中(这是我的业务需求中无法回避的现实)。因此,虽然该函数单独运行时能返回正确的结果,但当我尝试将其用于预期目的时,却会得到错误:function cannot execute on segment because it accesses relation
虽然我见过“改用VIEW”之类的建议,但这并不可行,因为这样一来,包含该函数的脚本查询起来会耗时太长。
除了花上几个月走完繁琐的审批流程、说服我的组织升级他们的系统之外,我还能做些什么来解决这个问题?
编辑:以下是基于评论的一些尝试:
使用函数的尝试,因出现 function cannot execute on segment because it accesses relation 错误而失败:
DROP FUNCTION IF EXISTS BuildRecentParts(TEXT, TEXT, INT);
-- BuildRecentParts(file_id, location_in, p_n)
--   file_id:     file id to match against file.file_id.
--   location_in: value to match against file.edl_desc.
--   p_n:         maximum number of rows returned.
-- Returns SETOF RECORD with rows shaped (measure_date, part_id), newest first,
-- so the caller has to supply a column definition list.
CREATE FUNCTION BuildRecentParts(file_id TEXT, location_in TEXT, p_n INT)
RETURNS SETOF RECORD --TABLE (measure_date timestamp, part_id bigint)
AS $$
DECLARE
    -- The first parameter shares its name with the file_id columns used below.
    -- Referring to it through a distinct alias keeps the predicate unambiguous
    -- without changing the function signature.
    p_file_id ALIAS FOR $1;
    part_list RECORD;
BEGIN
    FOR part_list IN
        SELECT DISTINCT pt.measure_date, pt.part_id
        FROM part pt
        INNER JOIN file_model qm ON qm.file_model_id = pt.file_model_id
        INNER JOIN file qf ON qf.file_id = qm.file_id
        WHERE qf.file_id = p_file_id
          AND qf.edl_desc = location_in
        ORDER BY pt.measure_date DESC
        LIMIT p_n
    LOOP
        RETURN NEXT part_list;
    END LOOP;
END
$$ LANGUAGE plpgsql;
-- Attempt: for each (file_id, edl_desc) pair with a part measured on or after
-- current_date - 30, fetch that pair's newest parts through a correlated subquery.
-- The select-list subquery projects two columns and is limited to 300 rows; a
-- scalar subquery may only return a single column, so this form is rejected.
SELECT DISTINCT
    qf.file_id,
    qf.edl_desc,
    (
        SELECT pti.measure_date, pti.part_id
        FROM part pti
        INNER JOIN file_model qmi ON pti.file_model_id = qmi.file_model_id
        INNER JOIN file qfi ON qmi.file_id = qfi.file_id
        WHERE qfi.file_id = qf.file_id
          AND qfi.edl_desc = qf.edl_desc
        ORDER BY pti.measure_date DESC
        LIMIT 300
    )
FROM part pt
INNER JOIN file_model qm ON pt.file_model_id = qm.file_model_id
INNER JOIN file qf ON qm.file_id = qf.file_id
WHERE pt.measure_date >= current_date - 30;
不使用函数的尝试,因子查询返回多个列而无法运行:
-- Staging table holding one row per distinct part / file / location / measurement,
-- distributed across segments by part_id.
CREATE TEMPORARY TABLE TMP_TMP1 (
    part_id      bigint,
    file_id      varchar(255),
    location     varchar(255),
    measure_date timestamp
) DISTRIBUTED BY (part_id);

INSERT INTO TMP_TMP1 (part_id, file_id, location, measure_date)
SELECT DISTINCT
    pt.part_id,
    qf.file_id,
    qf.edl_desc,
    pt.measure_date
FROM part pt
INNER JOIN file_model qm ON pt.file_model_id = qm.file_model_id
INNER JOIN file qf ON qm.file_id = qf.file_id;

-- Refresh planner statistics for the freshly loaded table.
ANALYZE TMP_TMP1;

-- Same correlated-subquery idea against the staging table. The select-list
-- subquery still projects two columns (and up to 300 rows).
SELECT DISTINCT
    t1.file_id,
    t1.location,
    (
        SELECT t2.measure_date, t2.part_id
        FROM TMP_TMP1 t2
        WHERE t2.file_id = t1.file_id
          AND t2.location = t1.location
        ORDER BY t2.measure_date DESC
        LIMIT 300
    )
FROM TMP_TMP1 t1
WHERE t1.measure_date >= current_date - 30;
我还尝试了一种递归CTE,但发现它不受支持。
答案 0 :(得分:0)
首先,Greenplum不支持脏读,所以你原有的nolock提示无法照搬过来。这也许是件好事,我建议在SQL Server中也把它去掉。
我认为这里最好的解决方案是使用分析(窗口)函数,而不是自定义函数,甚至也不用Greenplum所支持的相关子查询。这种方法在SQL Server中同样更高效。
-- Latest 300 parts for each (file_id, location) program that measured at least one
-- part in the last 30 days. file_id is appended as a trailing column so the leading
-- three output columns keep their original order.
SELECT sub2.part_id, sub2.location, sub2.measure_date, sub2.file_id
FROM (
    SELECT
        sub1.part_id,
        sub1.file_id,
        sub1.location,
        sub1.measure_date,
        -- Rank inside each program, newest first. Partitioning by part_id alone would
        -- apply the cap of 300 per part instead of per program. part_id breaks ties
        -- so the ranking is deterministic.
        row_number() OVER (
            PARTITION BY sub1.file_id, sub1.location
            ORDER BY sub1.measure_date DESC, sub1.part_id DESC
        ) AS rownum,
        -- Newest measurement of the program; decides whether the program is active.
        max(sub1.measure_date) OVER (
            PARTITION BY sub1.file_id, sub1.location
        ) AS last_measure_date
    FROM (
        -- One row per distinct part / program / measurement. No date filter here, so
        -- an active program's older parts can still fill its 300 slots.
        SELECT pt.part_id, qf.file_id, qf.edl_desc AS location, pt.measure_date
        FROM part pt
        INNER JOIN file_model qm ON qm.file_model_id = pt.file_model_id
        INNER JOIN file qf ON qf.file_id = qm.file_id
        GROUP BY pt.part_id, qf.file_id, qf.edl_desc, pt.measure_date
    ) AS sub1
) AS sub2
WHERE sub2.rownum <= 300
  AND sub2.last_measure_date >= (now() - interval '30 days');
不过,我只能对你的数据做一些猜测:如果存在多个qf.qcc_file_desc值,你的原始查询可能会出问题,因为原始查询的分组(去重)中包含了这一列。如果确实存在多个值,结果就会变得很难看。
在不了解你的数据的情况下,我对row_number函数的分区方式也没有100%的把握。也可能应该是这样:
ROW_NUMBER() OVER (PARTITION BY sub1.part_id, sub1.location ORDER BY sub1.measure_date DESC)
答案 1 :(得分:0)
综合这里的回答以及我所在组织的架构师的意见,我们认为已经触及了Greenplum的一个难以克服的限制。因此,执行交叉连接的逻辑将转移到R脚本中,该脚本会调用本函数原本所属的那个存储过程。