这不是您的标准“我如何找到重复”的问题,我知道如何查找重复项,请参阅下文。这个问题是如何更新所有具有匹配记录的子项目的记录?
好吧,我将给你整个场景,以便你可以解决这个问题。
由于关键系统故障,可能会插入重复记录。
查找稍后的重复项并标记父委员会Commission_import_commission_junction“is_processed = True”解决了这个问题。
复杂的是,在要比较的列上,commission_import_commission_junction及其子级commission_import_commission_junction_line_items必须相同。
表格是:
commission_import_commission_junction
- id
- created_date
- some columns that are checked for duplication
- some columns that are not checked for duplication
commission_import_commission_junction_line_items
- id
- some columns that are checked for duplication
- some columns that are not checked for duplication
(对于完整的表规范,请查看最底部代码块中的CREATE TABLE语句。)
仅在父表上标记重复项的查询 commission_import_commission_junction:
UPDATE commission_import_commission_junction cicj
SET is_processed = TRUE
FROM (
SELECT MIN(created_date) AS first_date, member_id, site_id, action_status, action_type, ad_id, commission_id, country, event_date, locking_date, order_id, original, original_action_id, posting_date, website_id, advertiser_name, commission_amount, sale_amount, aggregator_affiliate_id
FROM commission_import_commission_junction inner_imports
JOIN commission_import_commission_junction_line_items inner_items ON inner_items.commission_import_commission_junction_id = inner_imports.commission_import_commission_junction_id
GROUP BY member_id, site_id, action_status, action_type, ad_id, commission_id, country, event_date, locking_date, order_id, original, original_action_id, posting_date, website_id, advertiser_name, commission_amount, sale_amount, aggregator_affiliate_id
HAVING (COUNT(*) > 1)
) AS dups
WHERE
-- MAIN TABLE COLUMNN LIST
(cicj.member_id, cicj.site_id, cicj.action_status, cicj.action_type, cicj.ad_id, cicj.commission_id, cicj.country, cicj.event_date, cicj.locking_date, cicj.order_id, cicj.original, cicj.original_action_id, cicj.posting_date, cicj.website_id, cicj.advertiser_name, cicj.commission_amount, cicj.sale_amount, cicj.aggregator_affiliate_id)
IS NOT DISTINCT FROM
-- OTHER TABLE COLUMN LIST
(dups.member_id, dups.site_id, dups.action_status, dups.action_type, dups.ad_id, dups.commission_id, dups.country, dups.event_date, dups.locking_date, dups.order_id, dups.original, dups.original_action_id, dups.posting_date, dups.website_id, dups.advertiser_name, dups.commission_amount, dups.sale_amount, dups.aggregator_affiliate_id)
AND cicj.created_date <> dups.first_date
AND cicj.is_processed = FALSE;
在某处,某种程度上我需要检查line_items是否也是重复的。
下面的代码是设置数据库,请记住这是postgres特定的。
-- "commission_import_build" is a record that keeps information about the process of collecting the commission information. Duplicate commission_import_commission_junction records will not exist with the same commission_import_build_id
-- "commission_import_commission_junction" is a record description commission information from a customers purchase.
-- "commission_import_commission_junction_line_items" are records describing items in that purchase.
DROP TABLE IF EXISTS commission_import_commission_junction_line_items;
DROP TABLE IF EXISTS commission_import_commission_junction;
DROP TABLE IF EXISTS commission_import_builds;
CREATE TABLE commission_import_builds
(
commission_import_build_id serial NOT NULL,
build_date timestamp with time zone NOT NULL,
CONSTRAINT pkey_commission_import_build_id PRIMARY KEY (commission_import_build_id),
CONSTRAINT commission_import_builds_build_date_key UNIQUE (build_date)
);
INSERT INTO commission_import_builds (commission_import_build_id, build_date) VALUES (1, '2011-01-01');
INSERT INTO commission_import_builds (commission_import_build_id, build_date) VALUES (2, '2011-01-02');
INSERT INTO commission_import_builds (commission_import_build_id, build_date) VALUES (3, '2011-01-03');
CREATE TABLE commission_import_commission_junction
(
commission_import_commission_junction_id serial NOT NULL,
member_id integer,
site_id integer,
action_status character varying NOT NULL,
action_type character varying NOT NULL,
ad_id bigint,
commission_id bigint NOT NULL,
country character varying,
event_date timestamp with time zone NOT NULL,
locking_date timestamp with time zone,
order_id character varying NOT NULL,
original boolean,
original_action_id bigint NOT NULL,
posting_date timestamp with time zone NOT NULL,
website_id bigint NOT NULL,
advertiser_name character varying,
commission_amount numeric(19,2) NOT NULL,
sale_amount numeric(19,2) NOT NULL,
aggregator_affiliate_id integer NOT NULL,
is_processed boolean NOT NULL DEFAULT false,
created_date timestamp with time zone NOT NULL DEFAULT now(),
member_transaction_id integer,
commission_import_build_id integer NOT NULL,
CONSTRAINT pkey_commission_import_commission_junction_commission_import_co PRIMARY KEY (commission_import_commission_junction_id),
CONSTRAINT fk_commission_import_commission_junction_commission_import_buil FOREIGN KEY (commission_import_build_id)
REFERENCES commission_import_builds (commission_import_build_id) MATCH SIMPLE
ON UPDATE NO ACTION ON DELETE NO ACTION
);
CREATE INDEX idx_commission_import_commission_junction_is_processed
ON commission_import_commission_junction
USING btree
(is_processed);
INSERT INTO commission_import_commission_junction (commission_import_commission_junction_id, action_status, action_type, commission_id, event_date, order_id, original_action_id, posting_date, website_id, commission_amount, sale_amount, aggregator_affiliate_id, commission_import_build_id, created_date) VALUES
(1, 'new', 'sale', 1234, '2011-02-04 14:39:52.989499-07', 'test-order', 1234567, '2011-02-04 14:39:52.989499-07', 123, 12.35, 123.45, 9876, 1, '2011-02-05');
INSERT INTO commission_import_commission_junction (commission_import_commission_junction_id, action_status, action_type, commission_id, event_date, order_id, original_action_id, posting_date, website_id, commission_amount, sale_amount, aggregator_affiliate_id, commission_import_build_id, created_date) VALUES
(2, 'new', 'sale', 1234, '2011-02-04 14:39:52.989499-07', 'test-order', 1234567, '2011-02-04 14:39:52.989499-07', 123, 12.35, 123.45, 9876, 2, '2011-02-06');
INSERT INTO commission_import_commission_junction (commission_import_commission_junction_id, action_status, action_type, commission_id, event_date, order_id, original_action_id, posting_date, website_id, commission_amount, sale_amount, aggregator_affiliate_id, commission_import_build_id, created_date) VALUES
(3, 'new', 'sale', 1234, '2011-02-04 14:39:52.989499-07', 'test-order', 1234567, '2011-02-04 14:39:52.989499-07', 123, 12.35, 123.45, 9876, 3, '2011-02-07');
SELECT * FROM commission_import_commission_junction;
CREATE TABLE commission_import_commission_junction_line_items
(
commission_import_commission_junction_line_item_id serial NOT NULL,
commission_import_commission_junction_id integer NOT NULL,
sku character varying,
quantity integer,
posting_date timestamp with time zone,
sale_amount numeric(19,2),
discount numeric(19,2),
CONSTRAINT pkey_commission_import_commission_junction_link_items_commissio PRIMARY KEY (commission_import_commission_junction_line_item_id),
CONSTRAINT fkey_commission_import_commission_junction_line_items_commissio FOREIGN KEY (commission_import_commission_junction_id)
REFERENCES commission_import_commission_junction (commission_import_commission_junction_id) MATCH SIMPLE
ON UPDATE NO ACTION ON DELETE NO ACTION
);
INSERT INTO commission_import_commission_junction_line_items (commission_import_commission_junction_id, sku, quantity, sale_amount) VALUES (1, 'test1', 3, 23.45);
INSERT INTO commission_import_commission_junction_line_items (commission_import_commission_junction_id, sku, quantity, sale_amount) VALUES (1, 'test2', 3, 67.50);
INSERT INTO commission_import_commission_junction_line_items (commission_import_commission_junction_id, sku, quantity, sale_amount) VALUES (1, 'test3', 3, 32.50);
INSERT INTO commission_import_commission_junction_line_items (commission_import_commission_junction_id, sku, quantity, sale_amount) VALUES (2, 'test1', 3, 23.45);
INSERT INTO commission_import_commission_junction_line_items (commission_import_commission_junction_id, sku, quantity, sale_amount) VALUES (2, 'test2', 3, 67.50);
INSERT INTO commission_import_commission_junction_line_items (commission_import_commission_junction_id, sku, quantity, sale_amount) VALUES (2, 'test3', 3, 32.50);
INSERT INTO commission_import_commission_junction_line_items (commission_import_commission_junction_id, sku, quantity, sale_amount) VALUES (3, 'test1', 3, 23.45);
INSERT INTO commission_import_commission_junction_line_items (commission_import_commission_junction_id, sku, quantity, sale_amount) VALUES (3, 'test2', 3, 67.50);
INSERT INTO commission_import_commission_junction_line_items (commission_import_commission_junction_id, sku, quantity, sale_amount) VALUES (3, 'test3', 3, 32.50);
答案 0 :(得分:0)
让我想起直接营销邮件列表中的重复删除
无论表格的详细信息如何,父子欺骗消除算法都遵循以下步骤:
1)将重复项复制到一个列表中,该列表将旧密钥与新密钥(临时表)匹配
2)更新子表中的外键
3)从父母
中删除欺骗我很欣赏你帖子中的细节,但是我会用一些示例表/列名来简单易读:
-- step 1, get the list
-- Warning: t-sql syntax, adjust for Postgres
-- if it doesn't like placement of "into..." clause
select keep.primaryKey as keepKey
, dupe.primaryKey as dupeKey
into #DupeList
from (
select min(primaryKey) as primaryKey
, dupeCriteria1
, dupeCriteria2
FROM theTable
group by dupeCriteria1,dupeCritera2
having count(*) > 1
) keep
JOIN theTable dupe
ON keep.dupeCriteria1 = dupe.dupeCriteria1
AND keep.dupeCriteria2 = dupe.dupeCriteria2
AND keep.primaryKey <> dupe.primaryKey
完成后,更新子表中的外键:
update childTable
set foreignKey = #temp1.keepKey
from #temp1
where foreignKey = #temp1.dupeKey
然后只删除父表中的所有内容:
delete from parentTable
where primaryKey in (select dupeKey from #temp1)
答案 1 :(得分:0)
CREATE FUNCTION removeCommissionImportCommissionJunctionDuplicates() RETURNS INT AS $BODY$ DECLARE duplicate RECORD; DECLARE parent RECORD; DECLARE children commission_import_commission_junction_line_items[]; DECLARE duplicate_children commission_import_commission_junction_line_items[]; DECLARE duplicate_child_count INT; DECLARE child commission_import_commission_junction_line_items; DECLARE duplicate_child commission_import_commission_junction_line_items; DECLARE num_updates INT; BEGIN
SELECT * FROM (SELECT 0) AS value INTO num_updates;
FOR duplicate IN
SELECT cicj.*, dups.first_date
FROM commission_import_commission_junction cicj
JOIN (SELECT MIN(created_date) AS first_date, member_id, site_id, action_status, action_type, ad_id, commission_id, country, event_date, locking_date, order_id, original, original_action_id, posting_date, website_id, advertiser_name, commission_amount, sale_amount, aggregator_affiliate_id
FROM commission_import_commission_junction inner_imports
GROUP BY member_id, site_id, action_status, action_type, ad_id, commission_id, country, event_date, locking_date, order_id, original, original_action_id, posting_date, website_id, advertiser_name, commission_amount, sale_amount, aggregator_affiliate_id
HAVING (COUNT(*) > 1)) AS dups
ON (cicj.member_id, cicj.site_id, cicj.action_status, cicj.action_type, cicj.ad_id, cicj.commission_id, cicj.country, cicj.event_date, cicj.locking_date, cicj.order_id, cicj.original, cicj.original_action_id, cicj.posting_date, cicj.website_id, cicj.advertiser_name, cicj.commission_amount, cicj.sale_amount, cicj.aggregator_affiliate_id)
IS NOT DISTINCT FROM
(dups.member_id, dups.site_id, dups.action_status, dups.action_type, dups.ad_id, dups.commission_id, dups.country, dups.event_date, dups.locking_date, dups.order_id, dups.original, dups.original_action_id, dups.posting_date, dups.website_id, dups.advertiser_name, dups.commission_amount, dups.sale_amount, dups.aggregator_affiliate_id)
WHERE cicj.created_date != dups.first_date
AND cicj.is_processed = FALSE
LOOP
--RAISE NOTICE 'Looping';
-- We need to collect the parent and children of the original record.
-- Get the parent of the original
SELECT *
FROM commission_import_commission_junction cicj
WHERE (cicj.member_id, cicj.site_id, cicj.action_status, cicj.action_type, cicj.ad_id, cicj.commission_id, cicj.country, cicj.event_date, cicj.locking_date, cicj.order_id, cicj.original, cicj.original_action_id, cicj.posting_date, cicj.website_id, cicj.advertiser_name, cicj.commission_amount, cicj.sale_amount, cicj.aggregator_affiliate_id)
IS NOT DISTINCT FROM
(duplicate.member_id, duplicate.site_id, duplicate.action_status, duplicate.action_type, duplicate.ad_id, duplicate.commission_id, duplicate.country, duplicate.event_date, duplicate.locking_date, duplicate.order_id, duplicate.original, duplicate.original_action_id, duplicate.posting_date, duplicate.website_id, duplicate.advertiser_name, duplicate.commission_amount, duplicate.sale_amount, duplicate.aggregator_affiliate_id)
AND cicj.created_date = duplicate.first_date
INTO parent;
-- Get the children of the original
children := ARRAY(
SELECT cicjli
FROM commission_import_commission_junction_line_items cicjli
WHERE cicjli.commission_import_commission_junction_id
= parent.commission_import_commission_junction_id);
--RAISE NOTICE 'parent: %', parent;
--RAISE NOTICE 'children: %', children;
-- Now get the duplicates children
duplicate_children := ARRAY(
SELECT cicjli
FROM commission_import_commission_junction_line_items cicjli
WHERE cicjli.commission_import_commission_junction_id
= duplicate.commission_import_commission_junction_id);
--RAISE NOTICE 'duplicate_children: %', duplicate_children;
-- Next, compare the children of the duplicate to the children of the original parent.
-- First compare size
IF array_upper(children, 1) = array_upper(duplicate_children, 1) THEN
--RAISE NOTICE 'Same number of children in duplicate as in parent';
-- Now compare each set
SELECT * FROM (SELECT 0) AS value INTO duplicate_child_count;
FOR child_index IN array_lower(children, 1) .. array_upper(children, 1) LOOP
child := children[child_index];
FOR duplicate_child_index IN array_lower(duplicate_children, 1) .. array_upper(duplicate_children, 1) LOOP
duplicate_child := duplicate_children[duplicate_child_index];
IF (child.sku, child.quantity, child.posting_date, child.sale_amount, child.discount) IS NOT DISTINCT FROM (duplicate_child.sku, duplicate_child.quantity, duplicate_child.posting_date, duplicate_child.sale_amount, duplicate_child.discount) THEN
SELECT * FROM (SELECT duplicate_child_count + 1) AS value INTO duplicate_child_count;
EXIT;
END IF;
END LOOP;
END LOOP;
--RAISE NOTICE 'Duplicate Child Count: %', duplicate_child_count;
-- If we have the same number of duplicates as there are records
IF duplicate_child_count = array_upper(duplicate_children, 1) THEN
-- Update the duplicate record as processed.
--RAISE NOTICE 'Marking duplicate % as is_processed', duplicate;
UPDATE commission_import_commission_junction cicj SET is_processed = TRUE WHERE cicj.commission_import_commission_junction_id
= duplicate.commission_import_commission_junction_id;
SELECT * FROM (SELECT num_updates + 1) AS value INTO num_updates;
END IF;
END IF;
END LOOP;
--RAISE NOTICE 'Updates: %', num_updates;
RETURN num_updates; END; $BODY$ LANGUAGE plpgsql;