从一对多关系更新重复项。

时间:2011-02-05 00:43:16

标签: postgresql duplicates one-to-many

这不是您的标准“我如何找到重复”的问题,我知道如何查找重复项,请参阅下文。这个问题是如何更新所有具有匹配记录的子项目的记录?

好吧,我将给你整个场景,以便你可以解决这个问题。

由于关键系统故障,可能会插入重复记录。

查找稍后的重复项并标记父委员会Commission_import_commission_junction“is_processed = True”解决了这个问题。

复杂的是,在要比较的列上,commission_import_commission_junction及其子级commission_import_commission_junction_line_items必须相同。

表格是:

commission_import_commission_junction
 - id
 - created_date
 - some columns that are checked for duplication
 - some columns that are not checked for duplication
commission_import_commission_junction_line_items
 - id
 - some columns that are checked for duplication
 - some columns that are not checked for duplication

(对于完整的表规范,请查看最底部代码块中的CREATE TABLE语句。)

仅在父表上标记重复项的查询 commission_import_commission_junction:

UPDATE commission_import_commission_junction cicj
SET is_processed = TRUE
FROM (
    SELECT MIN(created_date) AS first_date, member_id, site_id, action_status, action_type, ad_id, commission_id, country, event_date, locking_date, order_id, original, original_action_id, posting_date, website_id, advertiser_name, commission_amount, sale_amount, aggregator_affiliate_id
    FROM commission_import_commission_junction inner_imports
    JOIN commission_import_commission_junction_line_items inner_items ON inner_items.commission_import_commission_junction_id = inner_imports.commission_import_commission_junction_id
    GROUP BY member_id, site_id, action_status, action_type, ad_id, commission_id, country, event_date, locking_date, order_id, original, original_action_id, posting_date, website_id, advertiser_name, commission_amount, sale_amount, aggregator_affiliate_id
    HAVING (COUNT(*) > 1)
) AS dups
WHERE
-- MAIN TABLE COLUMNN LIST
(cicj.member_id, cicj.site_id, cicj.action_status, cicj.action_type, cicj.ad_id, cicj.commission_id, cicj.country, cicj.event_date, cicj.locking_date, cicj.order_id, cicj.original, cicj.original_action_id, cicj.posting_date, cicj.website_id, cicj.advertiser_name, cicj.commission_amount, cicj.sale_amount, cicj.aggregator_affiliate_id)
IS NOT DISTINCT FROM
-- OTHER TABLE COLUMN LIST
(dups.member_id, dups.site_id, dups.action_status, dups.action_type, dups.ad_id, dups.commission_id, dups.country, dups.event_date, dups.locking_date, dups.order_id, dups.original, dups.original_action_id, dups.posting_date, dups.website_id, dups.advertiser_name, dups.commission_amount, dups.sale_amount, dups.aggregator_affiliate_id)
AND cicj.created_date <> dups.first_date
AND cicj.is_processed = FALSE;

在某处,某种程度上我需要检查line_items是否也是重复的。

下面的代码是设置数据库,请记住这是postgres特定的。

-- "commission_import_build" is a record that keeps information about the process of collecting the commission information. Duplicate commission_import_commission_junction records will not exist with the same commission_import_build_id
-- "commission_import_commission_junction" is a record description commission information from a customers purchase.
-- "commission_import_commission_junction_line_items" are records describing items in that purchase.


DROP TABLE IF EXISTS commission_import_commission_junction_line_items;
DROP TABLE IF EXISTS commission_import_commission_junction;
DROP TABLE IF EXISTS commission_import_builds;

CREATE TABLE commission_import_builds
(
  commission_import_build_id serial NOT NULL,
  build_date timestamp with time zone NOT NULL,
  CONSTRAINT pkey_commission_import_build_id PRIMARY KEY (commission_import_build_id),
  CONSTRAINT commission_import_builds_build_date_key UNIQUE (build_date)
);
INSERT INTO commission_import_builds (commission_import_build_id, build_date) VALUES (1, '2011-01-01');
INSERT INTO commission_import_builds (commission_import_build_id, build_date) VALUES (2, '2011-01-02');
INSERT INTO commission_import_builds (commission_import_build_id, build_date) VALUES (3, '2011-01-03');

CREATE TABLE commission_import_commission_junction
(
  commission_import_commission_junction_id serial NOT NULL,
  member_id integer,
  site_id integer,
  action_status character varying NOT NULL,
  action_type character varying NOT NULL,
  ad_id bigint,
  commission_id bigint NOT NULL,
  country character varying,
  event_date timestamp with time zone NOT NULL,
  locking_date timestamp with time zone,
  order_id character varying NOT NULL,
  original boolean,
  original_action_id bigint NOT NULL,
  posting_date timestamp with time zone NOT NULL,
  website_id bigint NOT NULL,
  advertiser_name character varying,
  commission_amount numeric(19,2) NOT NULL,
  sale_amount numeric(19,2) NOT NULL,
  aggregator_affiliate_id integer NOT NULL,
  is_processed boolean NOT NULL DEFAULT false,
  created_date timestamp with time zone NOT NULL DEFAULT now(),
  member_transaction_id integer,
  commission_import_build_id integer NOT NULL,
  CONSTRAINT pkey_commission_import_commission_junction_commission_import_co PRIMARY KEY (commission_import_commission_junction_id),
  CONSTRAINT fk_commission_import_commission_junction_commission_import_buil FOREIGN KEY (commission_import_build_id)
      REFERENCES commission_import_builds (commission_import_build_id) MATCH SIMPLE
      ON UPDATE NO ACTION ON DELETE NO ACTION
);
CREATE INDEX idx_commission_import_commission_junction_is_processed
  ON commission_import_commission_junction
  USING btree
  (is_processed);
INSERT INTO commission_import_commission_junction (commission_import_commission_junction_id, action_status, action_type, commission_id, event_date, order_id, original_action_id, posting_date, website_id, commission_amount, sale_amount, aggregator_affiliate_id, commission_import_build_id, created_date) VALUES
  (1, 'new', 'sale', 1234, '2011-02-04 14:39:52.989499-07', 'test-order', 1234567, '2011-02-04 14:39:52.989499-07', 123, 12.35, 123.45, 9876, 1, '2011-02-05');
INSERT INTO commission_import_commission_junction (commission_import_commission_junction_id, action_status, action_type, commission_id, event_date, order_id, original_action_id, posting_date, website_id, commission_amount, sale_amount, aggregator_affiliate_id, commission_import_build_id, created_date) VALUES
  (2, 'new', 'sale', 1234, '2011-02-04 14:39:52.989499-07', 'test-order', 1234567, '2011-02-04 14:39:52.989499-07', 123, 12.35, 123.45, 9876, 2, '2011-02-06');
INSERT INTO commission_import_commission_junction (commission_import_commission_junction_id, action_status, action_type, commission_id, event_date, order_id, original_action_id, posting_date, website_id, commission_amount, sale_amount, aggregator_affiliate_id, commission_import_build_id, created_date) VALUES
  (3, 'new', 'sale', 1234, '2011-02-04 14:39:52.989499-07', 'test-order', 1234567, '2011-02-04 14:39:52.989499-07', 123, 12.35, 123.45, 9876, 3, '2011-02-07');
SELECT * FROM commission_import_commission_junction;

CREATE TABLE commission_import_commission_junction_line_items
(
  commission_import_commission_junction_line_item_id serial NOT NULL,
  commission_import_commission_junction_id integer NOT NULL,
  sku character varying,
  quantity integer,
  posting_date timestamp with time zone,
  sale_amount numeric(19,2),
  discount numeric(19,2),
  CONSTRAINT pkey_commission_import_commission_junction_link_items_commissio PRIMARY KEY (commission_import_commission_junction_line_item_id),
  CONSTRAINT fkey_commission_import_commission_junction_line_items_commissio FOREIGN KEY (commission_import_commission_junction_id)
      REFERENCES commission_import_commission_junction (commission_import_commission_junction_id) MATCH SIMPLE
      ON UPDATE NO ACTION ON DELETE NO ACTION
);
INSERT INTO commission_import_commission_junction_line_items (commission_import_commission_junction_id, sku, quantity, sale_amount) VALUES (1, 'test1', 3, 23.45);
INSERT INTO commission_import_commission_junction_line_items (commission_import_commission_junction_id, sku, quantity, sale_amount) VALUES (1, 'test2', 3, 67.50);
INSERT INTO commission_import_commission_junction_line_items (commission_import_commission_junction_id, sku, quantity, sale_amount) VALUES (1, 'test3', 3, 32.50);
INSERT INTO commission_import_commission_junction_line_items (commission_import_commission_junction_id, sku, quantity, sale_amount) VALUES (2, 'test1', 3, 23.45);
INSERT INTO commission_import_commission_junction_line_items (commission_import_commission_junction_id, sku, quantity, sale_amount) VALUES (2, 'test2', 3, 67.50);
INSERT INTO commission_import_commission_junction_line_items (commission_import_commission_junction_id, sku, quantity, sale_amount) VALUES (2, 'test3', 3, 32.50);
INSERT INTO commission_import_commission_junction_line_items (commission_import_commission_junction_id, sku, quantity, sale_amount) VALUES (3, 'test1', 3, 23.45);
INSERT INTO commission_import_commission_junction_line_items (commission_import_commission_junction_id, sku, quantity, sale_amount) VALUES (3, 'test2', 3, 67.50);
INSERT INTO commission_import_commission_junction_line_items (commission_import_commission_junction_id, sku, quantity, sale_amount) VALUES (3, 'test3', 3, 32.50);

2 个答案:

答案 0 :(得分:0)

让我想起直接营销邮件列表中的重复删除

无论表格的详细信息如何,父子欺骗消除算法都遵循以下步骤:

1)将重复项复制到一个列表中,该列表将旧密钥与新密钥(临时表)匹配

2)更新子表中的外键

3)从父母

中删除欺骗

我很欣赏你帖子中的细节,但是我会用一些示例表/列名来简单易读:

-- step 1, get the list
-- Warning: t-sql syntax, adjust for Postgres
--          if it doesn't like placement of "into..." clause
select keep.primaryKey as keepKey
     , dupe.primaryKey as dupeKey
  into #DupeList
  from (
        select min(primaryKey) as primaryKey
             , dupeCriteria1
             , dupeCriteria2
          FROM theTable
         group by dupeCriteria1,dupeCritera2
        having count(*) > 1
       ) keep
  JOIN theTable dupe
    ON keep.dupeCriteria1 = dupe.dupeCriteria1
   AND keep.dupeCriteria2 = dupe.dupeCriteria2
   AND keep.primaryKey   <> dupe.primaryKey

完成后,更新子表中的外键:

update childTable
   set foreignKey = #temp1.keepKey
  from #temp1
 where foreignKey = #temp1.dupeKey

然后只删除父表中的所有内容:

delete from parentTable
 where primaryKey in (select dupeKey from #temp1)

答案 1 :(得分:0)

CREATE FUNCTION removeCommissionImportCommissionJunctionDuplicates() RETURNS INT AS $BODY$ DECLARE duplicate RECORD; DECLARE parent RECORD; DECLARE children commission_import_commission_junction_line_items[]; DECLARE duplicate_children commission_import_commission_junction_line_items[]; DECLARE duplicate_child_count INT; DECLARE child commission_import_commission_junction_line_items; DECLARE duplicate_child commission_import_commission_junction_line_items; DECLARE num_updates INT; BEGIN
    SELECT * FROM (SELECT 0) AS value INTO num_updates;
    FOR duplicate IN
        SELECT cicj.*, dups.first_date
        FROM commission_import_commission_junction cicj
        JOIN (SELECT MIN(created_date) AS first_date, member_id, site_id, action_status, action_type, ad_id, commission_id, country, event_date, locking_date, order_id, original, original_action_id, posting_date, website_id, advertiser_name, commission_amount, sale_amount, aggregator_affiliate_id
            FROM commission_import_commission_junction inner_imports
            GROUP BY member_id, site_id, action_status, action_type, ad_id, commission_id, country, event_date, locking_date, order_id, original, original_action_id, posting_date, website_id, advertiser_name, commission_amount, sale_amount, aggregator_affiliate_id
            HAVING (COUNT(*) > 1)) AS dups
        ON (cicj.member_id, cicj.site_id, cicj.action_status, cicj.action_type, cicj.ad_id, cicj.commission_id, cicj.country, cicj.event_date, cicj.locking_date, cicj.order_id, cicj.original, cicj.original_action_id, cicj.posting_date, cicj.website_id, cicj.advertiser_name, cicj.commission_amount, cicj.sale_amount, cicj.aggregator_affiliate_id)
        IS NOT DISTINCT FROM
        (dups.member_id, dups.site_id, dups.action_status, dups.action_type, dups.ad_id, dups.commission_id, dups.country, dups.event_date, dups.locking_date, dups.order_id, dups.original, dups.original_action_id, dups.posting_date, dups.website_id, dups.advertiser_name, dups.commission_amount, dups.sale_amount, dups.aggregator_affiliate_id)
        WHERE cicj.created_date != dups.first_date
        AND cicj.is_processed = FALSE
    LOOP
        --RAISE NOTICE 'Looping';
        -- We need to collect the parent and children of the original record.
        -- Get the parent of the original
        SELECT * 
        FROM commission_import_commission_junction cicj 
        WHERE (cicj.member_id, cicj.site_id, cicj.action_status, cicj.action_type, cicj.ad_id, cicj.commission_id, cicj.country, cicj.event_date, cicj.locking_date, cicj.order_id, cicj.original, cicj.original_action_id, cicj.posting_date, cicj.website_id, cicj.advertiser_name, cicj.commission_amount, cicj.sale_amount, cicj.aggregator_affiliate_id)
        IS NOT DISTINCT FROM
        (duplicate.member_id, duplicate.site_id, duplicate.action_status, duplicate.action_type, duplicate.ad_id, duplicate.commission_id, duplicate.country, duplicate.event_date, duplicate.locking_date, duplicate.order_id, duplicate.original, duplicate.original_action_id, duplicate.posting_date, duplicate.website_id, duplicate.advertiser_name, duplicate.commission_amount, duplicate.sale_amount, duplicate.aggregator_affiliate_id)
        AND cicj.created_date = duplicate.first_date
        INTO parent;
        -- Get the children of the original
        children := ARRAY(
            SELECT cicjli
            FROM commission_import_commission_junction_line_items cicjli
            WHERE cicjli.commission_import_commission_junction_id
= parent.commission_import_commission_junction_id);
        --RAISE NOTICE 'parent: %', parent;
        --RAISE NOTICE 'children: %', children;
        -- Now get the duplicates children
        duplicate_children := ARRAY(
            SELECT cicjli
            FROM commission_import_commission_junction_line_items cicjli
            WHERE cicjli.commission_import_commission_junction_id
= duplicate.commission_import_commission_junction_id);
        --RAISE NOTICE 'duplicate_children: %', duplicate_children;
        -- Next, compare the children of the duplicate to the children of the original parent.
        -- First compare size
        IF array_upper(children, 1) = array_upper(duplicate_children, 1) THEN
            --RAISE NOTICE 'Same number of children in duplicate as in parent';
            -- Now compare each set
            SELECT * FROM (SELECT 0) AS value INTO duplicate_child_count;
            FOR child_index IN array_lower(children, 1) .. array_upper(children, 1) LOOP
                child := children[child_index];
                FOR duplicate_child_index IN array_lower(duplicate_children, 1) .. array_upper(duplicate_children, 1) LOOP
                    duplicate_child := duplicate_children[duplicate_child_index];
                    IF (child.sku, child.quantity, child.posting_date, child.sale_amount, child.discount) IS NOT DISTINCT FROM (duplicate_child.sku, duplicate_child.quantity, duplicate_child.posting_date, duplicate_child.sale_amount, duplicate_child.discount) THEN
                        SELECT * FROM (SELECT duplicate_child_count + 1) AS value INTO duplicate_child_count;
                        EXIT;
                    END IF;
                END LOOP;
            END LOOP;
            --RAISE NOTICE 'Duplicate Child Count: %', duplicate_child_count;
            -- If we have the same number of duplicates as there are records
            IF duplicate_child_count = array_upper(duplicate_children, 1) THEN
                -- Update the duplicate record as processed.
                --RAISE NOTICE 'Marking duplicate % as is_processed', duplicate;
                UPDATE commission_import_commission_junction cicj SET is_processed = TRUE WHERE cicj.commission_import_commission_junction_id
= duplicate.commission_import_commission_junction_id;
                SELECT * FROM (SELECT num_updates + 1) AS value INTO num_updates;
            END IF;
        END IF;
    END LOOP;
    --RAISE NOTICE 'Updates: %', num_updates;
    RETURN num_updates; END; $BODY$ LANGUAGE plpgsql;