如何总结所有可能的变量组合?

时间:2017-10-19 19:38:26

标签: sql sql-server r sas teradata

我试图根据所有可能的变量组合来总结计数。这是一个示例数据: enter image description here

3 个答案:

答案 0 :(得分:3)

借助一些内置的聚合工具,这类查询非常简单。

首先根据您的样本图像设置一些样本数据:

-- Sample data mirroring the question's image: one row per id with three
-- numeric attributes (0 means "attribute not present").
DECLARE @Table1 TABLE (
    id INT,
    a  INT,
    b  INT,
    c  INT
);

INSERT INTO @Table1 (id, a, b, c)
VALUES (10001, 1, 3, 3),
       (10002, 0, 0, 0),
       (10003, 3, 6, 0),
       (10004, 7, 0, 0),
       (10005, 0, 0, 0);

由于您需要非零属性A、B和C的每种可能组合的ID计数,第一步是消除零值,并把非零值转换为一个可以用于汇总的统一取值,这里我使用属性名称本身。之后的聚合就很简单:在GROUP BY语句中使用CUBE子句生成各种组合。最后在HAVING子句中去掉不需要的汇总行:主要是排除属性本身为空值(而不是由分组产生的空值)的行,并可选择去掉总计行(所有行的计数)。

-- Replace every non-zero attribute with its own name and every zero with
-- NULL, so that CUBE can enumerate the attribute combinations.
WITH flagged AS (
    SELECT
        id,
        CASE WHEN a = 0 THEN NULL ELSE 'a' END AS a,
        CASE WHEN b = 0 THEN NULL ELSE 'b' END AS b,
        CASE WHEN c = 0 THEN NULL ELSE 'c' END AS c
    FROM @Table1
)
SELECT
    a,
    b,
    c,
    COUNT(id) AS cnt
FROM flagged
GROUP BY CUBE (a, b, c)
HAVING GROUPING_ID(a, b, c) <> 7             -- drop the grand total (all three columns rolled up)
   AND (GROUPING(a) = 1 OR a IS NOT NULL)    -- a NULL is acceptable only when it was
   AND (GROUPING(b) = 1 OR b IS NOT NULL)    -- produced by the CUBE roll-up, not when it
   AND (GROUPING(c) = 1 OR c IS NOT NULL)    -- stands for a zero in the data
ORDER BY GROUPING_ID(a, b, c);

结果如下:

    a       b       c       cnt
1   a       b       c       1
2   a       b       NULL    2
3   a       NULL    c       1
4   a       NULL    NULL    3
5   NULL    b       c       1
6   NULL    b       NULL    2
7   NULL    NULL    c       1

最后我的原始rextester链接:http://rextester.com/YRJ10544

@lad2025 这是一个动态版本(抱歉,我的SQL Server技能不如我的Oracle技能强,但它确实有效)。只需为 @table 和 @col 设置正确的值;只要其余所有列都是数值型属性,它就应该可以工作:

-- Dynamic version of the CUBE query: builds the statement text from
-- sys.columns and executes it.
--   @table : name of the table to summarise
--   @col   : key column that is counted; every other column is treated as
--            a numeric attribute (0 = not present)
declare @sql varchar(max), @table varchar(30), @col varchar(30);
set @table = 'Table1';
set @col = 'id';
-- Recursive CTE x walks the columns in column_id order and accumulates, per
-- step: the attribute name list (names), the CASE projections (proj) and the
-- HAVING predicates (pred). cnt is the number of attributes seen so far.
-- NOTE(review): the walk starts at the first column other than @col and then
-- follows column_id + 1, so it assumes @col is the first column and that
-- column ids are contiguous -- confirm for your table.
with x(object_id, column_id, name, names, proj, pred, max_col, cnt) 
  as (
    -- Anchor: first attribute column.
    select object_id, column_id, name, cast(name as varchar(max))
     , cast('case '+name+' when 0 then null else '''+name+''' end '+name as varchar(4000))
     , cast('('+name+' is not null or grouping('+name+') = 1)' as varchar(4000))
       -- Last attribute column. Excludes @col (was a hard-coded 'ID', which
       -- only worked when the key column happened to be named id).
     , (select max(column_id) from sys.columns m where m.object_id = c.object_id and m.name <> @col)
     , 1
     from sys.columns c
    where object_id = OBJECT_ID(@table)
      and column_id = (select min(column_id) from sys.columns m where m.object_id = c.object_id and m.name <> @col)
    union all
    -- Recursive step: append the next column to each accumulated fragment.
    select x.object_id, c.column_id, c.name, cast(x.names+', '+c.name as varchar(max))
     , cast(proj+char(13)+char(10)+'     , case '+c.name+' when 0 then null else '''+c.name+''' end '+c.name as varchar(4000))
     , cast(pred+char(13)+char(10)+'   and ('+c.name+' is not null or grouping('+c.name+') = 1)' as varchar(4000))
     , max_col
     , cnt+1
      from x join sys.columns c on c.object_id = x.object_id and c.column_id = x.column_id+1
)
-- Only the row for the last column carries the complete fragments.
-- power(2,cnt)-1 is the GROUPING_ID of the grand-total row, which is excluded.
select @sql='with t1 as (
select '+proj+'
     , '+@col+'
  from '+@table+'
)
select '+names+'
     , count('+@col+') cnt 
  from t1
 group by cube('+names+')
having '+pred+'
   and grouping_id('+names+') <> '+cast(power(2,cnt)-1 as varchar(10))+'
 order by grouping_id('+names+');'
  from x where column_id = max_col;

-- Show the generated statement, then run it.
select @sql sql;
exec (@sql);

**Rextester**

答案 1 :(得分:1)

破山:

正如Robert所说,SUMMARY可用于计算组合。第二个SUMMARY可以计算所计算的类型。一个难点是忽略涉及零值的组合。如果它们可以转换为缺失,则处理更加清晰。假设零转换为缺失,此代码将计算不同的组合:

/* Pass 1: count the rows for every combination of the CLASS variables.
   The output data set carries the automatic _TYPE_ variable. */
PROC SUMMARY DATA=have NOPRINT;
  CLASS v2-v4 s1;
  OUTPUT OUT=counts_eachCombo;
RUN;

/* Pass 2: summarise the pass 1 output by combination type, with _TYPE_
   renamed to combo_type so it can be used as a CLASS variable. */
PROC SUMMARY DATA=counts_eachCombo(RENAME=(_type_=combo_type)) NOPRINT;
  CLASS combo_type;
  OUTPUT OUT=counts_eachClassType;
RUN;

您可以看到组合中所使用的CLASS变量如何决定 _TYPE_ 的取值,并且CLASS变量可以是混合类型(数值型、字符型)。

另一种不使用SUMMARY的"自制"方法:在数据步中使用LEXCOMB计算出每个组合,再在PROC SQL中使用 INTO … SEPARATED BY 生成SQL语句,对每个组合分别进行明确的计数。

注意:以下代码包含用于将SAS variable list解析为单个变量名称的宏varListEval。

/* makeHave: build the test data set HAVE.
     n      = number of rows (id runs from 1 to n)
     m      = number of numeric variables v1-vm
     maxval = upper bound for non-zero values; the default is the text m*4,
              which is substituted into the expression below
     prob0  = probability that a given v value is set to 0 */
%macro makeHave(n=,m=,maxval=&m*4,prob0=0.25);

  data have;
    do id = 1 to &n;
      array v v1-v&m;
      do over v;
        /* zero with probability prob0, otherwise ceil(maxval * uniform) */
        if ranuni(123) < &prob0 then v = 0; else v = ceil(&maxval*ranuni(123));
      end;
      /* character variable s1: a single character whose code is 65 ("A")
         plus an offset below 5 -- presumably one of A-E */
      s1 = byte(65+5*ranuni(123));
      output;
    end;
  run;

%mend;

/* 100 rows, v1-v5, non-zero values up to 15 */
%makeHave (n=100,m=5,maxval=15)

/* varListEval: function-style macro that expands a SAS variable list into
   individual double-quoted variable names.
     data = data set to inspect
     var  = variable list (for example v2-v4 s1)
   The data set is opened twice: once in full and once with keep= applied.
   Each variable of the full data set is emitted (quoted) when it is also
   found in the keep= view. */
%macro varListEval (data=, var=);
  %* resolve a SAS variable list to individual variable names;
  %local dsid dsid2 i name num;
  %let dsid = %sysfunc(open(&data));
  %if &dsid %then %do;
    %let dsid2 = %sysfunc(open(&data(keep=&var)));
    %if &dsid2 %then %do;
      /* walk every variable of the full data set */
      %do i = 1 %to %sysfunc(attrn(&dsid,nvar));
        %let name = %sysfunc(varname(&dsid,&i));
        /* a non-zero varnum means the variable is part of the keep= list */
        %let num = %sysfunc(varnum(&dsid2,&name));
        %if &num %then "&NAME";
      %end;
      %let dsid2 = %sysfunc(close(&dsid2));
    %end;
    %let dsid = %sysfunc(close(&dsid));
  %end;
  %else
    /* the first open failed: write the system message to the log */
    %put %sysfunc(sysmsg());
%mend;

/* combosUCounts: for every non-empty combination of the listed variables,
   count the distinct value tuples among rows where all variables of the
   combination are present (non-missing and non-zero, or non-blank for
   character variables).
     data = input data set
     var  = SAS variable list (for example v2-v4 s1)
   Result: table comboCounts with columns combo (the variable list of the
   combination) and uCount. */
%macro combosUCounts(data=, var=);
  %local vars n;
  %let vars = %varListEval(data=&data, var=&var);

  /* Number of variables = number of quote-space-quote separators + 1.
     Fix: the closing parenthesis of the eval call was missing. */
  %let n = %eval(1 + %sysfunc(count(&vars,%str(" "))));

  * compute combination selectors and criteria;
  data combos;
    array _names (&n) $32 (&vars);
    array _combos (&n) $32;
    array _comboCriterias (&n) $200;

    length _selector $32000;
    length _criteria $32000;

    if 0 then set &data; %* prep PDV for vname;

    /* _k = size of the combination, _j = index of the combination of that size */
    do _k = 1 to &n;
      do _j = 1 to comb(&n,_k);
        /* lexcomb rearranges _names so that its first _k entries form the
           current combination */
        _rc = lexcomb(_j,_k, of _names[*]);
        do _p = 1 to _k;
          _combos(_p) = _names(_p);
          /* the presence test depends on the variable type */
          if vtypex(_names(_p)) = 'C' 
            then _comboCriterias(_p) = trim(_names(_p)) || " is not null and " || trim(_names(_p)) || " ne ''";
            else _comboCriterias(_p) = trim(_names(_p)) || " is not null and " || trim(_names(_p)) || " ne 0";
        end;
        /* catx skips blank entries, so slots that were never filled are ignored */
        _selector = catx(",", of _combos:);
        _criteria = catx(" and ", of _comboCriterias:);
        output;
      end;
    end;

    stop;
  run;

  %local union;

  proc sql noprint;
    * generate SQL statement that uses combination selectors and criteria;
    /* one SELECT per combination, all joined by UNION into macro variable union */
    select "select "
    || quote(trim(_selector))
    || " as combo" 
    || ", "
    || "count(*) as uCount from (select distinct "
    || trim(_selector)
    || " from &data where "
    || trim(_criteria)
    || ")"
    into :union separated by " UNION "
    from combos
    ;

    * perform the generated SQL statement;
    create table comboCounts as
    &union;

    /* %put union=%superq(union); */
  quit;
%mend;

/* Driver: turn on MPRINT, then run the macro twice. The second call adds
   the character variable s1 to the variable list. Each call recreates
   table comboCounts. */
options mprint nosymbolgen;
%combosUCounts(data=have, var=v2-v4);
%combosUCounts(data=have, var=v2-v4 s1);

%put NOTE: Done;
/* Disabled debugging aid: print the names that varListEval resolves. */
/*
data _null_;
put %varListEval(data=have, var=v2-v4) ;
run;
*/

答案 2 :(得分:0)

朴素做法的SQL Server版本(我假设始终只有3列,因此结果共有 2^3-1 = 7 行):

-- Naive fixed-column version: one SELECT per combination of a, b, c (7 in
-- total). Each branch counts the distinct non-zero value tuples for its
-- combination. Zeros are mapped to NULL, and because NULL + string is NULL
-- a row is ignored as soon as any attribute of the combination is zero.
-- Fix: the 'B' and 'C' branches now count their own column; they previously
-- counted DISTINCT a, a copy-paste error from the 'A' branch.
SELECT 'A' AS combination, COUNT(DISTINCT CASE WHEN a > 0 THEN a ELSE NULL END) AS cnt FROM t
UNION ALL 
SELECT 'B', COUNT(DISTINCT CASE WHEN b > 0 THEN b ELSE NULL END) FROM t
UNION ALL 
SELECT 'C', COUNT(DISTINCT CASE WHEN c > 0 THEN c ELSE NULL END) FROM t
UNION ALL
SELECT 'A,B', COUNT(DISTINCT CASE WHEN a > 0 THEN CAST(a AS VARCHAR(10)) ELSE NULL END 
                     + ',' + CASE WHEN b > 0 THEN CAST(b AS VARCHAR(10)) ELSE NULL END) FROM t
UNION ALL
SELECT 'A,C', COUNT(DISTINCT CASE WHEN a > 0 THEN CAST(a AS VARCHAR(10)) ELSE NULL END 
                     + ',' + CASE WHEN c > 0 THEN CAST(c AS VARCHAR(10)) ELSE NULL END) FROM t
UNION ALL
SELECT 'B,C', COUNT(DISTINCT CASE WHEN b > 0 THEN CAST(b AS VARCHAR(10)) ELSE NULL END 
                     + ',' + CASE WHEN c > 0 THEN CAST(c AS VARCHAR(10)) ELSE NULL END) FROM t
UNION ALL
SELECT 'A,B,C', COUNT(DISTINCT CASE WHEN a > 0 THEN CAST(a AS VARCHAR(10)) ELSE NULL END 
                     + ',' + CASE WHEN b > 0 THEN CAST(b AS VARCHAR(10)) ELSE NULL END
                     + ',' + CASE WHEN c > 0 THEN CAST(c AS VARCHAR(10)) ELSE NULL END ) FROM t
ORDER BY combination 

**Rextester Demo**

修改

与上述相同但更简洁:

-- Same counts as the long UNION ALL query: zeros are turned into NULL once,
-- up front, and the values are cast to text so they can be concatenated.
-- A NULL operand makes the whole concatenation NULL, so COUNT skips any row
-- in which an attribute of the combination is zero.
WITH nonzero AS (
    SELECT
        ID,
        CASE WHEN a = 0 THEN NULL ELSE CAST(a AS VARCHAR(10)) END AS a,
        CASE WHEN b = 0 THEN NULL ELSE CAST(b AS VARCHAR(10)) END AS b,
        CASE WHEN c = 0 THEN NULL ELSE CAST(c AS VARCHAR(10)) END AS c
    FROM t
)
SELECT 'A' AS combination, COUNT(DISTINCT a) AS cnt
FROM nonzero
UNION ALL
SELECT 'B', COUNT(DISTINCT b)
FROM nonzero
UNION ALL
SELECT 'C', COUNT(DISTINCT c)
FROM nonzero
UNION ALL
SELECT 'A,B', COUNT(DISTINCT a + ',' + b)
FROM nonzero
UNION ALL
SELECT 'A,C', COUNT(DISTINCT a + ',' + c)
FROM nonzero
UNION ALL
SELECT 'B,C', COUNT(DISTINCT b + ',' + c)
FROM nonzero
UNION ALL
SELECT 'A,B,C', COUNT(DISTINCT a + ',' + b + ',' + c)
FROM nonzero;

**Rextester Demo**

编辑2

使用UNPIVOT

-- Flag each attribute as '1' when non-zero and NULL otherwise, count every
-- combination in a single row, then UNPIVOT that row into one
-- (combination, count) row per combination.
WITH flags AS (
    SELECT
        ID,
        CAST(CASE WHEN a != 0 THEN 1 ELSE NULL END AS VARCHAR(10)) AS a,
        CAST(CASE WHEN b != 0 THEN 1 ELSE NULL END AS VARCHAR(10)) AS b,
        CAST(CASE WHEN c != 0 THEN 1 ELSE NULL END AS VARCHAR(10)) AS c
    FROM t
)
SELECT
    combination,
    [count]
FROM (
    -- The concatenation is NULL whenever any flag is NULL, so COUNT only
    -- sees rows in which every attribute of the combination is non-zero.
    SELECT
        COUNT(a)         AS a,
        COUNT(b)         AS b,
        COUNT(c)         AS c,
        COUNT(a + b)     AS ab,
        COUNT(a + c)     AS ac,
        COUNT(b + c)     AS bc,
        COUNT(a + b + c) AS abc
    FROM flags
) AS s
UNPIVOT ([count] FOR combination IN (a, b, c, ab, ac, bc, abc)) AS unpvt

**Rextester Demo**

编辑最终方法

  

我感谢你的方法。 我的实际数据集中有3个以上的变量,您认为我们可以以编程方式生成所有可能的组合而不是硬编码吗?可能是您的第二种方法将覆盖:

执行此类操作时SQL有点笨拙,但我想表明它是可行的。

-- Sample table: one row per id with three numeric attributes
-- (0 means "attribute not present").
CREATE TABLE t(id INT, a INT, b INT, c INT);

-- Explicit column list plus a VALUES row constructor instead of chained
-- UNIONs: the target columns are visible and no de-duplication is implied.
INSERT INTO t (id, a, b, c)
VALUES (10001, 1, 3, 3),
       (10002, 0, 0, 0),
       (10003, 3, 6, 0),
       (10004, 7, 0, 0),
       (10005, 0, 0, 0);

-- Programmatic version of the UNPIVOT query: enumerates every non-empty
-- combination of the attribute columns of table t (all columns except id),
-- builds the statement text and executes it.

-- One row per attribute column; bit_value gives each column its own bit so
-- that an integer 1..@max can be read as a set of columns.
DECLARE @Sample AS TABLE 
(
    item_id     tinyint IDENTITY(1,1) PRIMARY KEY NONCLUSTERED,
    item        nvarchar(500) NOT NULL,
    bit_value   AS  CONVERT ( integer, POWER(2, item_id - 1) )
                PERSISTED UNIQUE CLUSTERED
);    

-- Explicit target column, and ORDER BY so the identity values (and with them
-- the bit positions) follow the table's column order.
INSERT INTO @Sample (item)
SELECT name
FROM sys.columns
WHERE object_id = OBJECT_ID('t')
  AND name != 'id'
ORDER BY column_id;

-- Highest bit mask = number of non-empty combinations (2^n - 1).
DECLARE @max integer = POWER(2, ( SELECT COUNT(*) FROM @Sample AS s)) - 1;
DECLARE @cols NVARCHAR(MAX);
DECLARE @cols_casted NVARCHAR(MAX);
DECLARE @cols_count NVARCHAR(MAX);


-- Tally: numbers 1..65536, built by repeatedly cross joining a 2-row set.
;WITH
  Pass0 as (select 1 as C union all select 1), --2 rows
  Pass1 as (select 1 as C from Pass0 as A cross join Pass0 as B),--4 rows
  Pass2 as (select 1 as C from Pass1 as A cross join Pass1 as B),--16 rows
  Pass3 as (select 1 as C from Pass2 as A cross join Pass2 as B),--256 rows
  Pass4 as (select 1 as C from Pass3 as A cross join Pass3 as B),--65536 rows
  Tally as (select row_number() over(order by C) as n from Pass4)
-- cte: for each mask, the comma-separated names of the columns whose bit is set.
, cte AS (SELECT
    combination =
        STUFF
        (
            (
                SELECT ',' + s.item 
                FROM @Sample AS s
                WHERE
                    N.n & s.bit_value = s.bit_value
                ORDER BY
                    s.bit_value
                FOR XML 
                    PATH (''),
                    TYPE                    
            ).value('(./text())[1]', 'varchar(8000)'), 1, 1, ''
        )
FROM Tally AS N
WHERE N.n BETWEEN 1 AND @max
)
-- @cols       : [a],[b],[a,b],...             (UNPIVOT column list)
-- @cols_count : [a,b]=COUNT(DISTINCT a + ',' +b),...
-- The inputs are cast to NVARCHAR(MAX) so STRING_AGG returns a MAX type and
-- does not fail once the concatenated text exceeds 8000 bytes.
SELECT @cols = STRING_AGG(CAST(QUOTENAME(combination) AS NVARCHAR(MAX)),',')
      ,@cols_count = STRING_AGG(CAST(FORMATMESSAGE('[%s]=COUNT(DISTINCT %s)'
                    ,combination,REPLACE(combination, ',', ' + '','' +') ) AS NVARCHAR(MAX)),',')
FROM cte;

-- @cols_casted : CAST(NULLIF(a,0) AS VARCHAR(10)) a,... (zero -> NULL, as text)
SELECT 
  @cols_casted = STRING_AGG(CAST(FORMATMESSAGE('CAST(NULLIF(%s,0) AS VARCHAR(10)) %s'
                 ,name, name) AS NVARCHAR(MAX)), ',')
FROM sys.columns
WHERE object_id = OBJECT_ID('t')
  AND name != 'id';

DECLARE @sql NVARCHAR(MAX);

-- Statement template; the <...> placeholders are filled in below.
SET @sql =
'SELECT combination, [count]
FROM (SELECT  <cols_count>
      FROM (SELECT ID, <cols_casted> FROM t )cte) s
UNPIVOT ([count] FOR combination IN (<cols>))AS unpvt';

SET @sql = REPLACE(@sql, '<cols_casted>', @cols_casted);
SET @sql = REPLACE(@sql, '<cols_count>', @cols_count);
SET @sql = REPLACE(@sql, '<cols>', @cols);

-- Show the generated statement, then run it.
SELECT @sql;
EXEC (@sql);

**DBFiddle Demo**

**DBFiddle Demo with 4 variables**