查找从一个SAS表到另一表的最近值

时间:2019-02-16 16:13:16

标签: arrays sas lookup

无法在SAS中解决此问题,这是我的数据:

表A:3列

var nodeoutlook = require('nodejs-nodemailer-outlook')
var nodemailer = require("nodemailer");
const promisify = require("es6-promisify");

// Send one message through the nodejs-nodemailer-outlook helper.
// The options object carries the login (auth), the envelope fields
// (from/to/subject), both an HTML and a plain-text body, and a list of
// attachments.
// NOTE(review): the credentials below are hard-coded sample values; real
// credentials should come from configuration, not from source code.
nodeoutlook.sendEmail({
    auth: {
        user: "johnexample@organization.com",
        pass: "johnpassword"
    },
    from: 'info@myorganization.com',
    to: 'jackexample@organization.com',
    subject: 'Hey you, awesome!',
    html: '<b>This is bold text</b>',
    // The comma after this property was missing, which made the whole
    // object literal a syntax error.
    text: 'This is text version!',
    attachments: [
        {   // file on disk as an attachment
            filename: 'text3.txt',
            path: '/path/to/file.txt' // stream this file
        }
    ]
});


/**
 * Creates a Nodemailer test account, builds an SMTP transport for
 * smtp.ethereal.email with that account's credentials, and sends one
 * sample message with both a plain-text and an HTML body.
 *
 * @returns {Promise<*>} whatever `transporter.sendMail` resolves with
 * @throws rejects if account creation or sending fails; the call site
 *         below logs the rejection with `console.error`
 */
async function main() {
  const account = await nodemailer.createTestAccount();
  const transporter = nodemailer.createTransport({
    host: "smtp.ethereal.email",
    port: 587,
    secure: false,
    auth: {
      user: account.user,
      pass: account.pass
    }
  });

  const mailOptions = {
    // NOTE(review): the original string was unterminated and had no trailing
    // comma (the "<address>" part appears to have been stripped). The address
    // below is a placeholder -- confirm the intended sender.
    from: '"Fred " <foo@example.com>', // sender address
    to: "bar@example.com, baz@example.com", // list of receivers
    subject: "Hello",
    text: "Hello world?", // plain text body
    html: "<b>Hello world?</b>" // html body
  };

  // Return the send result so callers can inspect it; previously it was
  // assigned to an unused local and discarded.
  return transporter.sendMail(mailOptions);
}

main().catch(console.error);

ID变量是唯一的,而COUNT变量的离散范围是1-12。

表B:13列

ID     COUNT       MEAN

A           2      0.034

B       4     -0.052

C       7      0.327

..  ..  ..

CLUSTER变量当前的范围是1-12,但这个范围并不固定,因此代码需要能够处理聚类数量不同的边缘情况。

问题:对于表A中的每一行,我想以COUNT变量为索引,在表B对应的MEAN(N)列的所有可能取值中,找到与该行MEAN最接近的值,然后取出最接近匹配项所在行的CLUSTER编号。

例如,ID 'A' 的COUNT为2,MEAN为0.034,因此我需要在表B的MEAN2列的所有取值中,找到与0.034最接近的值。我需要对12个可能的COUNT值和对应的12个MEAN列都执行此操作。

数组和索引已超出我的理解范围,因此如能提供任何帮助,我将不胜感激。布兰登

1 个答案:

答案 0 :(得分:0)

如果将聚类均值转置到此布局中,查找将变得更加容易

Cluster Count Mean
1       1     0.344
1       2     0.234
1       3     0.233
…
2       1     0.234
2       2     0.234
2       3     0.343
… 

然后您可以使用SQL选择与所观察到的均值最接近的聚类均值

    /* For every observed id, pair it with each tall cluster row that has  */
    /* the same count, then keep only the row(s) whose absolute difference */
    /* from the observed mean equals the smallest absolute difference in   */
    /* that id group. delta is the signed difference (observed - cluster). */
    /* More than one row per id can survive when several clusters tie.     */
    select 
      id,
      clusters_tall.cluster,
      clusters_tall.mean as cluster_mean,
      observed.mean - clusters_tall.mean as delta
    from 
      observed 
    join 
      clusters_tall 
    on 
      observed.count = clusters_tall.count
    group by 
      observed.id
    having 
      abs(observed.mean - clusters_tall.mean) = min (abs (observed.mean - clusters_tall.mean))

示例:

* Create sample cluster data in wide form - one row per cluster (1 to 50) with an array of 12 mean columns;
data clusters;
  do cluster = 1 to 50;
    array mean(12);
    /* start every cluster with all twelve means missing */
    call missing (of mean(*));
    /* fill only the leading means; how many is drawn at random, the upper */
    /* bound being ceil(12*ranuni(123)); the remaining ones stay missing   */
    do _n_ = 1 to ceil(12*ranuni(123));
      mean(_n_) = round(ranuni(123),0.0001);
    end;
    /* explicit output inside the loop writes one row per cluster */
    output;
  end;
run;

* Create sample observed data - 1000 ids, each with a count and a mean;
* About 60 percent of the ids get a mean close to an existing mean of a randomly picked cluster (flag is an asterisk);
* The remaining ids get a random count and a random mean (flag is R);
data observed (keep=id count mean pick flag);
  length id count mean 8.;

  do id = 1 to 1000;
    /* choose one of the 50 cluster rows at random */
    pick = ceil(50*ranuni(123));

    /* direct-access read of that row from clusters; brings in mean1-mean12 */
    point = pick;
    set clusters point=point;

    /* clear the value left over from the previous id so the until-loop below can test it */
    mean = .;

    /* keep drawing a count until the picked cluster has a non-missing mean at that position */
    if ranuni(123) < 0.6 then do until (not missing(mean));
      array means mean1-mean12;
      flag = '*';
      count = ceil(12 * ranuni(123));
      /* perturb the cluster mean by a small random amount, rounded to 0.0001 */
      if not missing(means(count)) then 
        mean = means(count) + round((ranuni(123)-0.5) / 1e3,0.0001); 
    end;
    else do;
      flag = 'R';
      count = ceil (12 * ranuni(123));
      /* random mean rounded to 0.001, then shifted by 0.0005 */
      mean = round(ranuni(123),0.001) + 0.0005;
    end;

    output;
  end;
  /* with point= the step never reaches end of file on its own, so end it explicitly */
  stop;
run;

* Reshape the wide cluster rows into one row per cluster and count, carrying the mean for that count;
* Scanning of a cluster stops at its first missing mean;
data clusters_tall(keep=cluster count mean);
  set clusters;
  array cluster_means {*} mean1-mean12;
  do count = 1 to dim(cluster_means);
    /* the first missing mean ends this cluster */
    if missing(cluster_means{count}) then leave;
    mean = cluster_means{count};
    output;
  end;
run;

/* Match each observed mean to the closest cluster mean for the same count. */
/* The inner join in the sub-select finds the closest cluster row(s) per    */
/* id, but several clusters can be equally close. The outer query therefore */
/* left joins observed to that sub-select and, within each id, keeps only   */
/* the row with the lowest cluster number to break ties.                    */

proc sql;
  create table observed_matched(label="Observed matched to cluster with closest mean") as

  select observed.*, matched.cluster, matched.cluster_mean, matched.delta
  from

  observed
  left join 
  ( 
    /* closest cluster row(s) per id: rows whose absolute difference */
    /* equals the minimum absolute difference within the id group    */
    select 
      id,
      clusters_tall.cluster,
      clusters_tall.mean as cluster_mean,
      observed.mean - clusters_tall.mean as delta
    from 
      observed 
    join 
      clusters_tall 
    on 
      observed.count = clusters_tall.count
    group by 
      observed.id
    having 
      abs(observed.mean - clusters_tall.mean) = min (abs (observed.mean - clusters_tall.mean))
  ) as matched

  on observed.id = matched.id
  /* tie-break: keep the lowest cluster number among equally close matches */
  group by observed.id
  having cluster = min(cluster)
  order by id, cluster
  ;