Question

我有大约 2,330,000 条观测值，想把它们分配到 10,000 个等宽的桶中。桶的宽度为 (max(var) − min(var)) / 10,000。例如，我的最大值是 3000，最小值是 -200，所以桶的宽度为 (3000 + 200) / 10,000 = 0.32。因此，-200 到 (-200 + 0.32) 之间的任何值都应该进入桶 1，(-200 + 0.32) 到 (-200 + 0.32 × 2) 之间的任何值都应该进入桶 2，依此类推。数据集大致如下：

Var_value           bucket
-200                    1
-53                     ?
-5                      ?
-46                     ?
5
8
4
56
7542
242
....

如何编写代码？我在想循环，但不知道怎么做？有人可以帮忙吗？

Answer 1

不确定你打算用循环怎么实现，但我会这样做：

/* Build a demo dataset: one million uniform random draws between -200 and 3000. */
data a(keep=val);
    do draw = 1 to 1000000;
        /* A seed of 0 makes RANUNI take its seed from the clock, so each run differs. */
        val = ranuni(0) * 3200 - 200;
        output;
    end;
run;
/* Assign each val in dataset a to one of 10000 equal-WIDTH buckets (1..10000)
   and write the result to dataset b with the bucket number in variable bucket.

   Note: PROC RANK with GROUPS=10000 is not used here because it builds
   equal-COUNT (quantile) groups numbered 0..9999, which is not what
   "evenly spaced buckets" of width (max - min) / 10000 means.
*/

/* Step 1: get the exact minimum and maximum of val (full numeric precision). */
proc means data=a noprint;
    var val;
    output out=_range(keep=lo hi) min=lo max=hi;
run;

/* Step 2: compute the bucket for every observation. */
data b(drop=lo hi width);
    /* Read the one-row range table once; its variables are retained for all rows. */
    if _n_ = 1 then set _range;
    set a;
    width = (hi - lo) / 10000;
    if missing(val) then bucket = .;          /* missing values get no bucket */
    else if width = 0 then bucket = 1;        /* all values identical */
    /* max(...,1) puts the minimum itself into bucket 1;
       min(...,10000) guards the maximum against floating-point overshoot. */
    else bucket = min(max(ceil((val - lo) / width), 1), 10000);
run;

/* Step 3: remove the helper table. */
proc datasets lib=work nolist;
    delete _range;
quit;

Answer 2

以下是您可以使用的另一种方法：

生成随机模拟数据

/* Simulate 250,000 uniform random values between -4000 and 4000. */
data have(keep=var_value);
    do row = 1 to 250000;
        /* The seed argument is _N_, so repeated runs reproduce the same values. */
        var_value = 8000 * (ranuni(_N_) - 0.5);
        output;
    end;
run;

解决方案

/*Desired number of buckets.*/
    %let num_buckets = 10000;

/* Determine the bucket size and the minimum var_value of dataset have and
   store them in the macro variables bucket_size and min_var_value.
   format=best32. is required: without an explicit format, INTO converts
   numeric values with BEST8., which truncates both numbers to about
   8 characters and shifts observations across bucket boundaries. */
    proc sql noprint;
        select (max(var_value)-min(var_value))/&num_buckets. format=best32.,
                min(var_value) format=best32.
        into : bucket_size,
             : min_var_value
        from have;
    quit;
    /* Re-assigning through %let strips the leading blanks left by best32. */
    %let bucket_size = &bucket_size.;
    %let min_var_value = &min_var_value.;
    %put bucketsize: &bucket_size.;
    %put min var_value: &min_var_value.;

/*    1 - Assign buckets using data step.
      Reads dataset have and the macro variables min_var_value, bucket_size
      and num_buckets; writes dataset want, sorted by bucket. */
    data want;
        set have;
        /* max() ignores missing arguments, so an unguarded missing value
           would silently land in bucket 1. */
        if missing(var_value) then bucket = .;
        /* max(...,1): the minimum itself belongs to bucket 1.
           min(...,&num_buckets.): rounding in the macro values must not push
           the largest values into a non-existent extra bucket.
           The parentheses around &min_var_value. keep a negative minimum
           from expanding to "--". */
        else bucket = min(max(ceil((var_value-(&min_var_value.))/&bucket_size.),1), &num_buckets.);
    run;
    proc sort data=want;
        by bucket;
    run;

/* or 2 - Assign buckets using proc sql.
   Reads dataset have and the macro variables min_var_value, bucket_size
   and num_buckets; writes dataset want, ordered by bucket. */
    proc sql;
        create table want as
        select var_value,
               case
                   /* max() ignores missing arguments, so an unguarded missing
                      value would silently land in bucket 1. */
                   when missing(var_value) then .
                   /* With two arguments, min()/max() are row-wise functions here.
                      max(...,1) puts the minimum into bucket 1;
                      min(...,&num_buckets.) caps floating-point overshoot. */
                   else min(max(ceil((var_value-(&min_var_value.))/&bucket_size.),1), &num_buckets.)
               end as bucket
        from have
        order by CALCULATED bucket;
    quit;

做循环分配桶

2 个答案: