如何通过2个变量进行分层抽样?

时间:2014-12-23 14:40:05

标签: r random-sample

我的数据集有5500个观测值。我想根据年龄和糖尿病对1000个观察结果进行抽样。年龄分为5类:

  1. 31-40
  2. 41-50
  3. 51-60
  4. 61-70
  5. 大于70
糖尿病分为2类:

  1. YES
  2. NO

我想根据以下比例(%)抽取1000个观测值:

  1. 年龄段:0.8,4.8,12.8,24.4,57.2
  2. 糖尿病频率:32.8,67.2

我已经看过以前的讨论,但由于场景不同而无法套用。谢谢。

1 个答案:

答案 0 :(得分:0)

这是一种方法

生成一些假数据

# Build a reproducible fake population: 5500 rows with an integer age drawn
# from 31..79 and a two-level diabetic flag.
nSamples <- 5500
set.seed(1)
data <- data.frame(
  age = sample(31:79, nSamples, replace = TRUE),
  # Explicit factor: since R 4.0 data.frame() keeps strings as character, and
  # levels() of a character vector is NULL, so the later
  # `levels(data$diabetic)` stratum lookups would match no rows.
  diabetic = factor(sample(c("Yes", "No"), nSamples, replace = TRUE),
                    levels = c("No", "Yes"))
)
# Bin age into five right-closed classes: (30,40], (40,50], ..., (70,80].
data$age <- cut(data$age, breaks = c(30, 40, 50, 60, 70, 80))

# Joint and marginal proportions of the fake population.
addmargins(prop.table(table(data)))

假数据具有以下分布

         diabetic
age               No        Yes        Sum
  (30,40] 0.10145455 0.10563636 0.20709091
  (40,50] 0.10763636 0.10400000 0.21163636
  (50,60] 0.10363636 0.09509091 0.19872727
  (60,70] 0.09818182 0.09109091 0.18927273
  (70,80] 0.09581818 0.09745455 0.19327273
  Sum     0.50672727 0.49327273 1.00000000

获取仅代表目标年龄概率的分布

# Sample 1000 rows so that the age classes follow the target proportions,
# ignoring diabetic status.

# Target percentage per age class, given in the order of levels(data$age).
prob_age <- c(0.8, 4.8, 12.8, 24.4, 57.2)
prob_age <- prob_age / sum(prob_age)
# Round to whole counts: sample() truncates a fractional size, so a product
# that lands just below an integer because of floating-point arithmetic would
# silently drop a row.
n_age <- round(prob_age * 1000)

# Row numbers of `data` grouped by age class; list order follows
# levels(data$age), matching the order of `n_age`.
index_by_age <- split(seq_len(nrow(data)), data$age)
stopifnot(length(index_by_age) == length(n_age),
          lengths(index_by_age) >= n_age)

# Draw positions with sample.int() and index back into the row numbers:
# sample(x, n) would interpret a one-element numeric x as 1:x.
ageSampleIndex <- unlist(
  Map(function(rows, n) rows[sample.int(length(rows), n)],
      index_by_age, n_age),
  use.names = FALSE
)
addmargins(prop.table(table(data[ageSampleIndex, ])))

检查年龄分布

         diabetic
age          No   Yes   Sum
  (30,40] 0.004 0.004 0.008
  (40,50] 0.024 0.024 0.048
  (50,60] 0.067 0.061 0.128
  (60,70] 0.130 0.114 0.244
  (70,80] 0.281 0.291 0.572
  Sum     0.506 0.494 1.000

获取仅代表目标糖尿病概率的分布

# Sample 1000 rows so that diabetic status follows the target proportions,
# ignoring age.

# factor() makes this work whether the column is already a factor or a plain
# character vector (levels() of a character vector is NULL).
diabetic_status <- factor(data$diabetic)
stopifnot(nlevels(diabetic_status) == 2L)

# Target percentages, assigned positionally to the factor levels (which are
# sorted alphabetically when built from character data, i.e. "No" then "Yes").
# NOTE(review): the question lists the classes as YES, NO and the frequencies
# as 32.8, 67.2 -- if 32.8% is meant for "Yes", swap the two values here.
prob_diabetic <- c(32.8, 67.2)
names(prob_diabetic) <- levels(diabetic_status)
prob_diabetic <- prob_diabetic / sum(prob_diabetic)
# Round to whole counts: the unrounded count for the first level comes out
# just below 328, and sample() truncates a fractional size, so without
# rounding only 327 rows are drawn for it (999 in total).
n_diabetic <- round(prob_diabetic * 1000)

# Row numbers of `data` grouped by diabetic status, in level order.
index_by_diabetic <- split(seq_len(nrow(data)), diabetic_status)
stopifnot(lengths(index_by_diabetic) >= n_diabetic)

# Draw positions with sample.int() and index back into the row numbers:
# sample(x, n) would interpret a one-element numeric x as 1:x.
diabeticSampleIndex <- unlist(
  Map(function(rows, n) rows[sample.int(length(rows), n)],
      index_by_diabetic, n_diabetic),
  use.names = FALSE
)
addmargins(prop.table(table(data[diabeticSampleIndex, ])))

检查仅限糖尿病的分布

         diabetic
age               No        Yes        Sum
  (30,40] 0.06306306 0.15215215 0.21521522
  (40,50] 0.07407407 0.14914915 0.22322322
  (50,60] 0.07407407 0.13013013 0.20420420
  (60,70] 0.05905906 0.12112112 0.18018018
  (70,80] 0.05705706 0.12012012 0.17717718
  Sum     0.32732733 0.67267267 1.00000000

获得年龄和糖尿病分布

# Sample 1000 rows stratified jointly by age class and diabetic status. The
# target share of each (age, diabetic) cell is the product of the two marginal
# target proportions, i.e. the two variables are treated as independent.
# Relies on `prob_age` and `prob_diabetic` (normalized, in factor-level order)
# computed by the preceding steps.

# Split `total` into whole-number counts proportional to `prob` using the
# largest-remainder method, so that the counts add up to exactly `total`.
# Keeps the dimensions of `prob`.
allocate_counts <- function(prob, total) {
  exact <- prob / sum(prob) * total
  counts <- floor(exact)
  shortfall <- as.integer(round(total - sum(counts)))
  # Hand the leftover units to the cells with the largest fractional parts.
  bump <- order(exact - counts, decreasing = TRUE)[seq_len(shortfall)]
  counts[bump] <- counts[bump] + 1
  storage.mode(counts) <- "integer"
  counts
}

# factor() makes this work whether the column is already a factor or a plain
# character vector (levels() of a character vector is NULL).
diabetic_status <- factor(data$diabetic)

# Matrix of target cell proportions: rows = age levels, columns = diabetic
# levels.
prob_age_diabetic <- prob_age %*% t(prob_diabetic)
# Whole-number cell sizes. Passing the fractional products straight to
# sample() truncates each of them, which yields 995 rows instead of 1000.
n_prob_age_diabetic <- allocate_counts(prob_age_diabetic, 1000)

# Row numbers of `data` per cell. Listing diabetic status first makes it vary
# fastest, so cells are ordered (age 1, diabetic 1), (age 1, diabetic 2),
# (age 2, diabetic 1), ...
index_by_cell <- split(seq_len(nrow(data)), list(diabetic_status, data$age))
# t() flattens the count matrix in the same diabetic-fastest order.
cell_sizes <- as.vector(t(n_prob_age_diabetic))
stopifnot(length(index_by_cell) == length(cell_sizes),
          lengths(index_by_cell) >= cell_sizes)

# Draw positions with sample.int() and index back into the row numbers:
# sample(x, n) would interpret a one-element numeric x as 1:x.
ageDiabeticSampleIndex <- unlist(
  Map(function(rows, n) rows[sample.int(length(rows), n)],
      index_by_cell, cell_sizes),
  use.names = FALSE
)

addmargins(prop.table(table(data[ageDiabeticSampleIndex, ])))

检查年龄和糖尿病分布

         diabetic
age                No         Yes         Sum
  (30,40] 0.002010050 0.005025126 0.007035176
  (40,50] 0.015075377 0.032160804 0.047236181
  (50,60] 0.041206030 0.086432161 0.127638191
  (60,70] 0.080402010 0.163819095 0.244221106
  (70,80] 0.187939698 0.385929648 0.573869347
  Sum     0.326633166 0.673366834 1.000000000