我的数据集有5500个观测值。我想按年龄和糖尿病状态从中抽取1000个观测值。年龄分为5类:
和2类糖尿病
我想按照这些比例抽取1000个观测值:
我已经看过以前的讨论但由于场景不同而无法做到。谢谢。
答案 0 :(得分:0)
这是一种方法
生成一些假数据
# Build a reproducible fake data set: nSamples rows, each with an age drawn
# from 31..79 and a Yes/No diabetic flag, both sampled with replacement.
nSamples <- 5500
set.seed(1)
data <- data.frame(
  age = sample(31:79, nSamples, replace = TRUE),
  diabetic = sample(c('Yes', 'No'), nSamples, replace = TRUE),
  # Since R 4.0.0 data.frame() keeps strings as character by default.
  # Code that later calls levels(data$diabetic) needs a factor, so ask for
  # one explicitly instead of relying on the old default.
  stringsAsFactors = TRUE
)
# Bin age into five right-closed groups: (30,40], (40,50], ..., (70,80].
data$age <- cut(data$age, c(30, 40, 50, 60, 70, 80))
# Joint age x diabetic proportions, with row and column totals appended.
addmargins(prop.table(table(data)))
假数据具有以下分布
diabetic
age No Yes Sum
(30,40] 0.10145455 0.10563636 0.20709091
(40,50] 0.10763636 0.10400000 0.21163636
(50,60] 0.10363636 0.09509091 0.19872727
(60,70] 0.09818182 0.09109091 0.18927273
(70,80] 0.09581818 0.09745455 0.19327273
Sum 0.50672727 0.49327273 1.00000000
获取仅代表目标年龄概率的分布
# Samples based on age probabilities
# Draw a subsample of about 1000 rows whose age-group shares follow prob_age,
# sampling without replacement inside each age group.
# Target share of each age group, in the order of levels(data$age).
prob_age <- c(0.8, 4.8, 12.8, 24.4, 57.2)
prob_age <- prob_age / sum(prob_age)
# Rows to draw per age group. round() matters: the product can land a hair
# below a whole number in floating point, and sample() truncates a
# non-integer size, which would silently drop a row.
n_age <- round(prob_age * 1000)
age_levels <- levels(data$age)
ageSampleIndex <- unlist(lapply(seq_along(age_levels), function(i) {
  rows <- which(data$age == age_levels[i])
  # Index through sample.int() rather than sample(rows, ...): sample() would
  # treat a single row number k as the range 1:k.
  rows[sample.int(length(rows), n_age[i])]
}))
# Joint proportions of the subsample; the "Sum" column shows the age shares.
addmargins(prop.table(table(data[ageSampleIndex, ])))
检查年龄分布
diabetic
age No Yes Sum
(30,40] 0.004 0.004 0.008
(40,50] 0.024 0.024 0.048
(50,60] 0.067 0.061 0.128
(60,70] 0.130 0.114 0.244
(70,80] 0.281 0.291 0.572
Sum 0.506 0.494 1.000
获取仅代表目标糖尿病概率的分布
# Samples based on diabetic probabilities
# Draw a subsample of about 1000 rows whose diabetic shares follow
# prob_diabetic, sampling without replacement inside each diabetic group.
# as.factor() keeps an existing factor unchanged and also copes with a
# character column, where levels() alone would return NULL.
diabetic_levels <- levels(as.factor(data$diabetic))
# Target share of each diabetic group, in the order of diabetic_levels.
prob_diabetic <- c(32.8, 67.2)
prob_diabetic <- prob_diabetic / sum(prob_diabetic)
# Rows to draw per group. round() matters: 32.8 / 100 * 1000 evaluates to
# just under 328, and sample() truncates a non-integer size, so without
# rounding only 327 + 672 = 999 rows are drawn.
n_diabetic <- round(prob_diabetic * 1000)
diabeticSampleIndex <- unlist(lapply(seq_along(diabetic_levels), function(j) {
  rows <- which(data$diabetic == diabetic_levels[j])
  # Index through sample.int() rather than sample(rows, ...): sample() would
  # treat a single row number k as the range 1:k.
  rows[sample.int(length(rows), n_diabetic[j])]
}))
# Joint proportions of the subsample; the "Sum" row shows the diabetic shares.
addmargins(prop.table(table(data[diabeticSampleIndex, ])))
检查仅限糖尿病的分布
diabetic
age No Yes Sum
(30,40] 0.06306306 0.15215215 0.21521522
(40,50] 0.07407407 0.14914915 0.22322322
(50,60] 0.07407407 0.13013013 0.20420420
(60,70] 0.05905906 0.12112112 0.18018018
(70,80] 0.05705706 0.12012012 0.17717718
Sum 0.32732733 0.67267267 1.00000000
获得年龄和糖尿病分布
# Samples based on age and diabetic probabilities
# Draw a subsample of about 1000 rows whose joint age x diabetic shares are
# the product of the two marginal targets, sampling without replacement
# inside each (age group, diabetic group) cell.
# Outer product: rows follow prob_age, columns follow prob_diabetic.
prob_age_diabetic <- prob_age %*% t(prob_diabetic)
# Rows to draw per cell. The products are fractional (e.g. 2.624) and
# sample() truncates a non-integer size, which previously left only 995 of
# the intended 1000 rows. With the proportions used here the rounded counts
# add up to exactly 1000.
n_prob_age_diabetic <- round(prob_age_diabetic * 1000)
age_levels <- levels(data$age)
# as.factor() keeps an existing factor unchanged and also copes with a
# character column, where levels() alone would return NULL.
diabetic_levels <- levels(as.factor(data$diabetic))
# One row per cell; expand.grid() varies its first argument fastest, so the
# cells run age 1 / diabetic 1, age 1 / diabetic 2, age 2 / diabetic 1, ...
cells <- expand.grid(
  diabetic = seq_along(diabetic_levels),
  age = seq_along(age_levels)
)
ageDiabeticSampleIndex <- unlist(Map(function(i, j) {
  rows <- which(data$age == age_levels[i] &
                  data$diabetic == diabetic_levels[j])
  # Index through sample.int() rather than sample(rows, ...): sample() would
  # treat a single row number k as the range 1:k.
  rows[sample.int(length(rows), n_prob_age_diabetic[i, j])]
}, cells$age, cells$diabetic))
# Joint proportions of the subsample, with row and column totals appended.
addmargins(prop.table(table(data[ageDiabeticSampleIndex, ])))
检查年龄和糖尿病分布
diabetic
age No Yes Sum
(30,40] 0.002010050 0.005025126 0.007035176
(40,50] 0.015075377 0.032160804 0.047236181
(50,60] 0.041206030 0.086432161 0.127638191
(60,70] 0.080402010 0.163819095 0.244221106
(70,80] 0.187939698 0.385929648 0.573869347
Sum 0.326633166 0.673366834 1.000000000