进行调查后,我以数据框的形式收集了结果。这是实际数据框外观的可复制版本。
library(dplyr)
library(tidyr)
df=data.frame(ID=c("1101","1102","1103","1104",
"1105","1106","1107","1108",
"1109","1110","1111","1112",
"1113","1114","1115","1116",
"1117","1118","1119","1120",
"1121","1122","1123","1124",
"1125","1126","1127","1128",
"1129","1130","1131","1132",
"1133","1134","1135","1136",
"1137","1138","1139","1140",
"1141","1142","1143","1144",
"1145","1146","1147","1148",
"1149","1150","1151","1152",
"1153","1154","1155","1156"),
Country=c("US","UK","Canada","Mexico",
"India","US","Peru","China",
"US","UK","Canada","Mexico",
"Portugal","India","Portugal","Mexico",
"Peru","India","Canada","Mexico",
"India","UK","India","Canada",
"US","UK","China","India",
"US","Mexico","Canada","Mexico",
"Canada","China","Canada","Canada",
"China","China","India","Mexico",
"Portugal","Portugal","Portugal","Portugal",
"UK","UK","UK","Peru",
"Peru","Mexico","US","US",
"Peru","Mexico","Peru","Mexico"),
Gender=c("Male","Male","Male","Female",
"Female","Female","Male","Female",
"Female","Female","Male","Female",
"Male","Male","Female","Female",
"Female","Male","Female","Female",
"Female","Female","Male","Female",
"Male","Female","Male","Female",
"Female","Male","Female","Female",
"Male","Male","Male","Female",
"Male","Male","Female","Female",
"Male","Female","Male","Female",
"Male","Female","Male","Female",
"Male","Female","Male","Female",
"Male","Male","Male","Male"),
Age=c("<25","25-35","25-35","36-45",
">55",">55","25-35",">55",
"<25","25-35","25-35","36-45",
"25-35","25-35","25-35","36-45",
">55","36-45","46-55","36-45",
">55","46-55","25-35","46-55",
"<25","46-55","25-35","46-55",
"25-35","25-35","46-55","36-45",
"<25","<25",">55","36-45",
"36-45","46-55","<25","<25",
"<25",">55","36-45","46-55",
"<25",">55","36-45","46-55",
"36-45",">55","36-45","46-55",
"<25","46-55","<25","46-55"),
Score_Q1=c(4,4,3,2,
1,1,4,2,
1,1,1,2,
2,1,4,3,
4,3,1,1,
1,2,1,1,
1,4,1,4,
3,4,3,3,
1,3,3,1,
1,1,2,1,
1,2,1,2,
1,1,1,1,
2,2,2,2,
1,2,3,4),
Score_Q2=c(1,4,1,1,
1,2,1,1,
1,4,4,4,
2,1,1,3,
4,3,1,1,
1,3,3,3,
2,4,1,2,
4,4,4,4,
1,1,1,1,
1,2,3,4,
4,4,2,1,
1,2,3,2,
1,2,1,2,
4,3,2,1))
数据帧可以分为以下部分-
1) ID :受访者ID
2)国家/地区:受访者的原籍国
3)性别:受访者的性别
4)年龄:受访者年龄
5) Score_Q1 :Q1的满意度得分,范围从1
(非常满意)到4
(非常不满意)。
6) Score_Q2 :Q2的满意度得分,从1
(非常满意)到4
(非常不满意)。
首先进行一些数据清理-
#convert to factor
df$Country=as.factor(df$Country)
df$Gender=as.factor(df$Gender)
df$Age=as.factor(df$Age)
现在,我检查数据集中的年龄和性别比率-
性别由Country
#1) Gender by Country
split_gender=df %>% select(Country,Gender) %>%
group_by(Gender,Country) %>%
summarise(n=n()) %>%
ungroup() %>%
select(Country,Gender,n) %>%
group_by(Country,add=TRUE) %>%
spread(Country,n)
split_gender=data.frame(apply(split_gender, 2, as.numeric))
split_gender_sample=as.data.frame(sweep(split_gender,2,colSums(split_gender),`/`))
split_gender_sample[1,1]="Female"
split_gender_sample[2,1]="Male"
Age
由Country
#2) Age by Country
split_age=df %>% select(Country,Age) %>%
group_by(Age,Country) %>%
summarise(n=n()) %>%
ungroup() %>%
select(Country,Age,n) %>%
group_by(Country,add=TRUE) %>%
spread(Country,n)
split_age=data.frame(apply(split_age, 2, as.numeric))
split_age[is.na(split_age)] <- 0
split_age_sample=as.data.frame(sweep(split_age,2,colSums(split_age),`/`))
split_age_sample[1,1]="<25"
split_age_sample[2,1]=">55"
split_age_sample[3,1]="25-35"
split_age_sample[4,1]="36-45"
split_age_sample[5,1]="46-55"
#Clean up unwanted dataframes
rm(list=c('split_age','split_gender'))
以上两个步骤为我提供了两个数据帧-split_age_sample
和split_gender_sample
。这些数据框包含了我的56位受访者的年龄和性别抽样比率。
我的目标:基于人口统计数据计算抽样权重
为了使我的数据框更能代表现实,我想根据国家/地区的年龄和性别的官方人口比率将权重归因于受访者
这些是我在接受调查的国家/地区发现的官方人口比例。
#Gender by Country
split_gender_official=data.frame(Gender=c("Female","Male"),
Canada=c(0.4,0.6),
China=c(0.3,0.7),
India=c(0.3,0.7),
Mexico=c(0.5,0.5),
Peru=c(0.6,0.4),
Portugal=c(0.5,0.5),
UK=c(0.4,0.6),
US=c(0.4,0.6))
#Age by Country
split_age_official=data.frame(Age=c("<25",">55","25-35","36-45","46-55"),
Canada=c(0.1,0.3,0.3,0.2,0.1),
China=c(0.3,0.05,0.35,0.1,0.2),
India=c(0.5,0.05,0.35,0.05,0.05),
Mexico=c(0.2,0.3,0.2,0.1,0.2),
Peru=c(0.1,0.3,0.2,0.2,0.2),
Portugal=c(0.2,0.1,0.05,0.05,0.6),
UK=c(0.2,0.3,0.1,0.3,0.1),
US=c(0.2,0.3,0.1,0.3,0.1))
所需的输出
根据我的抽样比例以及年龄和性别的正式人口比例,我想在一个称为weights
的单独列中为受访者分配权重。
目前,我无法弄清楚如何进行此计算。
然后,一旦计算了权重,我想使用weights
列来汇总分数。聚合看起来像这样(除了权重已计入计算)-
示例:英国的加权汇总分数
#Calculate weighted overall scores by Country & Gender: example UK
weighted_aggregated_scores_gender=df %>%
select(-Age) %>%
group_by(Country,Gender) %>%
filter(Country=='UK') %>%
summarise(Q1_KPI=round(sum(Score_Q1 %in% c(1,2)/n()),2),
Q2_KPI=round(sum(Score_Q2 %in% c(1,2)/n()),2))
我真的很感谢我在随后的加权聚合步骤中对权重计算及其用法所能提供的任何帮助。
答案 0 :(得分:0)
不知道它是否正是您要寻找的东西,但是这是我想出的。您需要将国家权重与数据框合并,然后才能计算KPI。
> # Reshape national weights
> Nombres <- cbind.data.frame("Country" = colnames(split_gender_official)[colnames(split_gender_official) != "Gender"],
+ "time" = 1:length(colnames(split_gender_official)[colnames(split_gender_official) != "Gender"]))
> Nombres$Country <- as.character(Nombres$Country)
>
> split_gender_official_resh <- reshape(split_gender_official, direction = "long", varying = Nombres$Country, v.names = "Weights_gend")
> split_age_official_resh <- reshape(split_age_official, direction = "long", varying = Nombres$Country, v.names = "Weights_age")
>
> split_gender_official_resh$id <- NULL
> split_age_official_resh$id <- NULL
>
> split_gender_official_resh <- merge(split_gender_official_resh, Nombres, by = "time", all.x = TRUE)
> split_age_official_resh <- merge(split_age_official_resh, Nombres, by = "time", all.x = TRUE)
>
> split_gender_official_resh$time <- NULL
> split_age_official_resh$time <- NULL
> # Merge weights with df
> df <- merge(df, split_gender_official_resh, by = c("Gender", "Country"), all.x = TRUE)
> df <- merge(df, split_age_official_resh, by = c("Age", "Country"), all.x = TRUE)
>
> # Print tables
>
> # Without weights
>
> prop.table(table(df$Gender, df$Country), 2)
Canada China India Mexico Peru Portugal UK US
Female 0.5000000 0.2000000 0.5714286 0.7000000 0.3333333 0.5000000 0.5714286 0.5714286
Male 0.5000000 0.8000000 0.4285714 0.3000000 0.6666667 0.5000000 0.4285714 0.4285714
> prop.table(table(df$Age, df$Country), 2)
Canada China India Mexico Peru Portugal UK US
<25 0.1250000 0.2000000 0.1428571 0.1000000 0.3333333 0.1666667 0.1428571 0.4285714
>55 0.1250000 0.2000000 0.2857143 0.1000000 0.1666667 0.1666667 0.1428571 0.1428571
25-35 0.2500000 0.2000000 0.2857143 0.1000000 0.1666667 0.3333333 0.2857143 0.1428571
36-45 0.1250000 0.2000000 0.1428571 0.5000000 0.1666667 0.1666667 0.1428571 0.1428571
46-55 0.3750000 0.2000000 0.1428571 0.2000000 0.1666667 0.1666667 0.2857143 0.1428571
>
> # With weights
> prop.table(xtabs(Weights_gend ~ Gender + Country, df), 2)
Country
Gender Canada China India Mexico Peru Portugal UK US
Female 0.40000000 0.09677419 0.36363636 0.70000000 0.42857143 0.50000000 0.47058824 0.47058824
Male 0.60000000 0.90322581 0.63636364 0.30000000 0.57142857 0.50000000 0.52941176 0.52941176
> prop.table(xtabs(Weights_age ~ Gender + Country, df), 2)
Country
Gender Canada China India Mexico Peru Portugal UK US
Female 0.3333333 0.0500000 0.4642857 0.6250000 0.4545455 0.7142857 0.5000000 0.5000000
Male 0.6666667 0.9500000 0.5357143 0.3750000 0.5454545 0.2857143 0.5000000 0.5000000
>
> # Means with weights and scores
> tapply(df$Score_Q1 * df$Weights_gend, list(df$Gender, df$Country), mean)
Canada China India Mexico Peru Portugal UK US
Female 0.6 0.60 0.600000 1.000000 1.5 1.3333333 0.8 0.7
Male 1.2 1.05 1.166667 1.666667 1.0 0.6666667 1.2 1.4
> tapply(df$Score_Q1 * df$Weights_age, list(df$Age, df$Country), mean)
Canada China India Mexico Peru Portugal UK US
<25 0.1000000 0.90 1.00 0.20 0.2 0.20 0.20 0.4
>55 0.9000000 0.10 0.05 0.60 1.2 0.20 0.30 0.3
25-35 0.6000000 0.35 0.35 0.80 0.8 0.15 0.25 0.3
36-45 0.2000000 0.10 0.15 0.22 0.4 0.05 0.30 0.6
46-55 0.1666667 0.20 0.20 0.60 0.2 1.20 0.30 0.2
>
希望有帮助。