还在学习这个很棒的包data.table。我正在处理以下data.table:
function query($query) {
return $this->link->prepare($query);
}
我正在尝试生成新列(age_gender band)为(" F0_34"," F35_44"," F45_54"," F55_59&#34 ; ........" F95_GT")和(" M0_34"," M35_44"," M45_54"," ; M55_59" ........" M95_GT")基于列性和年龄的值,将生成它们的名称和值。我能够以一种简单的方式做到:
demo <- data.table(id = c(1, 2, 3, 4, 5, 6), sex = c(1, 2, 1, 2, 2, 2), agef = c(43, 53, 63, 73, 83, 103))
demo:
id sex agef
1 1 43
2 2 53
3 1 63
4 2 73
5 2 83
6 2 103
但是我正在寻找一个优雅的解决方案,我试图将age_band作为lapply函数中的列表传递,如下所示:
demo <- demo[ ,F0_34:= {ifelse((sex==2) & (agef >= 0) & (agef <= 34), 1, 0)}]
我得到了所需的输出:
i <- list("0_34","35_44","45_54","55_59","60_64","65_69","70_74","75_79","80_84","85_89","90_94","95_GT")
demo[, paste0("F", i) := lapply(i, function(i)lapply(.SD, function(x){
l1 <- unlist(str_split(i, "_"))
if(l1[2] == "GT") l1[2] <- 1000
l1 <- as.numeric(l1)
score <- ifelse((sex==2) & (agef >= l1[1]) & (agef <= l1[2]), 1, 0)
return(score)
})), .SDcols = c("sex", "agef"), by = id]
demo[, paste0("M", i) := lapply(i, function(i)lapply(.SD, function(x){
l1 <- unlist(str_split(i, "_"))
if(l1[2] == "GT") l1[2] <- 1000
l1 <- as.numeric(l1)
score <- ifelse((sex==1) & (agef >= l1[1]) & (agef <= l1[2]), 1, 0)
return(score)
})), .SDcols = c("sex", "agef"), by = id]
但有一些警告:
id sex agef F0_34 F35_44 F45_54 F55_59 F60_64 F65_69 F70_74 F75_79 F80_84 F85_89 F90_94 F95_GT M0_34 M35_44 M45_54 M55_59 M60_64 M65_69 M70_74 M75_79 M80_84 M85_89 M90_94 M95_GT
1 1 43 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
2 2 53 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
3 1 63 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
4 2 73 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
5 2 83 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
6 2 103 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
我无法理解,有人能指出我做错了吗?
答案 0 :(得分:3)
这是你正在寻找的东西:
age.brackets <- c(0,seq(35,55, by=10), seq(60,95, by=5), Inf) #age ranges
ranges <- (cut(demo$agef, age.brackets))
split(demo, demo$sex)
spread <- table(demo$agef, ranges) #identify persons in each range
male.spread <- (demo$sex=='1')*as.matrix(spread)
female.spread <- (demo$sex=='2')*as.matrix(spread)
newdt <- data.table(
cbind(
demo,
matrix(as.vector(male.spread), ncol=ncol(male.spread)),
matrix(as.vector(female.spread), ncol=ncol(female.spread))
)
)
#column names
names(newdt) <- c(names(demo),
levels(cut(demo$agef, age.brackets)),
levels(cut(demo$agef, age.brackets))
)
female.names <- gsub('.(\\d*),(\\d*|Inf).', 'F\\1_\\2', levels(cut(demo$agef, age.brackets)))
male.names <- gsub('.(\\d*),(\\d*|Inf).', 'M\\1_\\2', levels(cut(demo$agef, age.brackets)))
names(newdt) <- c(names(demo), female.names, male.names)
newdt
# id sex agef F0_35 F35_45 F45_55 F55_60 F60_65 F65_70 F70_75 F75_80 F80_85 F85_90
# 1: 1 1 43 0 1 0 0 0 0 0 0 0 0
# 2: 2 2 53 0 0 0 0 0 0 0 0 0 0
# 3: 3 1 63 0 0 0 0 1 0 0 0 0 0
# 4: 4 2 73 0 0 0 0 0 0 0 0 0 0
# 5: 5 2 83 0 0 0 0 0 0 0 0 0 0
# 6: 6 2 103 0 0 0 0 0 0 0 0 0 0
# F90_95 F95_Inf M0_35 M35_45 M45_55 M55_60 M60_65 M65_70 M70_75 M75_80 M80_85 M85_90
# 1: 0 0 0 0 0 0 0 0 0 0 0 0
# 2: 0 0 0 0 1 0 0 0 0 0 0 0
# 3: 0 0 0 0 0 0 0 0 0 0 0 0
# 4: 0 0 0 0 0 0 0 0 1 0 0 0
# 5: 0 0 0 0 0 0 0 0 0 0 1 0
# 6: 0 0 0 0 0 0 0 0 0 0 0 0
# M90_95 M95_Inf
# 1: 0 0
# 2: 0 0
# 3: 0 0
# 4: 0 0
# 5: 0 0
# 6: 0 1
答案 1 :(得分:3)
这应该有用{更多data.table
- y:
cut_points <- c(0, seq(35, 55, by = 10), seq(60, 95, by = 5),Inf)
new_names_m <- paste0("M", cut_points[1:12], "_", c(cut_points[2:12], "GT"))
new_names_f <- paste0("F", cut_points[1:12], "_", c(cut_points[2:12], "GT"))
demo[sex == 1, ranges := cut(agef, cut_points, include.lowest = TRUE,
labels = new_names_m)]
demo[sex == 2, ranges := cut(agef, cut_points, include.lowest = TRUE,
labels = new_names_f)]
demo[ ,(c(new_names_m, new_names_f)) :=
lapply(c(new_names_m, new_names_f), function(x) +(ranges == x))]
demo[ , ranges := NULL]
> demo
id sex agef M0_35 M35_45 M45_55 M55_60 M60_65 M65_70 M70_75 M75_80 M80_85 M85_90 M90_95 M95_GT F0_35 F35_45 F45_55 F55_60 F60_65
1: 1 1 43 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2: 2 2 53 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
3: 3 1 63 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
4: 4 2 73 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
5: 5 2 83 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
6: 6 2 103 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
F65_70 F70_75 F75_80 F80_85 F85_90 F90_95 F95_GT
1: 0 0 0 0 0 0 0
2: 0 0 0 0 0 0 0
3: 0 0 0 0 0 0 0
4: 0 1 0 0 0 0 0
5: 0 0 0 1 0 0 0
6: 0 0 0 0 0 0 1
或者,代替倒数第二行中的lapply
,可以将虚拟对象初始化为零,然后将其分配到适当的位置:
new_names = c(new_names_f, new_names_m)
demo[ , (new_names) := 0L]
is = which(demo$ranges != "")
js = 3L + match(demo$ranges[is], new_names)
for (iter in seq_along(is)) set(demo, i = is[iter], j = js[iter], value = 1L)