我想根据列名的匹配值/类和另一列中提供的信息来重新填充数据框。
这是一个假设的数据框:
> mat.data = data.frame(A = c(rep(1,2),rep(0,2)), B = c(0,rep(1,2),0) ,
+ C = rep(0,4), D = c(rep(0,3),1), cat = c(rep("A",2),"C","B"))
> mat.data
A B C D cat
1 0 0 0 A
1 1 0 0 A
0 1 0 0 C
0 0 0 1 B
我以某种方式设法通过使用匹配函数(例如match(mat.data[,5],colnames(mat.data[1:4]))
)来提取匹配值。但是,我在合理的时间内无法获得想要的输出。
我想根据数据的列名与第5列之间的真实匹配来重新填充0-1值(因此,当给定行的第5列为A时,我想在“列名为“ A”,其他列为“ 0”)。
为获得更好的解释,所需的输出为:
> mat.data
A B C D cat
1 0 0 0 A
1 0 0 0 A
0 0 1 0 C
0 1 0 0 B
任何使它整洁且不那么复杂的建议都是很好的。
答案 0 :(得分:4)
一种可能的方法是使用model.matrix
重新创建矩阵,但首先要确保cat
变量具有与原始矩阵的列名称相对应的级别:
mat.data$cat <- factor(mat.data$cat, levels = head(names(mat.data), -1))
new.mat <- data.frame(model.matrix( ~ mat.data$cat - 1))
names(new.mat) <- levels(mat.data$cat)
new.mat
A B C D
1 1 0 0 0
2 1 0 0 0
3 0 0 1 0
4 0 1 0 0
答案 1 :(得分:3)
带有data.table::dcast
的另一个选项:
library(data.table)
setDT(mat.data)
mat.data[, cat := factor(cat, levels = names(mat.data)[1:4])]
res <- dcast(mat.data, cat + seq_along(cat) ~ cat, fun.agg = length, fill = 0, drop = c(T, F))
res[, cat_1 := NULL]
# > res
# cat A B C D
# 1: A 1 0 0 0
# 2: A 1 0 0 0
# 3: B 0 1 0 0
# 4: C 0 0 1 0
答案 2 :(得分:3)
这是使用sapply
并依赖于逻辑到数字转换的一种方式:
> cat <- c("A", "A", "C", "B")
> lvls <- LETTERS[1:4]
>
> mat.data <- t(sapply(cat, function(x) as.numeric(lvls == x)))
> colnames(mat.data) <- lvls
> mat.data
A B C D
A 1 0 0 0
A 1 0 0 0
C 0 0 1 0
B 0 1 0 0
到目前为止所有答案的时间:
> microbenchmark(
+ model.matrix = {
+ mat.data = data.frame(A = c(rep(1,2),rep(0,2)), B = c(0,rep(1,2),0) ,
+ C = rep(0,4), D = c(rep(0,3),1), cat = c(rep("A",2),"C","B"))
+ mat.data$cat <- factor(mat.data$cat, levels = head(names(mat.data), -1))
+ new.mat <- data.frame(model.matrix( ~ mat.data$cat - 1))
+ names(new.mat) <- levels(mat.data$cat)
+ },
+ dcast = {
+ mat.data = data.frame(A = c(rep(1,2),rep(0,2)), B = c(0,rep(1,2),0) ,
+ C = rep(0,4), D = c(rep(0,3),1), cat = c(rep("A",2),"C","B"))
+ setDT(mat.data)
+ mat.data[, cat := factor(cat, levels = names(mat.data)[1:4])]
+ res <- dcast(mat.data, cat + seq_along(cat) ~ cat, fun.agg = length, fill = 0, drop = c(T, F))
+ res[, cat_1 := NULL]
+ },
+ outer = {
+ mat.data = data.frame(A = c(rep(1,2),rep(0,2)), B = c(0,rep(1,2),0) ,
+ C = rep(0,4), D = c(rep(0,3),1), cat = c(rep("A",2),"C","B"))
+ match_cols <- setdiff(names(mat.data), "cat")
+ new.data <- outer(X = mat.data[["cat"]], Y = match_cols, stringi::stri_count_fixed)
+ colnames(new.data) <- match_cols
+ cbind(new.data, mat.data["cat"])
+ },
+ sapply = {
+ mat.data = data.frame(A = c(rep(1,2),rep(0,2)), B = c(0,rep(1,2),0) ,
+ C = rep(0,4), D = c(rep(0,3),1), cat = c(rep("A",2),"C","B"))
+ lvls <- LETTERS[1:4]
+ new.mat <- t(sapply(mat.data$cat, function(x) as.numeric(lvls == x)))
+ colnames(new.mat) <- lvls
+ },
+ tidy = {
+ mat.data = data.frame(A = c(rep(1,2),rep(0,2)), B = c(0,rep(1,2),0) ,
+ C = rep(0,4), D = c(rep(0,3),1), cat = c(rep("A",2),"C","B"))
+ mat.data[5] %>%
+ rowid_to_column %>%
+ mutate(value=1) %>%
+ spread(cat,value, fill=0) %>%
+ select(-rowid)
+ }
+ )
Using 'cat' as value column. Use 'value.var' to override (x100)
Unit: microseconds
expr min lq mean median uq max neval
model.matrix 894.835 1027.983 1185.7946 1173.6940 1313.258 1640.453 100
dcast 4432.031 4935.079 5603.5700 5290.8000 5725.408 12495.376 100
outer 508.123 564.671 666.4618 610.9195 758.261 1008.386 100
sapply 463.534 496.724 611.6146 549.5260 672.997 2526.964 100
tidy 3936.329 4525.921 5000.3296 4917.7735 5257.409 10660.893 100
答案 3 :(得分:1)
使用!j2
和# File base-template-params.yml.inc
- string:
name: foo
default: FOO
description: Foofoo
# File templates.yml
- job-template: &base-template
name: base-params
parameters:
!include base-template-params.yml.inc
- job-template:
name: more-params
<< : *base-template
parameters:
!include base-template-params.yml.inc
- string:
name: bar
default: BAR
description: Baaaa
- project:
name: thing
jobs:
- more-params
的解决方案
outer
没有stringi::stri_count_fixed
,您可以做到
match_cols <- setdiff(names(mat.data), "cat")
new.data <- outer(X = mat.data[["cat"]], Y = match_cols, stringi::stri_count_fixed)
colnames(new.data) <- match_cols
cbind(new.data, mat.data["cat"])
# A B C D cat
#1 1 0 0 0 A
#2 1 0 0 0 A
#3 0 0 1 0 C
#4 0 1 0 0 B
答案 4 :(得分:1)
这是基于tidyverse
的{{1}}解决方案:
tidyr::spread
如您所见,library(tidyverse)
mat.data[5] %>%
rowid_to_column %>%
mutate(value=1) %>%
spread(cat,value, fill=0) %>%
select(-rowid)
# A B C
# 1 1 0 0
# 2 1 0 0
# 3 0 0 1
# 4 0 1 0
不存在,但是如果D
列中有任何"D"
,它将存在。