根据列名称中的匹配值和R中的特定列创建0-1数据框

时间:2018-10-31 12:54:28

标签: r dataframe match

我想根据列名的匹配值/类和另一列中提供的信息来重新填充数据框。

这是一个假设的数据框:

> mat.data = data.frame(A = c(rep(1,2),rep(0,2)), B = c(0,rep(1,2),0) , 
+                       C = rep(0,4), D = c(rep(0,3),1), cat = c(rep("A",2),"C","B"))
> mat.data
 A B C D cat
 1 0 0 0   A
 1 1 0 0   A
 0 1 0 0   C
 0 0 0 1   B 

我以某种方式设法通过使用匹配函数(例如match(mat.data[,5],colnames(mat.data[1:4])))来提取匹配值。但是,我在合理的时间内无法获得想要的输出。

我想根据数据的列名与第5列之间的真实匹配来重新填充0-1值(因此,当给定行的第5列为A时,我想在“列名为“ A”,其他列为“ 0”)。

为获得更好的解释,所需的输出为:

> mat.data
 A B C D cat
 1 0 0 0   A
 1 0 0 0   A
 0 0 1 0   C
 0 1 0 0   B 

任何使它整洁且不那么复杂的建议都是很好的。

5 个答案:

答案 0 :(得分:4)

一种可能的方法是使用model.matrix重新创建矩阵,但首先要确保cat变量具有与原始矩阵的列名称相对应的级别:

mat.data$cat <- factor(mat.data$cat, levels = head(names(mat.data), -1))
new.mat <- data.frame(model.matrix( ~  mat.data$cat - 1))
names(new.mat) <- levels(mat.data$cat)

new.mat
  A B C D
1 1 0 0 0
2 1 0 0 0
3 0 0 1 0
4 0 1 0 0

答案 1 :(得分:3)

带有data.table::dcast的另一个选项:

library(data.table)
setDT(mat.data)
mat.data[, cat := factor(cat, levels = names(mat.data)[1:4])]
res <- dcast(mat.data, cat + seq_along(cat) ~ cat, fun.agg = length, fill = 0, drop = c(T, F))
res[, cat_1 := NULL]

# > res
#    cat A B C D
# 1:   A 1 0 0 0
# 2:   A 1 0 0 0
# 3:   B 0 1 0 0
# 4:   C 0 0 1 0

答案 2 :(得分:3)

这是使用sapply并依赖于逻辑到数字转换的一种方式:

> cat <- c("A", "A", "C", "B")
> lvls <- LETTERS[1:4]
> 
> mat.data <- t(sapply(cat, function(x) as.numeric(lvls == x)))
> colnames(mat.data) <- lvls
> mat.data
  A B C D
A 1 0 0 0
A 1 0 0 0
C 0 0 1 0
B 0 1 0 0

到目前为止所有答案的时间:

> microbenchmark(
+   model.matrix = {
+     mat.data = data.frame(A = c(rep(1,2),rep(0,2)), B = c(0,rep(1,2),0) , 
+                                         C = rep(0,4), D = c(rep(0,3),1), cat = c(rep("A",2),"C","B"))
+     mat.data$cat <- factor(mat.data$cat, levels = head(names(mat.data), -1))
+     new.mat <- data.frame(model.matrix( ~  mat.data$cat - 1))
+     names(new.mat) <- levels(mat.data$cat)
+   },
+   dcast = {
+     mat.data = data.frame(A = c(rep(1,2),rep(0,2)), B = c(0,rep(1,2),0) , 
+                           C = rep(0,4), D = c(rep(0,3),1), cat = c(rep("A",2),"C","B"))
+     setDT(mat.data)
+     mat.data[, cat := factor(cat, levels = names(mat.data)[1:4])]
+     res <- dcast(mat.data, cat + seq_along(cat) ~ cat, fun.agg = length, fill = 0, drop = c(T, F))
+     res[, cat_1 := NULL]
+   },
+   outer = {
+     mat.data = data.frame(A = c(rep(1,2),rep(0,2)), B = c(0,rep(1,2),0) , 
+                           C = rep(0,4), D = c(rep(0,3),1), cat = c(rep("A",2),"C","B"))
+     match_cols <- setdiff(names(mat.data), "cat")
+     new.data <- outer(X = mat.data[["cat"]], Y = match_cols, stringi::stri_count_fixed)
+     colnames(new.data) <- match_cols
+     cbind(new.data, mat.data["cat"])
+   },
+   sapply = {
+     mat.data = data.frame(A = c(rep(1,2),rep(0,2)), B = c(0,rep(1,2),0) , 
+                           C = rep(0,4), D = c(rep(0,3),1), cat = c(rep("A",2),"C","B"))
+     lvls <- LETTERS[1:4]
+     new.mat <- t(sapply(mat.data$cat, function(x) as.numeric(lvls == x)))  
+     colnames(new.mat) <- lvls
+   },
+   tidy = {
+     mat.data = data.frame(A = c(rep(1,2),rep(0,2)), B = c(0,rep(1,2),0) , 
+                           C = rep(0,4), D = c(rep(0,3),1), cat = c(rep("A",2),"C","B"))
+     mat.data[5] %>% 
+       rowid_to_column %>% 
+       mutate(value=1) %>% 
+       spread(cat,value, fill=0) %>%
+       select(-rowid)
+   }
+ )
Using 'cat' as value column. Use 'value.var' to override (x100)
Unit: microseconds
         expr      min       lq      mean    median       uq       max neval
 model.matrix  894.835 1027.983 1185.7946 1173.6940 1313.258  1640.453   100
        dcast 4432.031 4935.079 5603.5700 5290.8000 5725.408 12495.376   100
        outer  508.123  564.671  666.4618  610.9195  758.261  1008.386   100
       sapply  463.534  496.724  611.6146  549.5260  672.997  2526.964   100
         tidy 3936.329 4525.921 5000.3296 4917.7735 5257.409 10660.893   100

答案 3 :(得分:1)

使用!j2# File base-template-params.yml.inc - string: name: foo default: FOO description: Foofoo # File templates.yml - job-template: &base-template name: base-params parameters: !include base-template-params.yml.inc - job-template: name: more-params << : *base-template parameters: !include base-template-params.yml.inc - string: name: bar default: BAR description: Baaaa - project: name: thing jobs: - more-params 的解决方案

outer

没有stringi::stri_count_fixed,您可以做到

match_cols <- setdiff(names(mat.data), "cat")
new.data <- outer(X = mat.data[["cat"]], Y = match_cols, stringi::stri_count_fixed)
colnames(new.data) <- match_cols
cbind(new.data, mat.data["cat"])
#  A B C D cat
#1 1 0 0 0   A
#2 1 0 0 0   A
#3 0 0 1 0   C
#4 0 1 0 0   B

答案 4 :(得分:1)

这是基于tidyverse的{​​{1}}解决方案:

tidyr::spread

如您所见,library(tidyverse) mat.data[5] %>% rowid_to_column %>% mutate(value=1) %>% spread(cat,value, fill=0) %>% select(-rowid) # A B C # 1 1 0 0 # 2 1 0 0 # 3 0 0 1 # 4 0 1 0 不存在,但是如果D列中有任何"D",它将存在。