我有一个矩阵,其中包含我希望创建它的矩阵元素的列ide的索引
> index
[,1] [,2] [,3]
[1,] 1 2 3
[2,] 1 2 5
[3,] 1 3 4
[4,] 1 3 5
[5,] 1 4 5
[6,] 2 3 5
[7,] 3 4 5
示例第一行的列id为1,2,3,设置为值1
第二行的列id为1,2,5,设置为值1
now I want to create the following matrix:
a1 a2 a3 a4 a5
[1,] 1 1 1 0 0
[2,] 1 1 0 0 1
[3,] 1 0 1 1 0
[4,] 1 0 1 0 1
[5,] 1 0 0 1 1
[6,] 0 1 1 0 1
[7,] 0 0 1 1 1
数据
index <- rbind(c(1,2,3), c(1,2,5), c(1,3,4), c(1,3,5), c(1,4,5), c(2,3,5), c(3,4,5))
答案 0 :(得分:5)
这是使用矩阵索引在评论中提到的极其快速和有效的基本R方法。
# construct 0 matrix with correct dimensions
newMat <- matrix(0L, nrow(myMat), max(myMat))
# fill in matrix using matrix indexing
newMat[cbind(c(row(myMat)), c(myMat))] <- 1L
返回
newMat
[,1] [,2] [,3] [,4] [,5]
[1,] 1 1 1 0 0
[2,] 1 1 0 0 1
[3,] 1 0 1 1 0
[4,] 1 0 1 0 1
[5,] 1 0 0 1 1
[6,] 0 1 1 0 1
[7,] 0 0 1 1 1
数据强>
myMat <-
structure(c(1L, 1L, 1L, 1L, 1L, 2L, 3L, 2L, 2L, 3L, 3L, 4L, 3L,
4L, 3L, 5L, 4L, 5L, 5L, 5L, 5L), .Dim = c(7L, 3L))
答案 1 :(得分:2)
以下是基础R的解决方案:
# The original matrix with indices
m.idx <- matrix(c(1,1,1,1,1,2,3,2,2,3,3,4,3,4,3,5,4,5,5,5,5), ncol = 3);
# For loop method
method.for_loop <- function(m) {
m.val <- matrix(0, nrow = nrow(m), ncol = max(m));
for (i in 1:nrow(m.idx)) m.val[i, m.idx[i, ]] <- 1;
return(m.val);
}
# lapply method (@Headpoint)
method.lapply <- function(m) {
m.val <- as.data.frame(matrix(0, nrow = nrow(m), ncol = max(m)));
invisible(lapply(1:nrow(m),
function(x) m.val[x,][m[x,]] <<- 1));
return(m.val);
}
# Direct indexing method (@lmo)
method.indexing <- function(m) {
m.val <- matrix(0L, nrow(m.idx), max(m.idx));
m.val[cbind(c(row(m.idx)), c(m.idx))] <- 1L;
return(m.val);
}
# tidyr/dplyr method (@CPak)
method.dplyr_tidyr <- function(m) {
as.data.frame(m) %>%
gather() %>% # wide-to-long format
group_by(key) %>%
mutate(rn = row_number()) %>% # add unique row_id per `key` group
mutate(newval = 1) %>% # fill in `existing` with this value
ungroup() %>% # ungroup and unselect `key` group
select(-key) %>%
spread(value, newval, fill=0) %>% # long-to-wide format
# fill in `non-existing` with `0`
select(-rn) %>% # unselect row_id column
rename_all(funs(paste0("a", .))) # rename columns
}
请参阅下文,了解此处提供的所有方法的基准测试结果。我已将所有方法包装在函数
中microbenchmark
library(microbenchmark);
library(tidyr);
library(dplyr);
library(magrittr);
res <- microbenchmark(
for_loop = method.for_loop(m.idx),
lapply = method.lapply(m.idx),
indexing = method.indexing(m.idx),
dplyr_tidyr = method.dplyr_tidyr(m.idx),
times = 1000L
)
print(res);
# Unit: microseconds
# expr min lq mean median uq max
# for_loop 6.796 9.5405 16.89643 13.497 20.445 96.537
# lapply 1315.765 1441.5990 1696.74392 1518.256 1675.027 66181.880
# indexing 5.695 8.1450 20.49116 14.918 20.094 3139.946
# dplyr_tidyr 18777.669 20525.8095 22225.51936 21647.120 23215.714 84791.858
的结果如下:
lapply
结论:使用for循环或直接索引的方法是最紧密的。 tidyr
是第二个,dplyr
/ file
方法最慢(但请注意运行时的大幅增加)。
答案 2 :(得分:1)
没有for
循环,但没有真正检查它是否更快。
index <- matrix(c(1,1,1,1,1,2,3,2,2,3,3,4,3,4,3,5,4,5,5,5,5),
ncol = 3)
df <- as.data.frame(matrix(0, nrow = 7, ncol = 5))
invisible(lapply(1:nrow(index),
function(x) df[x,][index[x,]] <<- 1))
df
# V1 V2 V3 V4 V5
# 1 1 1 1 0 0
# 2 1 1 0 0 1
# 3 1 0 1 1 0
# 4 1 0 1 0 1
# 5 1 0 0 1 1
# 6 0 1 1 0 1
# 7 0 0 1 1 1
答案 3 :(得分:0)
您可以结合使用dplyr
和tidyr
您的数据
df <- read.table(text="1 2 3
1 2 5
1 3 4
1 3 5
1 4 5
2 3 5
3 4 5", header=FALSE)
解决方案:其中一些步骤是清理输出
df %>%
gather() %>% # wide-to-long format
group_by(key) %>%
mutate(rn = row_number()) %>% # add unique row_id per `key` group
mutate(newval = 1) %>% # fill in `existing` with this value
ungroup() %>% # ungroup and unselect `key` group
select(-key) %>%
spread(value, newval, fill=0) %>% # long-to-wide format
# fill in `non-existing` with `0`
select(-rn) %>% # unselect row_id column
rename_all(funs(paste0("a", .))) # rename columns
输出
# A tibble: 7 x 5
a1 a2 a3 a4 a5
* <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 1 0 0
2 1 1 0 0 1
3 1 0 1 1 0
4 1 0 1 0 1
5 1 0 0 1 1
6 0 1 1 0 1
7 0 0 1 1 1