基于另一个矩阵创建矩阵包含元素索引

时间:2017-10-24 21:11:10

标签: r

我有一个矩阵,其中包含我希望创建它的矩阵元素的列ide的索引

>  index
         [,1] [,2] [,3] 
    [1,]    1    2    3    
    [2,]    1    2    5    
    [3,]    1    3    4    
    [4,]    1    3    5    
    [5,]    1    4    5    
    [6,]    2    3    5    
    [7,]    3    4    5    

示例第一行的列id为1,2,3,设置为值1

第二行的列id为1,2,5,设置为值1

now I want to create the following matrix:


     a1 a2 a3  a4 a5
[1,]  1  1  1  0  0
[2,]  1  1  0  0  1
[3,]  1  0  1  1  0
[4,]  1  0  1  0  1
[5,]  1  0  0  1  1  
[6,]  0  1  1  0  1
[7,]  0  0  1  1  1

数据

index <- rbind(c(1,2,3), c(1,2,5), c(1,3,4), c(1,3,5), c(1,4,5), c(2,3,5), c(3,4,5))

4 个答案:

答案 0 :(得分:5)

这是使用矩阵索引在评论中提到的极其快速和有效的基本R方法。

# construct 0 matrix with correct dimensions
newMat <- matrix(0L, nrow(myMat), max(myMat))

# fill in matrix using matrix indexing
newMat[cbind(c(row(myMat)), c(myMat))] <- 1L

返回

newMat
     [,1] [,2] [,3] [,4] [,5]
[1,]    1    1    1    0    0
[2,]    1    1    0    0    1
[3,]    1    0    1    1    0
[4,]    1    0    1    0    1
[5,]    1    0    0    1    1
[6,]    0    1    1    0    1
[7,]    0    0    1    1    1

数据

myMat <-
structure(c(1L, 1L, 1L, 1L, 1L, 2L, 3L, 2L, 2L, 3L, 3L, 4L, 3L, 
4L, 3L, 5L, 4L, 5L, 5L, 5L, 5L), .Dim = c(7L, 3L))

答案 1 :(得分:2)

以下是基础R的解决方案:

# The original matrix with indices
m.idx <- matrix(c(1,1,1,1,1,2,3,2,2,3,3,4,3,4,3,5,4,5,5,5,5), ncol = 3);

# For loop method
method.for_loop <- function(m) {
    m.val <- matrix(0, nrow = nrow(m), ncol = max(m));
    for (i in 1:nrow(m.idx)) m.val[i, m.idx[i, ]] <- 1;
    return(m.val);
}

# lapply method (@Headpoint)
method.lapply <- function(m) {
    m.val <- as.data.frame(matrix(0, nrow = nrow(m), ncol = max(m)));
    invisible(lapply(1:nrow(m),
                   function(x) m.val[x,][m[x,]] <<- 1));
    return(m.val);
}

# Direct indexing method (@lmo)
method.indexing <- function(m) {
    m.val <- matrix(0L, nrow(m.idx), max(m.idx));
    m.val[cbind(c(row(m.idx)), c(m.idx))] <- 1L;
    return(m.val);
}

# tidyr/dplyr method (@CPak)
method.dplyr_tidyr <- function(m) {
    as.data.frame(m) %>%
      gather() %>%                   # wide-to-long format
      group_by(key) %>%
      mutate(rn = row_number()) %>%  # add unique row_id per `key` group
      mutate(newval = 1) %>%         # fill in `existing` with this value
      ungroup() %>%                  # ungroup and unselect `key` group
      select(-key) %>%
      spread(value, newval, fill=0) %>%  # long-to-wide format
                                         # fill in `non-existing` with `0`
      select(-rn) %>%                    # unselect row_id column
      rename_all(funs(paste0("a", .)))   # rename columns
}

更新

请参阅下文,了解此处提供的所有方法的基准测试结果。我已将所有方法包装在函数

microbenchmark

library(microbenchmark); library(tidyr); library(dplyr); library(magrittr); res <- microbenchmark( for_loop = method.for_loop(m.idx), lapply = method.lapply(m.idx), indexing = method.indexing(m.idx), dplyr_tidyr = method.dplyr_tidyr(m.idx), times = 1000L ) print(res); # Unit: microseconds # expr min lq mean median uq max # for_loop 6.796 9.5405 16.89643 13.497 20.445 96.537 # lapply 1315.765 1441.5990 1696.74392 1518.256 1675.027 66181.880 # indexing 5.695 8.1450 20.49116 14.918 20.094 3139.946 # dplyr_tidyr 18777.669 20525.8095 22225.51936 21647.120 23215.714 84791.858 的结果如下:

lapply

结论:使用for循环或直接索引的方法是最紧密的。 tidyr是第二个,dplyr / file方法最慢(但请注意运行时的大幅增加)。

答案 2 :(得分:1)

没有for循环,但没有真正检查它是否更快。

index <- matrix(c(1,1,1,1,1,2,3,2,2,3,3,4,3,4,3,5,4,5,5,5,5), 
                                   ncol = 3)
df <- as.data.frame(matrix(0, nrow = 7, ncol = 5))

invisible(lapply(1:nrow(index), 
                   function(x) df[x,][index[x,]] <<- 1))
df
# V1 V2 V3 V4 V5
# 1  1  1  1  0  0
# 2  1  1  0  0  1
# 3  1  0  1  1  0
# 4  1  0  1  0  1
# 5  1  0  0  1  1
# 6  0  1  1  0  1
# 7  0  0  1  1  1

答案 3 :(得分:0)

您可以结合使用dplyrtidyr

来执行此操作

您的数据

df <- read.table(text="1    2    3    
1    2    5    
1    3    4    
1    3    5    
1    4    5    
2    3    5    
3    4    5", header=FALSE)

解决方案:其中一些步骤是清理输出

df %>%
  gather() %>%                   # wide-to-long format
  group_by(key) %>%              
  mutate(rn = row_number()) %>%  # add unique row_id per `key` group
  mutate(newval = 1) %>%         # fill in `existing` with this value
  ungroup() %>%                  # ungroup and unselect `key` group
  select(-key) %>%
  spread(value, newval, fill=0) %>%  # long-to-wide format
                                     # fill in `non-existing` with `0`
  select(-rn) %>%                    # unselect row_id column
  rename_all(funs(paste0("a", .)))   # rename columns

输出

# A tibble: 7 x 5
     a1    a2    a3    a4    a5
* <dbl> <dbl> <dbl> <dbl> <dbl>
1     1     1     1     0     0
2     1     1     0     0     1
3     1     0     1     1     0
4     1     0     1     0     1
5     1     0     0     1     1
6     0     1     1     0     1
7     0     0     1     1     1