映射多个值

时间:2015-02-24 19:21:59

标签: r

我需要像你这样的专家帮助解决问题,这对我的R技能来说太大了。

我有一个矢量和一个data.frame:

vec = c("v1;v2","v3","v4","v5;v6")

vecNames = c("v1","v2","v3","v4","v5","v6")
vecNames
## [1] "v1" "v2" "v3" "v4" "v5" "v6"

vecDescription = c("descr1","descr2","descr3","descr4","descr5","descr6")
vecDescription
## [1] "descr1" "descr2" "descr3" "descr4" "descr5" "descr6"

df = data.frame(vecNames, vecDescription)
df
  vecNames vecDescription
1       v1         descr1
2       v2         descr2
3       v3         descr3
4       v4         descr4
5       v5         descr5
6       v6         descr6

data.frame用于注释。

mapping = df$vecDescription[match(vec, df$vecNames)]

输出符合预期:

as.vector(mapping)
## [1] NA "descr3" "descr4" NA

但我想:

## [1] "descr1;descr2" "descr3" "descr4" "descr5;descr6"  

我使用for循环成功了,但是当应用于500k行时,这种方法非常慢。

6 个答案:

答案 0 :(得分:6)

另一个基础R解决方案:

  L <- strsplit(vec,split = ';')
  R <- as.character(df$vecDescription)[match(unlist(L),df$vecNames)]
  sapply(relist(R, L), paste, collapse=';')

和基准:

f.m <- function(vec,df) {
  L <- strsplit(vec,split = ';')
  R <- with(df,vecDescription[match(unlist(L),vecNames)])
  sapply(relist(R, L), paste, collapse=';')
}

f.m2 <- function(vec,df) {
  L <- strsplit(vec,split = ';')
  R <- as.character(df$vecDescription)[match(unlist(L),df$vecNames)]
  sapply(relist(R, L), paste, collapse=';')
}

f.j <- function(vec,df) {
  elts = strsplit(vec, ";")
  mapping = df$vecDescription[match(do.call(c, elts), df$vecNames)]
  tapply(mapping, rep(1:length(elts), sapply(elts, length)), 
         paste, collapse = ';')
}

f.da <- function(vec,df) {
  vec <- strsplit(vec, ";")
  sapply(vec, function(x) with(df, paste(vecDescription[vecNames %in% x], collapse = ";")))
}


f.da2 <- function(vec,df) {
  vapply(vec, function(x) with(df, paste(vecDescription[vecNames %in% x], collapse = ";")), character(1))
}

library(data.table)
library(reshape2)
f.eddi <- function(vec,df) {

  dt = as.data.table(df) # or use setDT to convert in place
   setkey(dt, vecNames)
   dt[melt(strsplit(vec, split = ";"))][,
                                       paste(vecDescription, collapse = ";"), by = L1][, V1]
}

f.eddi2 <- function(vec,df) {
  setkey(dt, vecNames)

  melt2 = function(l) data.table(value = unlist(l, use.names = F),
                                 L1    = unlist(lapply(seq_along(l), 
                                                       function(i) rep(i, length(l[[i]]))),
                                                use.names = F))
  dt[melt2(strsplit(vec, split = ";"))][,
                                       paste(vecDescription, collapse = ";"), by = L1][, V1]
}


f.Metrics <- function(vec,df) {
  x1<-strsplit(vec,";")
  x2<-data.frame(do.call(rbind,x1))
  x3<-df$vecDescription[df$vecNames %in% x2[,1]]
  x4<-df$vecDescription[df$vecNames %in% x2[,2]]
  sapply(1:length(x1),function(i){ifelse(x3[i]!=x4[i],paste(x3[i],x4[i],sep=";"),paste(x3[i]))})
}

df2 = data.frame(vecNames, vecDescription, stringsAsFactors = FALSE)

library('microbenchmark')
microbenchmark(f.m(vec,df), f.j(vec,df2), f.da(vec,df), f.da2(vec,df), f.eddi(vec,df))

结果:

Unit: microseconds
               expr      min        lq      mean    median        uq      max neval  cld
       f.m(vec, df)  186.414  218.6155  263.8829  231.8240  248.3900 2506.887   100  b  
      f.m2(vec, df)   94.751  113.4995  124.3000  122.1635  134.3795  195.045   100 a   
      f.j(vec, df2)  211.411  231.2145  254.2509  242.9275  261.9220  481.501   100  b  
      f.da(vec, df)  145.689  176.9130  199.1804  185.8020  195.6595 1383.394   100 ab  
     f.da2(vec, df)  117.027  140.6245  153.2124  150.5025  157.9735  298.111   100 ab  
    f.eddi(vec, df) 3396.690 3586.1695 3799.5835 3648.2905 3762.6335 6468.448   100    d
 f.Metrics(vec, df)  748.323  789.5460  881.9349  809.0135  833.5465 3335.045   100   c 

<强> [更新]

正如@eddi正确指出的那样,应该使用一个更大的数据集来实现更真实的基准测试,所以我们在这里:

n <- 1000
set.seed(1)
sample1 <- sample(n)
sample2 <- sample(n)
vec <- sapply(sample1, function(i) if (runif(1)>0.5) paste0('v',c(i,sample(n,size=1)),collapse=';') else paste0('v',i))
vecNames <- paste0('v', sample2)
vecDescription <- paste0('descr', sample2)
df = data.frame(vecNames, vecDescription)
df2 = data.frame(vecNames, vecDescription, stringsAsFactors = FALSE)

library('microbenchmark')

microbenchmark(f.m2(vec,df2),fj(vec,df2),f.da2(vec,df2),f.eddi2(vec,df2),f。Metrics(vec,df2))

结果:

Unit: milliseconds
               expr        min         lq       mean    median         uq       max neval   cld
       f.m(vec, df)  31.679775  35.682250  38.813526  38.53798  41.278268  50.94508   100  b   
      f.m2(vec, df)   8.384308   9.596091  10.833422  10.32222  10.954757  18.33386   100 a    
      f.j(vec, df2)   4.665586   5.216920   6.003011   5.65613   6.184318  12.32919   100 a    
      f.da(vec, df)  87.810338  94.419069  98.369134  96.63011 101.004672 165.76800   100   c  
     f.da2(vec, df)  84.199736  89.024529  94.053774  91.57543  94.448173 171.84077   100   c  
    f.eddi(vec, df) 276.079649 299.699244 314.580860 311.82896 329.421674 352.73114   100    d 
 f.Metrics(vec, df) 482.671849 496.465168 507.629372 505.23325 513.390346 594.13570   100     e

现在冠军是f.j(),比f.m2()快两倍,而其他功能则慢一个数量级。

[更新2]

在此基准测试中,n = 5000,并且所有函数都以df2作为输入(字符串是字符):

Unit: milliseconds
                expr        min         lq       mean     median         uq        max neval  cld
      f.m2(vec, df2)   44.97854   47.12005   51.13561   48.58260   55.11687   85.57911   100  b  
       f.j(vec, df2)   24.03023   26.03697   28.10994   27.09699   28.45757   39.77269   100 a   
     f.da2(vec, df2) 1150.06311 1236.57530 1276.34064 1269.03829 1296.79251 1583.44486   100    d
   f.eddi2(vec, df2)   65.88291   68.06959   72.89662   70.05462   76.19301  178.73181   100   c 
 f.Metrics(vec, df2)   54.54662   57.37777   59.95356   58.41737   62.15440   69.84452   100  b  

另一个基准,n = 50000:

Unit: milliseconds
                expr      min       lq     mean   median        uq       max neval cld
      f.m2(vec, df2) 551.7985 602.0489 659.5792 638.6707  685.9923 1135.1548   100  b 
       f.j(vec, df2) 340.2615 415.2678 454.9885 447.5994  494.9217  661.5898   100 a  
   f.eddi2(vec, df2) 833.3205 920.6528 979.3859 963.0641 1018.2014 1519.3684   100   c
 f.Metrics(vec, df2) 795.4200 895.8132 970.6516 954.8318 1001.6742 1427.0432   100   c

和最后一个,n = 500000:

Unit: seconds
                expr       min        lq      mean    median        uq       max neval  cld
      f.m2(vec, df2)  7.420941  7.645800  8.047706  7.978916  8.301547  9.134872    10  b  
       f.j(vec, df2)  5.043295  5.316371  5.925725  5.514834  6.288766  8.289737    10 a   
   f.eddi2(vec, df2) 11.190716 11.373425 12.144147 11.935814 12.487354 14.798366    10   c 
 f.Metrics(vec, df2) 13.086297 13.859301 14.143273 14.149004 14.524544 15.151098    10    d

答案 1 :(得分:5)

您需要执行以下操作:

df = data.frame(vecNames, vecDescription, stringsAsFactors = FALSE)
elts = strsplit(vec, ";")
mapping = df$vecDescription[match(do.call(c, elts), df$vecNames)]
tapply(mapping, rep(1:length(elts), sapply(elts, length)), 
       paste, collapse = ';')

请注意data.frame定义中的stringsAsFactors = FALSE。从根本上说,仍然有一个使用tapply的循环,但我不认为它可以被矢量化。

答案 2 :(得分:5)

library(data.table)
library(reshape2)

dt = as.data.table(df) # or use setDT to convert in place

setkey(dt, vecNames)

dt[melt(strsplit(vec, split = ";"))][,
   paste(vecDescription, collapse = ";"), by = L1][, V1]
#[1] "descr1;descr2" "descr3"        "descr4"        "descr5;descr6"

对于大数据,melt将成为瓶颈,您可以使用以下功能:

melt2 = function(l) data.table(value = unlist(l, use.names = F),
                               L1    = unlist(lapply(seq_along(l), 
                                                     function(i) rep(i, length(l[[i]]))),
                                              use.names = F))

答案 3 :(得分:5)

这是另一个基础R快速解决方案

vec <- strsplit(vec, ";")
sapply(vec, function(x) with(df, paste(vecDescription[vecNames %in% x], collapse = ";")))
## [1] "descr1;descr2" "descr3"        "descr4"        "descr5;descr6"

或者我们可以在

中使用vapply加快速度
vapply(vec, function(x) with(df, paste(vecDescription[vecNames %in% x], collapse = ";")), character(1))

答案 4 :(得分:2)

x1<-strsplit(vec,";")
x2<-data.frame(do.call(rbind,x1))
x3<-df$vecDescription[df$vecNames %in% x2[,1]]
x4<-df$vecDescription[df$vecNames %in% x2[,2]]
x5<-lapply(1:length(x1),function(i){ifelse(x3[i]!=x4[i],paste(x3[i],x4[i],sep=";"),paste(x3[i]))})

> x5
[[1]]
[1] "descr1;descr2"

[[2]]
[1] "descr3"

[[3]]
[1] "descr4"

[[4]]
[1] "descr5;descr6"

答案 5 :(得分:2)

Simpe使用我坚持的 qdap

library(qdap)
mgsub(vecNames, vecDescription, vec)

## [1] "descr1;descr2" "descr3"        "descr4"        "descr5;descr6"

如果您对 qdap 的开发版本进行基准测试,mgsub显着降低内存负担并且更快。这个简短的脚本将下载开发版:

if (!require("pacman")) install.packages("pacman")
pacman::p_load_gh("trinker/qdap")