我有两个数据框:
> a
box hits
1 px085 agx|amx|app
2 px075 gxz|gpx|amr
3 px065 abc|apr|ppy
4 rx055 alo|amx|bbc
5 rx088 ppy|pxg|ptr
6 rx099 prt|ppm|zee
> b
hitcode appid
1 agx 12485
2 abc 18550
3 bbc 19225
4 ppy 15260
5 zee 16880
我试图获得输出:
box hits appcode
1 px085 agx|amx|app 12485
2 px075 gxz|gpx|amr
3 px065 abc|apr|ppy 18550
4 rx055 alo|amx|bbc 19225
5 rx088 ppy|pxg|ptr 15260
6 rx099 prt|ppm|zee 16880
我试过了:
# Look up `appid` values in the global data frame `b` for a hit pattern.
#
# x: a regular expression, matched case-insensitively (Perl syntax) against
#    b$hitcode.
# Returns the `appid` entries of the rows of `b` whose hitcode matches `x`.
# NOTE(review): grep() uses a single pattern; when `x` is a vector such as
# a$hits, only its first element is used and a warning is raised, as the
# transcript that follows shows.
gcode <- function(x){
b[grep(x, b$hitcode, ignore.case = TRUE, perl = TRUE), c('appid')]
}
结果得到:
> gcode(a$hits)
#[1] 12485
#Warning message:
#In grep(x, b$hitcode, ignore.case = TRUE, perl = TRUE) :
# argument 'pattern' has length > 1 and only the first element will be used
我在这里缺少什么?
答案 0 :(得分:1)
根据评论,您的示例中同一行的 hits 可能同时匹配多个 hitcode(即对应多个 appid)。下面是一个使用循环的解决方案:如果存在多个匹配,已有的 appid 不会被覆盖,而是用分号拼接在一起。
我假设您的字符型变量被存储为因子(factor)。如果不是因子,循环就应当遍历 b$hitcode 的每一个元素,即使用 1:length(b$hitcode),而不是 1:nlevels(b$hitcode)。
# Build a character `appid` column on `a`: for every row of `b`, find the rows
# of `a` whose `hits` string contains that hit code and record the matching
# app id. When a row of `a` matches several codes, the ids are joined with ";"
# instead of overwriting each other.
a$appid <- NA_character_
# seq_along() visits every row of `b` whether `hitcode` is a factor or a
# character vector. nlevels() is 0 for a character vector and undercounts when
# a code is repeated, and 1:n misbehaves when `b` has no rows.
for (i in seq_along(b$hitcode)) {
  # as.character() so the values are used as their printed labels even if the
  # columns are stored as factors.
  cur <- as.character(b$hitcode[i])
  app <- as.character(b$appid[i])
  # fixed = TRUE: treat the code as a literal string, not a regular expression.
  hit <- grep(cur, a$hits, fixed = TRUE)
  # Rows that have no id yet get this one; rows that already have one get it
  # appended after a ";".
  na <- is.na(a$appid[hit])
  a$appid[hit[na]] <- app
  a$appid[hit[!na]] <- paste(a$appid[hit[!na]], app, sep = ";")
}
这给出了:
# > a
# box hits appid
# 1 px085 agx|amx|app 12485
# 2 px075 gxz|gpx|amr <NA>
# 3 px065 abc|apr|ppy 18550;15260
# 4 rx055 alo|amx|bbc 19225
# 5 rx088 ppy|pxg|ptr 15260
# 6 rx099 prt|ppm|zee 16880
答案 1 :(得分:1)
你可以尝试:
library(dplyr)
library(tidyr)
library(stringi)
# Split each `hits` string into its three codes, reshape to one row per
# (box, code) pair, look every code up in `b`, then collapse the matches back
# to one row per box.
a %>%
# Keep the original `hits` column and add the pieces as columns "1", "2", "3".
# NOTE(review): relies on separate()'s default `sep` (runs of non-alphanumeric
# characters) to split on "|" - confirm against the installed tidyr version.
separate(hits, into = paste(1:3), remove = FALSE) %>%
# Long format: every column except `box` and `hits` is stacked, with the code
# itself ending up in `value`.
gather(key, value, -box, -hits) %>%
# Attach the appid for codes that appear in b$hitcode; other codes get NA.
left_join(., b, by = c("value" = "hitcode")) %>%
group_by(box, hits) %>%
# Join the group's appids into one string, then extract the runs of digits so
# the "NA" placeholders drop out. The result is a list column, as the printed
# structure further down shows.
summarise(appid = toString(appid) %>% stri_extract_all(., regex = "[:digit:]+"))
这会将 appid 的结果存储在一个列表列(list column)中,您之后可以按需访问:
#Source: local data frame [6 x 3]
#Groups: box
#
# box hits appid
#1 px065 abc|apr|ppy <chr[2]>
#2 px075 gxz|gpx|amr <chr[1]>
#3 px085 agx|amx|app <chr[1]>
#4 rx055 alo|amx|bbc <chr[1]>
#5 rx088 ppy|pxg|ptr <chr[1]>
#6 rx099 prt|ppm|zee <chr[1]>
<strong>结构</strong>
#Classes ‘grouped_df’, ‘tbl_df’, ‘tbl’ and 'data.frame': 6 obs. of 3 variables:
# $ box : chr "px065" "px075" "px085" "rx055" ...
# $ hits : chr "abc|apr|ppy" "gxz|gpx|amr" "agx|amx|app" "alo|amx|bbc" ...
# $ appid:List of 6
# ..$ : chr "18550" "15260"
# ..$ : chr NA
# ..$ : chr "12485"
# ..$ : chr "19225"
# ..$ : chr "15260"
# ..$ : chr "16880"
# - attr(*, "vars")=List of 1
# ..$ : symbol box
# - attr(*, "drop")= logi TRUE
答案 2 :(得分:1)
下面是使用 data.table 的一种解法:
library(data.table)
# For each box, use its `hits` string ("x|y|z") as a regex alternation and
# collect the row indices of `b` whose hitcode matches. The unnamed result
# column is referred to as V1 below. setDT() converts `a` to a data.table.
indx <- setDT(a)[, grep(hits, b$hitcode), by = box]
# Pair every matched row of `b` with its box (unnamed, hence V1), then join the
# appids of each box into one comma-separated string (again unnamed, so V1).
indx2 <- setDT(b)[indx$V1, .(indx$box, appid)][, .(toString(appid)), by = .(box = V1)]
# Key `a` by box so the join below matches on that column.
setkey(a, box)
# Update join: add an `appid` column to `a`, taken from indx2's V1 for the
# boxes that had a match; boxes without a match show NA in the printed result.
a[indx2, appid := i.V1]
a
# box hits appid
# 1: px065 abc|apr|ppy 18550, 15260
# 2: px075 gxz|gpx|amr NA
# 3: px085 agx|amx|app 12485
# 4: rx055 alo|amx|bbc 19225
# 5: rx088 ppy|pxg|ptr 15260
# 6: rx099 prt|ppm|zee 16880