我的数据:
# Edge table: `from` and `to` hold numeric ids, `values` is the payload
# attached to each row.
data1 <- data.frame(
  from   = c(1, 2, 13, 4),
  to     = c(4, 3, 9, 1),
  values = c(12, 56, 67, 78)
)

# Lookup table pairing each numeric `id` with a `place` name.
data2 <- data.frame(
  place = c("NY", "London", "Brest", "Nantes"),
  id    = c(1, 2, 3, 4)
)
我的结果:
from to values
1 4 12
2 3 56
13 9 67
4 1 78
place id
NY 1
London 2
Brest 3
Nantes 4
我期望使用 dplyr 包中的 join 函数得到如下结果(保存在新表中):
from to values
NY Nantes 12
London Brest 56
Nantes NY 78
我尝试了什么:
# Attempted join: a single `by` maps both `from` and `to` onto the same `id`
# column of data2, so the two columns are not looked up independently.
data3<- inner_join (data1, data2, by =c("from" = "id", "to" = "id"))
# Print the result of the attempt.
data3
一些参考文献:
https://stat545-ubc.github.io/bit001_dplyr-cheatsheet.html
https://cran.r-project.org/web/packages/dplyr/vignettes/two-table.html
一个混合数据的更好示例:
考虑到我有 50 列数据,其中地理数据(“地点”)和非地理数据(等级、值)混在一起。我不想改变数据框(d.f.)中列的顺序,并且想保留我的列名。
# Mixed table: label/value columns interleaved with five `*place` columns
# that hold numeric ids.
data1 <- data.frame(
  levels1     = c("name1", "name2", "name3", "name4"),
  value1      = c(4, 3, 9, 1),
  firstplace  = c(1, 2, 13, 4),
  secondplace = c(1, 2, 2, 4),
  value2      = c(78, 3000, 90, 101),
  thirdplace  = c(1, 1, 2, 4),
  fourthplace = c(4, 4, 4, 4),
  fifthplace  = c(1, 2, 3, 4),
  value3      = c(12, 56, 67, 78)
)

# Lookup table pairing each numeric `id` with a `place` name.
data2 <- data.frame(
  place = c("NY", "London", "Brest", "Nantes"),
  id    = c(1, 2, 3, 4)
)
具有不同名称的示例(更复杂?)
我不想改变数据框(d.f.)中列的顺序,并且想保留我的列名。
# Same layout as the previous example, but the id-holding columns have
# unrelated names (shops, after_sales_service, provider, seller, maker).
data1 <- data.frame(
  levels1             = c("name1", "name2", "name3", "name4"),
  value1              = c(4, 3, 9, 1),
  shops               = c(1, 2, 13, 4),
  after_sales_service = c(1, 2, 2, 4),
  value2              = c(78, 3000, 90, 101),
  provider            = c(1, 1, 2, 4),
  seller              = c(4, 4, 4, 4),
  maker               = c(1, 2, 3, 4),
  value3              = c(12, 56, 67, 78)
)

# Lookup table pairing each numeric `id` with a `place` name.
data2 <- data.frame(
  place = c("NY", "London", "Brest", "Nantes"),
  id    = c(1, 2, 3, 4)
)
答案 0 :(得分:4)
您可以把 data2 用作查找表,而不是进行连接(join):
library(dplyr)

# Translate a vector of ids into place names using data2; ids that do not
# occur in data2$id come back as NA (that is what match() returns for them).
id_to_place <- function(ids) data2$place[match(ids, data2$id)]

# Replace the ids in `from`/`to` by place names, then keep only the rows
# that have no NA in any column.
data1 <- data1 %>%
  mutate(from = id_to_place(from), to = id_to_place(to)) %>%
  filter(complete.cases(.))
给出:
> data1
from to values
1 NY Nantes 12
2 London Brest 56
3 Nantes NY 78
使用 data.table 包的替代解决方案:
library(data.table)

# Convert data1 to a data.table by reference.
setDT(data1)

# Overwrite `from` and `to` in place with the matching place names
# (ids missing from data2$id become NA).
data1[, `:=`(
  from = data2$place[match(from, data2$id)],
  to = data2$place[match(to, data2$id)]
)]

# Show the rows that contain no NA.
na.omit(data1)
你也可以连续做两次 left_join:
# Join data2 twice: once keyed on `from`, once keyed on `to`.
# The two joined `place` columns are disambiguated by dplyr's default
# suffixes.
data1 %>%
  left_join(data2, by = c("from" = "id")) %>%
  left_join(data2, by = c("to" = "id")) %>%
  # Drop the first two columns (the original id columns of data1).
  select(-1, -2) %>%
  # Keep only rows where every remaining column is non-NA.
  filter(complete.cases(.))
更新1:如果您有多个名称必须匹配的列,最好先将数据帧转换为长格式。更大数据集的示例:
library(dplyr)
library(tidyr)

data1 %>%
  # Long format: one row per (values, original column name, id).
  gather(key = var, value = val, -values) %>%
  # Attach the place name belonging to each id.
  left_join(data2, by = c("val" = "id")) %>%
  # The id column (third column at this point) is no longer needed.
  select(-val) %>%
  # Discard ids that had no match in data2.
  filter(!is.na(place)) %>%
  # Back to wide format, one column per original column name.
  spread(key = var, value = place)
给出:
values fifthplace firstplace fourthplace from secondplace thirdplace to
1 12 NY NY Nantes NY NY NY Nantes
2 56 London London Nantes London London NY Brest
3 67 Brest <NA> Nantes <NA> London London <NA>
4 78 Nantes Nantes Nantes Nantes Nantes Nantes NY
使用 data.table ,您可以:
library(data.table)

# Long format: every column except `values` is stacked into
# `variable` / `value`.
long <- melt(setDT(data1), id.vars = "values")

# Join data2 onto the stacked ids, then reshape back to one column per
# original variable, filled with the place names.
dcast(
  long[data2, on = c(value = "id")],
  values ~ variable,
  value.var = "place"
)
给你相同的结果。
更新2:为了回应问题的第二次更新,您可以使用 dplyr / tidyr 按照以下方式进行操作:
data1 %>%
  # Stack only the five id-holding columns; all other columns stay as they
  # are.
  gather(
    key = var, value = val,
    firstplace, secondplace, thirdplace, fourthplace, fifthplace
  ) %>%
  # Attach the place name belonging to each id.
  left_join(data2, by = c("val" = "id")) %>%
  # Drop the id column now that `place` is available.
  select(-val) %>%
  # Back to wide format, one column per stacked column name.
  spread(key = var, value = place)
给出:
levels1 value1 value2 value3 fifthplace firstplace fourthplace secondplace thirdplace
1 name1 4 78 12 NY NY Nantes NY NY
2 name2 3 3000 56 London London Nantes London NY
3 name3 9 90 67 Brest <NA> Nantes London London
4 name4 1 101 78 Nantes Nantes Nantes Nantes Nantes
或使用 data.table :
# Names of the id-holding columns to be translated.
mvars <- paste0(c("first", "second", "third", "fourth", "fifth"), "place")

# Stack the id columns into `variable` / `value`.
molten <- melt(setDT(data1), measure.vars = mvars)

# Join data2 onto the stacked ids, then reshape back to one column per
# original variable, filled with the place names.
dcast(
  molten[data2, on = c(value = "id")],
  levels1 + value1 + value2 + value3 ~ variable,
  value.var = "place"
)
给出相同的结果:
levels1 value1 value2 value3 firstplace secondplace thirdplace fourthplace fifthplace
1: name1 4 78 12 NY NY NY Nantes NY
2: name2 3 3000 56 London London NY Nantes London
3: name3 9 90 67 NA London London Nantes Brest
4: name4 1 101 78 Nantes Nantes Nantes Nantes Nantes
更新3:如果您想使用索引号,可以执行以下操作:
# dplyr / tidyr
data1 %>%
  # Stack the id-holding columns, selected by position (3, 4, 6, 7, 8).
  gather(key = var, value = val, 3, 4, 6:8) %>%
  # Attach the place name belonging to each id.
  left_join(data2, by = c("val" = "id")) %>%
  # Drop the id column now that `place` is available.
  select(-val) %>%
  # Back to wide format, one column per stacked column name.
  spread(key = var, value = place)
# data.table
# Stack the id-holding columns, selected by position (3, 4, 6, 7, 8).
molten <- melt(setDT(data1), measure.vars = c(3, 4, 6:8))

# Join data2 onto the stacked ids, then reshape back to wide format with
# the place names as cell values.
dcast(
  molten[data2, on = c(value = "id")],
  levels1 + value1 + value2 + value3 ~ variable,
  value.var = "place"
)
给出( data.table 输出):
levels1 value1 value2 value3 shops after_sales_service provider seller maker
1: name1 4 78 12 NY NY NY Nantes NY
2: name2 3 3000 56 London London NY Nantes London
3: name3 9 90 67 NA London London Nantes Brest
4: name4 1 101 78 Nantes Nantes Nantes Nantes Nantes
答案 1 :(得分:2)
我们可以使用base R
# First merge: look up the place for `from`.
with_from <- merge(data1, data2, by.x = "from", by.y = "id")

# Second merge: look up the place for `to`, then drop the first two
# columns of the merged result.
merge(with_from, data2, by.x = "to", by.y = "id")[-(1:2)]
对于您的新数据集,我们可以先把去掉“values”列的数据集转换为 matrix,然后再使用 match
# Look up every cell of data1 (all columns but the last) in data2$id in one
# vectorised match(), then restore the original row/column shape.
temp <- as.data.frame(array(
  as.character(data2$place[match(as.matrix(data1[-ncol(data1)]), data2$id)]),
  dim = dim(data1[-ncol(data1)])
))

# Reuse the original column names (everything except the last column).
names(temp) <- names(data1)[-ncol(data1)]

# Put the untouched last column in front of the translated columns.
cbind(data1[ncol(data1)], temp)
# values from to firstplace secondplace thirdplace fourthplace fifthplace
#1 12 NY Nantes NY NY NY Nantes NY
#2 56 London Brest London London NY Nantes London
#3 67 <NA> <NA> <NA> London London Nantes Brest
#4 78 Nantes NY Nantes Nantes Nantes Nantes Nantes
基于OP帖子中的新更新
# Positions of the columns whose name contains "place".
i1 <- which(grepl("place", names(data1)))

# Translate all ids in those columns with a single match(), then restore
# the row/column shape; keep the result as character, not factor.
d1 <- as.data.frame(
  array(
    as.character(data2$place[match(as.matrix(data1[i1]), data2$id)]),
    dim = dim(data1[i1])
  ),
  stringsAsFactors = FALSE
)

# Untranslated columns first, then the translated ones renamed
# place1, place2, ...
d2 <- cbind(data1[-i1], setNames(d1, paste0("place", 1:ncol(d1))))
d2
# levels1 value1 value2 value3 place1 place2 place3 place4 place5
#1 name1 4 78 12 NY NY NY Nantes NY
#2 name2 3 3000 56 London London NY Nantes London
#3 name3 9 90 67 <NA> London London Nantes Brest
#4 name4 1 101 78 Nantes Nantes Nantes Nantes Nantes
如果列名不同,只需更改第2步
# Same as above, but the translated columns keep their original names.
d2 <- cbind(data1[-i1], setNames(d1, nm = names(data1)[i1]))