我有以下数据框:
╔══════╦═════════╗
║ Code ║ Airline ║
╠══════╬═════════╣
║ 1 ║ AF ║
║ 1 ║ KL ║
║ 8 ║ AR ║
║ 8 ║ AZ ║
║ 8 ║ DL ║
╚══════╩═════════╝
# Example data: one row per (code, airline) pair. Most airline labels carry a
# trailing space ("DL" does not), exactly as in the original data.
dat <- data.frame(
  Code = c(1L, 1L, 8L, 8L, 8L),
  Airline = factor(
    c("AF ", "KL ", "AR ", "AZ ", "DL"),
    levels = c("AF ", "AR ", "AZ ", "DL", "KL ")
  )
)
我的目标是:针对每家航空公司,找出与它共享代码的所有其他航空公司(即同一个代码还被一家或多家其他航空公司使用)。因此输出应为:
+--------------------+
| Airline SharedWith |
+--------------------+
| AF "KL" |
| KL "AF" |
| AR "AZ","DL" |
+--------------------+
用任意一种命令式语言写的伪代码大致是:
for each code
lookup all rows in the table where the value = code
由于 R 并不是那么面向列表(list)的语言,实现预期输出的最佳方法是什么?
答案 0 :(得分:10)
使用data.table
包的几个选项:
1)使用 strsplit、paste 并按行操作:
library(data.table)
# Option 1: collapse each code's airlines into one comma-separated string, then
# remove every row's own airline from that string again.
setDT(dat)[, Airline := trimws(Airline) # drop leading/trailing whitespace from the labels
           ][, sharedwith := paste(Airline, collapse = ","), by = Code
           ][, sharedwith := {
             # split once and reuse the pieces instead of calling strsplit() twice
             parts <- unlist(strsplit(sharedwith, ","))
             paste(parts[!parts %in% Airline], collapse = ",")
           },
           # one group per row; seq_len() rather than 1:nrow(), because 1:0 is
           # c(1, 0) and would not match an empty table
           by = seq_len(nrow(dat))]
给出:
> dat
Code Airline sharedwith
1: 1 AF KL
2: 1 KL AF
3: 8 AR AZ,DL
4: 8 AZ AR,DL
5: 8 DL AR,AZ
2)使用 strsplit 和 paste,并用 mapply 代替按行号分组的 by:
# Option 2: same idea as option 1, but the per-row clean-up is done by mapply()
# over the (collapsed string, own airline) pairs instead of grouping by row.
setDT(dat)[, Airline := trimws(Airline)
           ][, sharedwith := paste(Airline, collapse = ","), by = Code
           ][, sharedwith := mapply(
             function(s, a) {
               # airlines of the code, minus the row's own airline
               members <- unlist(strsplit(s, ","))
               paste(members[!(members %in% a)], collapse = ",")
             },
             sharedwith, Airline
           )][]
会给你相同的结果。
3)或者将CJ
函数与paste
一起使用(受@ zx8754 expand.grid
解决方案的启发):
library(data.table)
# Option 3: within each code, cross-join the airlines with themselves, drop the
# self-pairs and collapse each airline's partners into one string.
# Both CJ() columns are named explicitly: the name an unnamed argument receives
# ("V2" in older releases, the argument's own name in newer ones) depends on the
# data.table version, so the result must not rely on it.
setDT(dat)[, Airline := trimws(Airline)
           ][, CJ(air = Airline, other = Airline, unique = TRUE)[air != other
             ][, .(shared = paste(other, collapse = ",")), by = air],
             by = Code]
给出:
Code air shared
1: 1 AF KL
2: 1 KL AF
3: 8 AR AZ,DL
4: 8 AZ AR,DL
5: 8 DL AR,AZ
使用 dplyr 和 tidyr 获得所需结果的解决方案(受 @jaimedash 启发):
library(dplyr)
library(tidyr)
# Convert Airline to character and strip surrounding whitespace.
dat <- dat %>% mutate(Airline = trimws(as.character(Airline)))
# Pipeline: copy every airline into SharedWith, nest that column, join the
# nested column back onto dat by Code, unnest it, drop the rows that pair an
# airline with itself, and collapse the remaining partners per (Code, Airline).
# NOTE(review): nest(..., .key = ) and a bare unnest() are the pre-1.0 tidyr
# interface; presumably the nesting ends up as one entry per Code — confirm
# against the installed tidyr version.
dat %>%
mutate(SharedWith = Airline) %>%
group_by(Code) %>%
nest(-Code, -Airline, .key = SharedWith) %>%
left_join(dat, ., by = 'Code') %>%
unnest() %>%
# keep only the pairs of two different airlines
filter(Airline != SharedWith) %>%
group_by(Code, Airline) %>%
# toString() joins the partners with ", "
summarise(SharedWith = toString(SharedWith))
给出:
Code Airline SharedWith
(int) (chr) (chr)
1 1 AF KL
2 1 KL AF
3 8 AR AZ, DL
4 8 AZ AR, DL
5 8 DL AR, AZ
答案 1 :(得分:9)
一个igraph
方法
library(igraph)
# Build a graph from dat; NOTE(review): graph_from_data_frame() presumably reads
# the first two columns (Code, Airline) as an edge list — confirm.
g <- graph_from_data_frame(dat)
# Find neighbours for select nodes
# ego() is called with order 2 and mindist = 2 for every airline vertex, and the
# resulting list is named by dat$Airline. Presumably this yields the vertices
# exactly two steps away, i.e. the airlines sharing a code — verify.
ne <- setNames(ego(g,2, nodes=as.character(dat$Airline), mindist=2), dat$Airline)
ne
#$`AF `
#+ 1/7 vertex, named:
#[1] KL
#$`KL `
#+ 1/7 vertex, named:
#[1] AF
---
---
# Assemble the result: one row per airline, with the vertex names of its
# neighbours joined into a single comma-separated string.
data.frame(
  Airline = names(ne),
  Shared = sapply(ne, function(nbrs) {
    paste(V(g)$name[nbrs], collapse = ",")
  })
)
# Airline Shared
# 1 AF KL
# 2 KL AF
# 3 AR AZ,DL
# 4 AZ AR,DL
# 5 DL AR,AZ
答案 2 :(得分:8)
我认为您需要的只是table
# Example data with whitespace-free airline labels.
dat <- data.frame(
  Code = c(1L, 1L, 8L, 8L, 8L),
  Airline = factor(
    c("AF", "KL", "AR", "AZ", "DL"),
    levels = c("AF", "AR", "AZ", "DL", "KL")
  )
)
# Airline-by-airline co-occurrence counts: entry [a, b] is the number of codes
# used by both a and b. The diagonal (an airline with itself) is zeroed out.
tbl <- crossprod(table(dat))
diag(tbl) <- 0
#        Airline
# Airline AF AR AZ DL KL
#      AF  0  0  0  0  1
#      AR  0  0  1  1  0
#      AZ  0  1  0  1  0
#      DL  0  1  1  0  0
#      KL  1  0  0  0  0
# For every airline (row of tbl), join the names of the columns whose count is
# positive.
dd <- data.frame(
  Airline = colnames(tbl),
  shared = vapply(
    rownames(tbl),
    function(a) paste(colnames(tbl)[tbl[a, ] > 0], collapse = ", "),
    character(1)
  )
)
merge(dat, dd)
# Airline Code shared
# 1 AF 1 KL
# 2 AR 8 AZ, DL
# 3 AZ 8 AR, DL
# 4 DL 8 AR, AZ
# 5 KL 1 AF
答案 3 :(得分:6)
可能还有更高效的做法,但下面这个应该行得通:
# Example data: one row per (code, airline) pair.
d <- data.frame(
  code = c(1, 1, 8, 8, 8),
  airline = c("AF", "KL", "AR", "AZ", "DL"),
  stringsAsFactors = FALSE
)
# Self-join on code to enumerate every ordered pair of airlines sharing a code.
# The join is quadratic in the group size, so it is not necessarily efficient.
d2 <- merge(d, d, by = "code")
# Drop the pairs that match an airline with itself
# (airline.x / airline.y are the column names produced by the merge).
d2 <- subset(d2, airline.x != airline.y)
# Per airline, paste its partners together; stack() then turns the named list
# into a two-column data frame ("values", "ind") that is easy to merge.
d2 <- stack(lapply(split(d2$airline.y, d2$airline.x), paste0, collapse = ","))
# Attach the partner strings to the original rows; "ind" holds the airline.
merge(d, d2, by.x = "airline", by.y = "ind")
# airline code values
#1 AF 1 KL
#2 AR 8 AZ,DL
#3 AZ 8 AR,DL
#4 DL 8 AR,AZ
#5 KL 1 AF
答案 4 :(得分:5)
使用expand.grid和aggregate:
# For every code, pair each airline with the other airlines using that code.
# Returns a data frame with the columns Code, Airline and SharedWith; a code
# used by a single airline contributes no rows.
do.call(rbind,
        lapply(split(dat, dat$Code), function(i) {
          # all ordered pairs of airlines within this code, minus the self-pairs
          x <- expand.grid(i$Airline, i$Airline)
          x <- x[x$Var1 != x$Var2, ]
          # aggregate() stops with "no rows to aggregate" on an empty frame, so
          # skip codes that are not shared with anybody
          if (nrow(x) == 0L) {
            return(NULL)
          }
          x <- aggregate(x$Var2, list(x$Var1), paste, collapse = ",")
          colnames(x) <- c("Airline", "SharedWith")
          # the code is constant within the group: recycle its first value
          # instead of relying on the group having exactly nrow(x) rows
          cbind(Code = i$Code[1L], x)
        }))
# output
# Code Airline SharedWith
# 1.1 1 AF KL
# 1.2 1 KL AF
# 8.1 8 AR AZ,DL
# 8.2 8 AZ AR,DL
# 8.3 8 DL AR,AZ
答案 5 :(得分:4)
split 会有帮助。编辑:下面是一个完全可重现的示例,无需任何额外的包。它使用提问者(OP)的 data.frame——在 OP 添加了可重现的数据集之后,我做了相应修改。
# strip white space in Airline names (gsub() also returns character, not factor):
dat$Airline <- gsub(" ", "", dat$Airline)
li <- split(dat, factor(dat$Code))
# One row per airline: every airline of a code is listed together with the
# other airlines using that code, not only the first airline of each code.
do.call("rbind", lapply(li, function(x) {
  data.frame(
    Airline = x$Airline,
    SharedWith = vapply(
      x$Airline,
      # all airlines of this code except the current one
      function(a) paste(x$Airline[x$Airline != a], collapse = ","),
      character(1),
      USE.NAMES = FALSE
    )
  )
}))
答案 6 :(得分:2)
你可以用 dplyr 来实现:
library(dplyr)
# For every airline, list the other airlines (sorted) that use the same code.
# The data frame and grouping column are the ones defined in the question
# (dat / Code).
dat %>%
  # compare plain, whitespace-free strings rather than factor levels
  mutate(Airline = trimws(as.character(Airline))) %>%
  group_by(Code) %>%
  mutate(SharedWith = vapply(
    Airline,
    # leave the airline itself out of its own SharedWith list
    function(a) paste(sort(Airline[Airline != a]), collapse = ", "),
    character(1),
    USE.NAMES = FALSE
  )) %>%
  ungroup() %>%
  select(Airline, SharedWith)
答案 7 :(得分:1)
以下内容本应作为评论,但我把它作为答案发布,因为这样更便于排版。
for each code
lookup all rows in the table where the value = code
嗯……对不起,我看不出这段伪代码(pseudocode)与你想要的输出有什么关系
+--------------------+
| Airline SharedWith |
+--------------------+
| AF "KL" |
| KL "AF" |
| AR "AZ","DL" |
+--------------------+
这个伪代码的结果应该是:
+---------------------+
+ Code + Airlines +
+---------------------+
+ 1 + AF, KL +
+ 8 + AR, AZ, DL +
+---------------------+
即,
# One row per code, with all airlines using that code joined by commas.
codes <- unique(dat$Code)
data.frame(
  Code = codes,
  # vapply() always yields a character vector (sapply() returns a list when
  # there are no codes); plain indexing replaces the non-standard evaluation of
  # subset(), which is meant for interactive use
  Airlines = vapply(
    codes,
    function(x) paste(dat$Airline[dat$Code %in% x], collapse = ","),
    character(1)
  )
)
答案 8 :(得分:0)
您可以使用 tidyr 的 nest 快速完成此操作(不过,除非先把 Airline 从因子转换为字符型,否则就没那么快了),然后再用 merge 合并:
library(tidyr)
# Work with plain character labels instead of a factor.
dat$Airline <- as.character(dat$Airline)
# Nest every column except Code into a column named SharedWith and merge the
# nested frame back onto dat by Code.
# NOTE(review): nest(-Code, .key = ) is the pre-1.0 tidyr interface, and %>% is
# used with only tidyr attached — confirm both work with the installed version.
new_dat <- merge(dat, dat %>% nest(-Code, .key= SharedWith), by="Code")
和
> new_dat
Code Airline SharedWith
1 1 AF AF, KL
2 1 KL AF, KL
3 8 AR AR, AZ, DL
4 8 AZ AR, AZ, DL
5 8 DL AR, AZ, DL
与其他一些解决方案相比,此方案的优势在于:SharedWith 成为由 data.frame 组成的列表列,而不是一个字符串
> str(new_dat$SharedWith)
List of 5
$ :'data.frame': 2 obs. of 1 variable:
..$ Airline: chr [1:2] "AF" "KL"
$ :'data.frame': 2 obs. of 1 variable:
..$ Airline: chr [1:2] "AF" "KL"
$ :'data.frame': 3 obs. of 1 variable:
..$ Airline: chr [1:3] "AR" "AZ" "DL"
$ :'data.frame': 3 obs. of 1 variable:
..$ Airline: chr [1:3] "AR" "AZ" "DL"
$ :'data.frame': 3 obs. of 1 variable:
..$ Airline: chr [1:3] "AR" "AZ" "DL"
因此您可以轻松地(尽管不太优雅)索引共享值的向量,例如:
> new_dat$SharedWith[[1]]$Airline
[1] "AF" "KL"
而不是必须使用strsplit
或类似的