我有一个名为df的数据框(df中没有重复的行):
a_id b_id
111111 18
111111 17
222222 18
333333 14
444444 13
555555 18
555555 24
222222 13
222222 17
333333 17
我希望将其转换为如下所示的df_2:
a_one a_two b_list number_of_b
222222 444444 13 1
111111 222222 17,18 2
111111 333333 17 1
111111 222222 17 1
222222 333333 17 1
111111 555555 18 1
222222 555555 18 1
如果两个a_id共享相同的b_id,则它们在df_2中成为一对;
df_2的b_list是这一对a_id共有的b_id;
number_of_b是b_list的长度。
我有一段Python代码:
import pandas as pd
from itertools import combinations
# Build one row per (a_one, a_two, b_id): every pair of a_ids that share a b_id.
# The a_ids are sorted before pairing so that a pair always has the same
# orientation (smaller id first) no matter how the input rows are ordered;
# without this, (x, y) and (y, x) would be grouped as two different pairs.
pairs = pd.DataFrame(
    [
        (a_one, a_two, b_id)
        for b_id, a_ids in df.groupby("b_id")["a_id"]
        for a_one, a_two in combinations(sorted(a_ids), 2)
    ],
    columns=["a_one", "a_two", "b_id"],
)
# Collect the b_ids shared by each pair into an array, one row per pair.
df = pairs.groupby(["a_one", "a_two"])["b_id"].apply(lambda x: x.values).reset_index()
df.columns = ["a_one", "a_two", "b_list"]
# number_of_b is the number of b_ids the pair has in common.
df["number_of_b"] = df.b_list.apply(len)
有人可以帮我在R中实现它吗?
答案 0 :(得分:0)
我的方法有点长,但会给你想要的结果。 这是我的方法:
library(data.table)
# Sample input: one row per (a_id, b_id) combination.
mydf <- data.table(
  a_id = c(111111L, 111111L, 222222L, 333333L, 444444L,
           555555L, 555555L, 222222L, 222222L, 333333L),
  b_id = c(18L, 17L, 18L, 14L, 13L, 18L, 24L, 13L, 17L, 17L)
)
# Self-join on b_id: every a_id is matched with every a_id that has the same
# b_id (so each pair shows up in both orientations), then rows that pair an
# a_id with itself are dropped.
mydf <- mydf[
  mydf,
  .(a_id, a_id2 = i.a_id, b_id),
  on = "b_id",
  allow.cartesian = TRUE
][a_id != a_id2]
# Find duplicates
#' Locate each element of `string` inside `values`, zeroing early matches
#'
#' @param string Vector of keys to look up.
#' @param values Vector searched for each key (first match wins, as in
#'   `match()`).
#' @param current_index Threshold(s) compared element-wise with the matched
#'   positions.
#' @return Numeric vector the same length as `string`: the matched position,
#'   `0` where that position is smaller than `current_index`, and `NA` where
#'   the key does not occur in `values`.
get_index <- function(string, values, current_index) {
  positions <- match(string, values)
  # Unmatched keys (NA) are left untouched; only real matches that fall
  # before the threshold are reset to 0.
  before_current <- !is.na(positions) & positions < current_index
  positions[before_current] <- 0
  positions
}
# Key every row by its (a_id, a_id2, b_id) triple and by the mirrored triple.
mydf[, first := paste0(a_id, ", ", a_id2, ", ", b_id)]
mydf[, reverse := paste0(a_id2, ", ", a_id, ", ", b_id)]
# For each row, find where its key appears among the mirrored keys;
# get_index() yields 0 when that position is before the row's own index.
mydf[, duplicate_index := get_index(first, reverse, .I)]
# Keep the rows flagged 0 and, per pair, list the shared b_ids and count them.
mydf[
  duplicate_index == 0,
  .(b_list = list(b_id), number_of_b = .N),
  by = .(a_id, a_id2)
]
# a_id a_id2 b_list number_of_b
# 1: 111111 222222 18,17 2
# 2: 111111 555555 18 1
# 3: 222222 555555 18 1
# 4: 444444 222222 13 1
# 5: 111111 333333 17 1
# 6: 222222 333333 17 1
答案 1 :(得分:0)
使用base-R ......
# Gather the sorted a_ids belonging to each b_id.
df2 <- tapply(df$a_id, df$b_id, sort)
# A b_id with a single a_id cannot form a pair, so drop it.
# vapply() pins the result to one logical per group, unlike sapply().
df2 <- df2[vapply(df2, function(x) length(x) > 1, logical(1))]
# Build every a_id pair within a b_id as an "a_one a_two" key; stack() then
# gives one row per (pair key in `values`, b_id in `ind`).
df2 <- stack(lapply(df2, function(x) combn(x, 2, FUN = paste, collapse = " ")))
# Gather the b_ids of each pair into a single comma-separated string.
df2 <- as.data.frame(tapply(df2$ind, df2$values, paste, collapse = ","))
names(df2) <- "b_list"
# The pair keys are now the row names; split them back into two columns.
df2[, c("a_one", "a_two")] <- do.call(rbind, strsplit(rownames(df2), " "))
# Number of b_ids per pair = number of comma-separated entries in b_list.
df2$number_of_b <- lengths(strsplit(df2$b_list, ",", fixed = TRUE))
rownames(df2) <- NULL # remove row names
df2 <- df2[, c(2, 3, 1, 4)] # reorder columns: a_one, a_two, b_list, number_of_b
df2
a_one a_two b_list number_of_b
1 111111 222222 17,18 2
2 111111 333333 17 1
3 111111 555555 18 1
4 222222 333333 17 1
5 222222 444444 13 1
6 222222 555555 18 1
答案 2 :(得分:0)
合并到自身,对列进行排序并删除重复项,然后总结:
library(dplyr)
# Self-merge on b_id to pair every a_id with every a_id sharing that b_id,
# order each pair as (smaller, larger) so both orientations collapse into one,
# drop self-pairs and repeated rows, then summarise the shared b_ids per pair.
merge(df1, df1, by = "b_id") %>%
  transmute(
    a_one = pmin(a_id.x, a_id.y),
    a_two = pmax(a_id.x, a_id.y),
    b_id
  ) %>%
  filter(a_one != a_two) %>%
  distinct() %>%
  group_by(a_one, a_two) %>%
  summarise(
    b_list = paste(b_id, collapse = ","),
    number_of_b = n()
  ) %>%
  # summarise() only peels off the last grouping level; ungroup so the result
  # is a plain tibble rather than one still grouped by a_one.
  ungroup()
# # A tibble: 6 x 4
#    a_one  a_two b_list number_of_b
#    <int>  <int> <chr>        <int>
# 1 111111 222222 17,18            2
# 2 111111 333333 17               1
# 3 111111 555555 18               1
# 4 222222 333333 17               1
# 5 222222 444444 13               1
# 6 222222 555555 18               1