当前,我有两个数据帧。第一个df1包含两列,代表网络联系。另一个df2仅包含一列,列出了我拥有属性数据的所有案例。
我想在df1中查找df2里列出的那些案例:如果某一行的两列中有一列或两列的ID不在df2中,我就想删除该行。这样,我最终得到的df1只包含我拥有属性数据的案例之间的网络联系。
df1具有大约240万个关系(边缘列表),而df2具有3.4万个单独案例。
这是我在谷歌搜索一段时间后尝试执行的操作:
首先,我在df1上复制两列以进行备份。
# Load dplyr for mutate() and the %>% pipe.
library(dplyr)

# Replace each id column with a 0/1 flag: 1 if the id appears in df2$V1,
# 0 otherwise (multiplying the logical by 1.0 converts it to numeric).
# NOTE: this overwrites the original ids in df3; according to the question
# text, backup copies of the two columns were made beforehand.
df3 <- df1 %>%
  mutate(
    friendid = (friendid %in% df2$V1) * 1.0,
    tieid = (tieid %in% df2$V1) * 1.0
  )

# Drop every row where either flag is 0, i.e. keep only the ties whose two
# ids are both present in df2. The flag columns are addressed by name rather
# than by position (2 and 3) so the filter cannot silently test the wrong
# columns, and the comparison is vectorised instead of going through apply().
df3 <- df3[df3$friendid != 0 & df3$tieid != 0, ]
该过程确实删除了一大批行,但最终只剩下大约2k个案例,这似乎不对。我也尝试过在Excel中用类似的方法处理,但Excel对一次可以加载的行数有限制。把数据集拆成三个文件并使用Kutools之后,我最终得到了约74k个案例。但是由于其中有很多手工操作,我几乎可以肯定Excel的结果中有错误。R允许我一次加载所有数据,这将有助于获得更可靠的结果。
任何帮助将不胜感激。谢谢
已编辑以提供更多信息:
> head(df1)
ID steamid friendid daysknown years el1 el2
1 NA 7.65612e+16 7.65612e+16 2156 5.902806 7.65612e+16 7.65612e+16
2 NA 7.65612e+16 7.65612e+16 3480 9.527721 7.65612e+16 7.65612e+16
3 NA 7.65612e+16 7.65612e+16 1588 4.347707 7.65612e+16 7.65612e+16
4 NA 7.65612e+16 7.65612e+16 501 1.371663 7.65612e+16 7.65612e+16
5 NA 7.65612e+16 7.65612e+16 858 2.349076 7.65612e+16 7.65612e+16
6 NA 7.65612e+16 7.65612e+16 686 1.878166 7.65612e+16 7.65612e+16
> head(df2)
V1
1 76561197960265800
2 76561197960266000
3 76561197960266100
4 76561197960267800
5 76561197960268100
6 76561197960268400
df1中的steamid和friendid两列都必须是df2$V1中存在的ID。如果一对ID中只有一个存在,或者两个都不存在,则该行必须删除。最终的df只包含两个ID都能在df2中找到的ID对。
答案 0 :(得分:0)
您可以执行以下操作:
# Flag every id that has attribute data. After a left join, a row whose id
# is not in df2 carries NA in the flag column.
df2$flag <- 1 # create a lookup column

# Left-join the lookup onto the first id column, then rename ONLY the new
# flag column. (Assigning a 3-element vector to names() would clobber the
# names of every other column carried over from df1.)
# Note: merge() sorts the result by the key column and multiplies rows if
# df2$V1 contains duplicate ids.
df_temp <- merge(df1, df2, by.x = "friendid", by.y = "V1", all.x = TRUE)
names(df_temp)[names(df_temp) == "flag"] <- "flag_1"

# Same for the second id column; the rename has to target df_new (not
# df_temp), otherwise flag_2 never exists in the frame that gets filtered.
df_new <- merge(df_temp, df2, by.x = "tieid", by.y = "V1", all.x = TRUE)
names(df_new)[names(df_new) == "flag"] <- "flag_2"

# Keep a tie only when BOTH ids were found in df2, as the question requires.
# Unmatched ids have NA flags, so test with is.na() rather than `== 1`.
df_final <- df_new[!is.na(df_new$flag_1) & !is.na(df_new$flag_2), ]
首先,将df1与df2按friendid进行匹配;然后,将得到的新数据帧再与df2按tieid进行匹配;最后,对新数据帧取子集,只保留标志为1的行。
答案 1 :(得分:0)
您好Juan Juan Arroyo Flores,欢迎来到stackoverflow。
我不确定我是否正确,但是我认为您可以使用%in%运算符来解决此问题。
`df$variable1 %in% df2$variable` 会逐个检查 df$variable1 中的每个元素是否存在于 df2$variable 中。
# Toy data: df1 holds the edge list (two id columns), df2 the ids to look up.
df1 <- data.frame(
  "name1" = c("a", "b", "c", "d"),
  "name2" = c("f", "g", "h", "i"),
  stringsAsFactors = FALSE
)
df2 <- data.frame("names" = c("a", "g", "i"), stringsAsFactors = FALSE)
df1
df2
# > df1
#   name1 name2
# 1     a     f
# 2     b     g
# 3     c     h
# 4     d     i
# > df2
#   names
# 1     a
# 2     g
# 3     i

# So we want row 1 selected (because of a), row 2 (because of g) and
# row 4 (because of i); row 3 gets deleted.
# NOTE: this demo keeps a row when EITHER id is found. If both ids of a row
# must be present in df2, replace `|` with `&` in the expressions below.

# Let's use %in%: it returns one TRUE/FALSE per element of the left operand.
df1$name1 %in% df2$names
# > df1$name1 %in% df2$names
# [1]  TRUE FALSE FALSE FALSE
df1$name2 %in% df2$names
# > df1$name2 %in% df2$names
# [1] FALSE  TRUE FALSE  TRUE

# To combine both, an element-wise OR is needed.
df1$name1 %in% df2$names | df1$name2 %in% df2$names
# > df1$name1 %in% df2$names | df1$name2 %in% df2$names
# [1]  TRUE  TRUE FALSE  TRUE

# which() turns the logical mask into row indices.
select_index <- which(df1$name1 %in% df2$names | df1$name2 %in% df2$names)
select_index
# > select_index
# [1] 1 2 4

# These indices can now be used to select the desired rows.
df1[select_index, ]
# > df1[select_index, ]
#   name1 name2
# 1     a     f
# 2     b     g
# 4     d     i

# You could just as well index with the logical mask directly.
df1[df1$name1 %in% df2$names | df1$name2 %in% df2$names, ]
# > df1[df1$name1 %in% df2$names | df1$name2 %in% df2$names, ]
#   name1 name2
# 1     a     f
# 2     b     g
# 4     d     i
# Or with dplyr. The call is namespace-qualified so that stats::filter() is
# not picked up when dplyr has not been attached with library(dplyr).
dplyr::filter(df1, name1 %in% df2$names | name2 %in% df2$names)
# > dplyr::filter(df1, name1 %in% df2$names | name2 %in% df2$names)
#   name1 name2
# 1     a     f
# 2     b     g
# 3     d     i
# (the rows are renumbered 1..3 in this output, unlike the base-R subset)
不确定这是您要寻找的吗?
答案 2 :(得分:0)
这是我最终得到的方案,但我不确定它是否正确。在SmitM和TinglTanglBob的代码的帮助下,我想到了:
# Look up every id of the steamid column in df2$V1 and store the result as a
# logical column tf1 (TRUE = found); tf2 does the same for the friendid column.
# NOTE(review): head(df1) prints the ids as 7.65612e+16. 17-digit ids are
# larger than 2^53, so they cannot all be stored exactly as doubles and
# different ids may compare equal or fail to match. Consider reading both
# files with the id columns as character before matching -- TODO confirm how
# the data was loaded.
df1$tf1 <- df1$steamid %in% df2$V1
df1$tf2 <- df1$friendid %in% df2$V1

# Keep only the ties whose two ids were both found in df2. The logical
# columns are used directly instead of being compared with the string 'TRUE'.
df3 <- df1[df1$tf1 & df1$tf2, ]

# The original second subset on tf2 could not remove anything further because
# df3 already requires tf2 to be TRUE; df4 is kept as an alias of df3.
df4 <- df3
可悲的是,我结束时的数据比我想象的要少得多。至少如果我做对了。