在R中,我有多个非常大的(大约140e6)IP地址列表。多个列表之间存在许多重叠IP。我想创建一个数据框或数据表,其中包含作为rowname的ip地址(没有重复项),列表名称作为列,0或1表示该列表中是否存在ip。
例如,我们有以下两个列表,两者之间有一些%的交集。
a <- c("192.168.0.1","192.168.0.2","192.168.0.3","192.168.0.4","192.168.0.5","192.168.0.6","192.168.0.7","192.168.0.8","192.168.0.9","192.168.0.10")
b <- c("192.168.1.1","192.168.1.2","192.168.1.3","192.168.1.4","192.168.0.5","192.168.0.6","192.168.0.7","192.168.0.8","192.168.0.9","192.168.0.10")
我想要的是:
a b
192.168.0.1 1 0
192.168.0.2 1 0
192.168.0.3 1 0
192.168.0.4 1 0
192.168.0.5 1 1
192.168.0.6 1 1
192.168.0.7 1 1
192.168.0.8 1 1
192.168.0.9 1 1
192.168.0.10 1 1
192.168.1.1 0 1
192.168.1.2 0 1
192.168.1.3 0 1
192.168.1.4 0 1
我尝试过使用reshape2,tidyr,model.matrix,intersect和good ol'for循环。我发现了一些人们从数据框创建虚拟矩阵的例子,但没有将矢量名称作为列,值不作为rowname,而不是重复项。
答案 0 :(得分:2)
首先,我将介绍2个新解决方案
合并解决方案
df1 <- merge(data.frame(ip=a,a=1), data.frame(ip=b,b=1),all=TRUE) %>%
set_rownames(.,`[`(.,,'ip')) %>% select(-ip) %>% replace(.,is.na(.),0)
# a b
# 192.168.0.1 1 0
# 192.168.0.10 1 1
# 192.168.0.2 1 0
# 192.168.0.3 1 0
# 192.168.0.4 1 0
# 192.168.0.5 1 1
# 192.168.0.6 1 1
# 192.168.0.7 1 1
# 192.168.0.8 1 1
# 192.168.0.9 1 1
# 192.168.1.1 0 1
# 192.168.1.2 0 1
# 192.168.1.3 0 1
# 192.168.1.4 0 1
这里也是重塑的解决方案
关于这个问题的一个很酷的事情是当你有两个以上的源向量时它会起作用:
df2 <- list(data.frame(a),data.frame(b)) %>%
lapply(. %>% transform(source = names(.)) %>% rename_("ip" = names(.)[1])) %>%
do.call(rbind,.) %>%
transform(v=1) %>%
reshape(idvar="ip",timevar="source",direction="wide",sep="") %>%
replace(.,is.na(.),0) %>%
setNames(gsub("v","",colnames(.))) %>%
set_rownames(.,`[`(.,,'ip')) %>% select(-ip)
# a b
# 192.168.0.1 1 0
# 192.168.0.2 1 0
# 192.168.0.3 1 0
# 192.168.0.4 1 0
# 192.168.0.5 1 1
# 192.168.0.6 1 1
# 192.168.0.7 1 1
# 192.168.0.8 1 1
# 192.168.0.9 1 1
# 192.168.0.10 1 1
# 192.168.1.1 0 1
# 192.168.1.2 0 1
# 192.168.1.3 0 1
# 192.168.1.4 0 1
2个载体的所有解决方案的基准
让我们对迄今为止提供的解决方案进行基准测试。我使用data.table
添加了我的第一个解决方案的变体,并使用dcast
中的reshape2
和来自spread
的{{1}}
tidyR
对于给定的例子:
microbenchmark(
merge = merge(data.frame(ip=a,a=1), data.frame(ip=b,b=1),all=TRUE) %>%
set_rownames(.,`[`(.,,'ip')) %>% select(-ip) %>% replace(.,is.na(.),0),
merge_dt = merge(data.table(ip=a,a=1,key="ip"), data.table(ip=b,b=1,key="ip"),all=TRUE) %>%
as.data.frame %>% # to go back to desired output format
set_rownames(.,`[`(.,,'ip')) %>% select(-ip) %>% replace(.,is.na(.),0),
dcast = list(data.frame(a),data.frame(b)) %>%
lapply(. %>% transform(source = names(.)) %>% rename_("ip" = names(.)[1])) %>%
do.call(rbind,.) %>%
transform(v=1) %>%
dcast(ip ~ source,value.var="v") %>%
replace(.,is.na(.),0) %>%
setNames(gsub("v","",colnames(.))) %>%
set_rownames(.,`[`(.,,'ip')) %>% select(-ip),
spread = list(data.frame(a),data.frame(b)) %>%
lapply(. %>% transform(source = names(.)) %>% rename_("ip" = names(.)[1])) %>%
do.call(rbind,.) %>%
transform(v=1) %>%
spread(source,v) %>%
replace(.,is.na(.),0) %>%
setNames(gsub("v","",colnames(.))) %>%
set_rownames(.,`[`(.,,'ip')) %>% select(-ip),
reshape = list(data.frame(a),data.frame(b)) %>%
lapply(. %>% transform(source = names(.)) %>% rename_("ip" = names(.)[1])) %>%
do.call(rbind,.) %>%
transform(v=1) %>%
reshape(idvar="ip",timevar="source",direction="wide",sep="") %>%
replace(.,is.na(.),0) %>%
setNames(gsub("v","",colnames(.))) %>%
set_rownames(.,`[`(.,,'ip')) %>% select(-ip),
akrun = {lvl <- unique(c(a,b));mapply(table, list(a = factor(a, levels = lvl),b = factor(b, levels = lvl)))},
p_routh = {df <- data.frame("IP" = unique(c(a,b)));df2 <- df%>%mutate(a = ifelse(df$IP %in% a,1,0),b = ifelse(df$IP %in% b,1,0))},
d.b = {ALL <- unique(c(a,b));data.frame(sapply(list(a = a, b = b), function(x) as.numeric(ALL %in% x)), row.names = ALL)},
times = 100
)
更大的例子: 140E6有点基准,所以我尝试使用1E5。我随意选择a和b之间约50%的重叠。
# Unit: microseconds
# expr min lq mean median uq max neval
# merge 2368.754 2670.8205 3866.2288 2942.6280 3685.1415 38459.947 100
# merge_dt 4220.084 4702.4700 5547.1978 5222.3705 6239.1685 9170.293 100
# dcast 6153.875 6870.3760 9031.8770 7521.7570 8793.9045 46529.917 100
# spread 4329.090 4814.6610 6023.5993 5313.3275 6301.9890 38972.416 100
# reshape 4376.514 5007.1905 5995.1480 5694.1395 6811.4495 8744.180 100
# akrun 238.893 304.3680 366.0376 327.7265 416.3815 654.744 100
# p_routh 1013.967 1190.9255 1418.8037 1296.7450 1651.7220 2162.775 100
# d.b 133.072 183.8595 228.7220 207.0415 278.1780 417.974 100
我运行基准10次
n <- 1E5
set.seed(1)
a <- sample(2*n,n)
b <- sample(2*n,n)
我们看到P Routh的解决方案对于2个向量来说是最快的,# Unit: milliseconds
# expr min lq mean median uq max neval
# merge 582.41885 617.4348 676.40615 651.84618 698.1091 911.8320 10
# merge_dt 98.72318 100.6648 114.72754 103.57925 119.9722 176.5360 10
# dcast 267.51729 347.8337 366.85554 360.17472 411.5002 454.1912 10
# spread 425.26005 447.7959 471.03577 477.02525 490.0484 502.8333 10
# reshape 697.14005 738.6921 763.31876 751.01547 791.3207 818.0778 10
# akrun 791.00964 815.5621 838.08296 832.31382 849.5231 923.6849 10
# p_routh 78.77724 82.8646 98.38296 84.34238 101.7304 151.0339 10
# d.b 191.00546 194.5754 209.02133 200.35484 207.1666 279.7900 10
是最快的通用解决方案。 dcast
merge
对于140E6行可能是最快的。
一般解决方案
Hopefulle最终编辑:
我根据最受限制的解决方案设计了2个通用解决方案,并在3个10E6大小的矢量上运行它们。
data.table
<{> merge_dt_gen <- function(...){
args <- as.character(substitute(list(...)))[-1]
dts <- args %>% lapply(.%>% data.table(ip=get(.),key="ip"))
all_ips <- data.table(ip = unique(c(...)),key="ip") # all_ips <- data.table(ip = unique(c(a,b)))
for(dt in dts){
all_ips <- merge(all_ips,dt,all.x = TRUE,by="ip")
}
all_ips %>%
as.data.frame %>%
set_rownames(.,`[`(.,,'ip')) %>%
select(-ip) %>%
setNames(args) %>%
replace(.,!is.na(.),1) %>%
replace(.,is.na(.),0)
}
d_cast_gen <- function(...){
args <- as.character(substitute(list(...)))[-1]
args %>%
lapply(.%>% data.frame(get(.)) %>% setNames(c("src","ip"))) %>%
do.call(rbind,.) %>%
transform(v=1) %>%
dcast(ip ~ src,value.var="v") %>%
replace(.,is.na(.),0) %>%
setNames(gsub("v","",colnames(.))) %>%
set_rownames(.,`[`(.,,'ip')) %>% select(-ip)
}
n <- 10E6
set.seed(1)
a <- sample(2*n,n)
b <- sample(2*n,n)
d <- sample(unique(a,b),n)
microbenchmark(
d_cast_gen = d_cast_gen(a,b,d),
merge_dt_gen = merge_dt_gen(a,b,d),
times = 1
)
# Unit: seconds
# expr min lq mean median uq max neval
# d_cast_gen 70.99771 70.99771 70.99771 70.99771 70.99771 70.99771 1
# merge_dt_gen 47.41809 47.41809 47.41809 47.41809 47.41809 47.41809 1
merge
是最快的
答案 1 :(得分:0)
dplyr解决方案:
df <- data.frame("IP" = unique(c(a,b)))
df2 <- df%>%mutate(a = ifelse(df$IP %in% a,1,0),b = ifelse(df$IP %in% b,1,0))
输出:
> df2
IP a b
1 192.168.0.1 1 0
2 192.168.0.2 1 0
3 192.168.0.3 1 0
4 192.168.0.4 1 0
5 192.168.0.5 1 1
6 192.168.0.6 1 1
7 192.168.0.7 1 1
8 192.168.0.8 1 1
9 192.168.0.9 1 1
10 192.168.0.10 1 1
11 192.168.1.1 0 1
12 192.168.1.2 0 1
13 192.168.1.3 0 1
14 192.168.1.4 0 1
答案 2 :(得分:0)
我们可以将&#39; a&#39;&#39; b&#39;转换为factor
,同时将levels
指定为unique
个元素&#39; a&#39;,&#39; b&#39;并获得频率
lvl <- unique(c(a,b))
mapply(table, list(a = factor(a, levels = lvl),b = factor(b, levels = lvl)))
# a b
#192.168.0.1 1 0
#192.168.0.2 1 0
#192.168.0.3 1 0
#192.168.0.4 1 0
#192.168.0.5 1 1
#192.168.0.6 1 1
#192.168.0.7 1 1
#192.168.0.8 1 1
#192.168.0.9 1 1
#192.168.0.10 1 1
#192.168.1.1 0 1
#192.168.1.2 0 1
#192.168.1.3 0 1
#192.168.1.4 0 1