我正在分析存储疾病模拟模型输出数据的大表(300 000 - 500 000行)。在模型中,景观上的动物感染其他动物。例如,在下图所示的例子中,动物 a1 感染景观中的每一只动物,感染从动物移动到动物,分支到链条和#34;感染。
在下面的示例中,我想采取存储有关每个动物的信息的表格(在我的示例中,表格为allanimals
),并切出有关动物的信息 d2
的感染链(我已经突出显示 d2
的绿色链条)所以我可以计算该感染链的平均栖息地价值。
虽然我的while循环工作,但是当表存储数十万行时,它像糖蜜一样慢,而且链有40-100个成员。
关于如何提高速度的任何想法?希望得到tidyverse
解决方案。我知道它看起来足够快"使用我的示例数据集,但我的数据确实很慢......
示意图:
以下样本数据的预期输出:
AnimalID InfectingAnimal habitat
1 d2 d1 1
2 d1 c3 1
3 c3 c2 3
4 c2 c1 2
5 c1 b3 3
6 b3 b2 6
7 b2 b1 5
8 b1 a2 4
9 a2 a1 2
10 a1 x 1
示例代码:
library(tidyverse)
# make some data
allanimals <- structure(list(AnimalID = c("a1", "a2", "a3", "a4", "a5", "a6", "a7", "a8",
"b1", "b2", "b3", "b4", "b5", "c1", "c2", "c3", "c4", "d1", "d2", "e1", "e2",
"e3", "e4", "e5", "e6", "f1", "f2", "f3", "f4", "f5", "f6", "f7"),
InfectingAnimal = c("x", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "a2", "b1",
"b2", "b3", "b4", "b3", "c1", "c2", "c3", "c3", "d1", "b1", "e1", "e2", "e3",
"e4", "e5", "e1", "f1", "f2", "f3", "f4", "f5", "f6"), habitat = c(1L, 2L, 1L,
2L, 2L, 1L, 3L, 2L, 4L, 5L, 6L, 1L, 2L, 3L, 2L, 3L, 2L, 1L, 1L, 2L, 5L, 4L,
1L, 1L, 1L, 1L, 4L, 5L, 4L, 5L, 4L, 3L)), .Names = c("AnimalID",
"InfectingAnimal", "habitat"), class = "data.frame", row.names = c(NA, -32L))
# check it out
head(allanimals)
# Start with animal I'm interested in - say, d2
Focal.Animal <- "d2"
# Make a 1-row data.frame with d2's information
Focal.Animal <- allanimals %>%
filter(AnimalID == Focal.Animal)
# This is the animal we start with
Focal.Animal
# Make a new data.frame to store our results of the while loop in
Chain <- Focal.Animal
# make a condition to help while loop
InfectingAnimalInTable <- TRUE
# time it
ptm <- proc.time()
# Run loop until you find an animal that isn't in the table, then stop
while(InfectingAnimalInTable == TRUE){
# Who is the next infecting animal?
NextAnimal <- Chain %>%
slice(n()) %>%
select(InfectingAnimal) %>%
unlist()
NextRow <- allanimals %>%
filter(AnimalID == NextAnimal)
# If there is an infecting animal in the table,
if (nrow(NextRow) > 0) {
# Add this to the Chain table
Chain[(nrow(Chain)+1),] <- NextRow
#Otherwise, if there is no infecting animal in the table,
# define the Infecting animal follows, this will stop the loop.
} else {InfectingAnimalInTable <- FALSE}
}
proc.time() - ptm
# did it work? Check out the Chain data.frame
Chain
答案 0 :(得分:2)
所以问题在于您的数据结构。您将需要一个存储感染者的向量(将who保持为整数):
allanimals_ID <- unique(c(allanimals$AnimalID, allanimals$InfectingAnimal))
infected <- rep(NA_integer_, length(allanimals_ID))
infected[match(allanimals$AnimalID, allanimals_ID)] <-
match(allanimals$InfectingAnimal, allanimals_ID)
path <- rep(NA_integer_, length(allanimals_ID))
curOne <- match("d2", allanimals_ID)
i <- 1
while (!is.na(nextOne <- infected[curOne])) {
path[i] <- curOne
i <- i + 1
curOne <- nextOne
}
allanimals[path[seq_len(i - 1)], ]
为了获得额外的性能提升,请使用Rcpp重新编码此循环:&#39;)
答案 1 :(得分:2)
您可以编写执行此操作的函数:
path= function(animals,dat){
.path=function(x,d=""){
k=match(x,dat[,1])
d = paste(d,do.call(paste,dat[k,]),sep="\n ")
ifelse(is.na(k),d,.path(dat[k,2],d))}
n = .path(animals)
regmatches(n,gregexpr("(?<=\\n)",n,perl = T)) = animals
tab = na.omit(read.table(text=n,col.names = c("grp",names(dat))))
split(tab[-1],tab$grp)# This is not necessary. You can decide to return the tab
}
path("d2",allanimals)
$`d2`
AnimalID InfectingAnimal habitat
1 d2 d1 1
2 d1 c3 1
3 c3 c2 3
4 c2 c1 2
5 c1 b3 3
6 b3 b2 6
7 b2 b1 5
8 b1 a2 4
9 a2 a1 2
10 a1 x 1
此功能还可以在4毫秒内为所有其他动物提供路径:
allanimals_ID <- unique(c(allanimals$AnimalID, allanimals$InfectingAnimal)
path(allanimals_ID,allanimals)
$`a1`
AnimalID InfectingAnimal habitat
1 a1 x 1
$a2
AnimalID InfectingAnimal habitat
3 a2 a1 2
4 a1 x 1
$a3
AnimalID InfectingAnimal habitat
6 a3 a2 1
7 a2 a1 2
8 a1 x 1
$a4
AnimalID InfectingAnimal habitat
10 a4 a3 2
11 a3 a2 1
12 a2 a1 2
13 a1 x 1
$a5
AnimalID InfectingAnimal habitat
15 a5 a4 2
16 a4 a3 2
17 a3 a2 1
18 a2 a1 2
19 a1 x 1
$a6
AnimalID InfectingAnimal habitat
21 a6 a5 1
22 a5 a4 2
23 a4 a3 2
24 a3 a2 1
25 a2 a1 2
26 a1 x 1
$a7
AnimalID InfectingAnimal habitat
28 a7 a6 3
29 a6 a5 1
30 a5 a4 2
31 a4 a3 2
32 a3 a2 1
33 a2 a1 2
34 a1 x 1
$a8
AnimalID InfectingAnimal habitat
36 a8 a7 2
37 a7 a6 3
38 a6 a5 1
39 a5 a4 2
40 a4 a3 2
41 a3 a2 1
42 a2 a1 2
43 a1 x 1
$b1
AnimalID InfectingAnimal habitat
45 b1 a2 4
46 a2 a1 2
47 a1 x 1
$b2
AnimalID InfectingAnimal habitat
49 b2 b1 5
50 b1 a2 4
51 a2 a1 2
52 a1 x 1
$b3
AnimalID InfectingAnimal habitat
54 b3 b2 6
55 b2 b1 5
56 b1 a2 4
57 a2 a1 2
58 a1 x 1
$b4
AnimalID InfectingAnimal habitat
60 b4 b3 1
61 b3 b2 6
62 b2 b1 5
63 b1 a2 4
64 a2 a1 2
65 a1 x 1
$b5
AnimalID InfectingAnimal habitat
67 b5 b4 2
68 b4 b3 1
69 b3 b2 6
70 b2 b1 5
71 b1 a2 4
72 a2 a1 2
73 a1 x 1
$c1
AnimalID InfectingAnimal habitat
75 c1 b3 3
76 b3 b2 6
77 b2 b1 5
78 b1 a2 4
79 a2 a1 2
80 a1 x 1
$c2
AnimalID InfectingAnimal habitat
82 c2 c1 2
83 c1 b3 3
84 b3 b2 6
85 b2 b1 5
86 b1 a2 4
87 a2 a1 2
88 a1 x 1
$c3
AnimalID InfectingAnimal habitat
90 c3 c2 3
91 c2 c1 2
92 c1 b3 3
93 b3 b2 6
94 b2 b1 5
95 b1 a2 4
96 a2 a1 2
97 a1 x 1
$c4
AnimalID InfectingAnimal habitat
99 c4 c3 2
100 c3 c2 3
101 c2 c1 2
102 c1 b3 3
103 b3 b2 6
104 b2 b1 5
105 b1 a2 4
106 a2 a1 2
107 a1 x 1
$d1
AnimalID InfectingAnimal habitat
109 d1 c3 1
110 c3 c2 3
111 c2 c1 2
112 c1 b3 3
113 b3 b2 6
114 b2 b1 5
115 b1 a2 4
116 a2 a1 2
117 a1 x 1
$d2
AnimalID InfectingAnimal habitat
119 d2 d1 1
120 d1 c3 1
121 c3 c2 3
122 c2 c1 2
123 c1 b3 3
124 b3 b2 6
125 b2 b1 5
126 b1 a2 4
127 a2 a1 2
128 a1 x 1
$e1
AnimalID InfectingAnimal habitat
130 e1 b1 2
131 b1 a2 4
132 a2 a1 2
133 a1 x 1
$e2
AnimalID InfectingAnimal habitat
135 e2 e1 5
136 e1 b1 2
137 b1 a2 4
138 a2 a1 2
139 a1 x 1
$e3
AnimalID InfectingAnimal habitat
141 e3 e2 4
142 e2 e1 5
143 e1 b1 2
144 b1 a2 4
145 a2 a1 2
146 a1 x 1
$e4
AnimalID InfectingAnimal habitat
148 e4 e3 1
149 e3 e2 4
150 e2 e1 5
151 e1 b1 2
152 b1 a2 4
153 a2 a1 2
154 a1 x 1
$e5
AnimalID InfectingAnimal habitat
156 e5 e4 1
157 e4 e3 1
158 e3 e2 4
159 e2 e1 5
160 e1 b1 2
161 b1 a2 4
162 a2 a1 2
163 a1 x 1
$e6
AnimalID InfectingAnimal habitat
165 e6 e5 1
166 e5 e4 1
167 e4 e3 1
168 e3 e2 4
169 e2 e1 5
170 e1 b1 2
171 b1 a2 4
172 a2 a1 2
173 a1 x 1
$f1
AnimalID InfectingAnimal habitat
175 f1 e1 1
176 e1 b1 2
177 b1 a2 4
178 a2 a1 2
179 a1 x 1
$f2
AnimalID InfectingAnimal habitat
181 f2 f1 4
182 f1 e1 1
183 e1 b1 2
184 b1 a2 4
185 a2 a1 2
186 a1 x 1
$f3
AnimalID InfectingAnimal habitat
188 f3 f2 5
189 f2 f1 4
190 f1 e1 1
191 e1 b1 2
192 b1 a2 4
193 a2 a1 2
194 a1 x 1
$f4
AnimalID InfectingAnimal habitat
196 f4 f3 4
197 f3 f2 5
198 f2 f1 4
199 f1 e1 1
200 e1 b1 2
201 b1 a2 4
202 a2 a1 2
203 a1 x 1
$f5
AnimalID InfectingAnimal habitat
205 f5 f4 5
206 f4 f3 4
207 f3 f2 5
208 f2 f1 4
209 f1 e1 1
210 e1 b1 2
211 b1 a2 4
212 a2 a1 2
213 a1 x 1
$f6
AnimalID InfectingAnimal habitat
215 f6 f5 4
216 f5 f4 5
217 f4 f3 4
218 f3 f2 5
219 f2 f1 4
220 f1 e1 1
221 e1 b1 2
222 b1 a2 4
223 a2 a1 2
224 a1 x 1
$f7
AnimalID InfectingAnimal habitat
226 f7 f6 3
227 f6 f5 4
228 f5 f4 5
229 f4 f3 4
230 f3 f2 5
231 f2 f1 4
232 f1 e1 1
233 e1 b1 2
234 b1 a2 4
235 a2 a1 2
236 a1 x 1
$x
[1] AnimalID InfectingAnimal habitat
<0 rows> (or 0-length row.names)
在使用while
icrobenchmark m
while循环时将其与, this function is twice as fast as the
循环进行比较。
microbenchmark::microbenchmark(
path_= {path= function(animals,dat){
.path=function(x,d=""){
k=match(x,dat[,1])
d = paste(d,do.call(paste,dat[k,]),sep="\n ")
ifelse(is.na(k),d,.path(dat[k,2],d))}
n = .path(animals)
regmatches(n,gregexpr("(?<=\\n)",n,perl = T)) = animals
tab = na.omit(read.table(text=n,col.names = c("grp",names(dat))))
split(tab[-1],tab$grp)# This is not necessary. You can decide to return the tab
}
path("d2",allanimals)
},
answer_above= {allanimals_ID <- unique(c(allanimals$AnimalID, allanimals$InfectingAnimal))
infected <- rep(NA_integer_, length(allanimals_ID))
infected[match(allanimals$AnimalID, allanimals_ID)] <-
match(allanimals$InfectingAnimal, allanimals_ID)
path <- rep(NA_integer_, length(allanimals_ID))
curOne <- match("d2", allanimals_ID)
i <- 1
while (!is.na(nextOne <- infected[curOne])) {
path[i] <- curOne
i <- i + 1
curOne <- nextOne
}
allanimals[path[seq_len(i - 1)], ]}
)
Unit: milliseconds
expr min lq mean median uq max neval
path_ 1.347699 1.394348 1.606106 1.448677 1.526331 11.800467 100
answer_above 2.655575 2.734935 2.897814 2.800926 2.882846 6.433567 100