我想比较两个数据框。一个数据框有 40 万条观测值,另一个有 10 万条。我想把较短数据框中的每条观测值与较大数据框中对应的 4 条观测值逐一比较。换句话说,将 b(较短的数据框)的第 1 条观测值与 a(较大的数据框)的前 4 条观测值比较,将 b 的第 2 条观测值与 a 的第 5 至 8 条观测值比较,依此类推。我想统计匹配的次数。
# Count how many values in the first column of `a` equal the value in the
# first column of `b` that they are paired with. Row i of `b` is paired with
# rows (4 * i - 3):(4 * i) of `a`. The total is stored in `c`.
#
# The original loop could not run: `=` inside `if` is not a comparison, the
# condition was a length-4 vector, `c + 1` was never assigned back, and
# `for (x in b)` walks over the columns of a data frame, not its rows.
group_size <- 4
a_values <- a[[1]]
b_values <- b[[1]]

# The pairing is only well defined when `a` has exactly 4 rows per row of `b`.
stopifnot(length(a_values) == group_size * length(b_values))

# Repeat every value of `b` 4 times so it lines up element-wise with `a`, then
# count the equal pairs. Comparisons involving NA are treated as non-matches.
c <- sum(a_values == rep(b_values, each = group_size), na.rm = TRUE)
答案 0 :(得分:1)
我试图在下面解决你的问题,但这有点困难,因为你的问题有点模糊。请查看有关如何撰写好问题的指南。
我希望这段代码可以帮助您走上正确的轨道!
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
# create two data frames, with specified dimensions
# Build the larger example data frame: 400 draws (with replacement) from
# 1..100. The seed is fixed so the example is reproducible.
set.seed(123)
a_large_df <- data.frame(
  sample_a = sample(1:100, size = 400, replace = TRUE)
)
head(a_large_df)
#> sample_a
#> 1 29
#> 2 79
#> 3 41
#> 4 89
#> 5 95
#> 6 5
# Build the smaller example data frame: 100 draws (with replacement) from
# 1..100.
b_small_df <- data.frame(
  sample_b = sample(1:100, size = 100, replace = TRUE)
)
head(b_small_df)
#> sample_b
#> 1 99
#> 2 14
#> 3 91
#> 4 58
#> 5 40
#> 6 45
# Add a zero-based group index shared by every run of 4 consecutive rows
# (rows 1-4 -> 0, rows 5-8 -> 1, ...). seq_len() is used instead of
# seq(nrow(.)) so that a zero-row data frame yields an empty index rather
# than c(1, 0).
a_large_df <- a_large_df %>%
  mutate(group_of_4_index = (seq_len(nrow(a_large_df)) - 1) %/% 4)
head(a_large_df)
#> sample_a group_of_4_index
#> 1 29 0
#> 2 79 0
#> 3 41 0
#> 4 89 0
#> 5 95 1
#> 6 5 1
# Add a zero-based row index to the small data frame so that row i carries
# the same key as the i-th group of 4 rows in the large data frame.
# seq_len() is used instead of seq(nrow(.)) so that a zero-row data frame
# yields an empty index rather than c(1, 0).
b_small_df <- b_small_df %>%
  mutate(group_of_4_index = seq_len(nrow(b_small_df)) - 1)
head(b_small_df)
#> sample_b group_of_4_index
#> 1 99 0
#> 2 14 1
#> 3 91 2
#> 4 58 3
#> 5 40 4
#> 6 45 5
# Attach the paired sample_b value to every row of the large data frame; rows
# are matched through the shared group_of_4_index key.
a_b_df <- a_large_df %>%
  left_join(b_small_df, by = "group_of_4_index")
head(a_b_df)
#> sample_a group_of_4_index sample_b
#> 1 29 0 99
#> 2 79 0 99
#> 3 41 0 99
#> 4 89 0 99
#> 5 95 1 14
#> 6 5 1 14
# Flag each row "yes" when its sample_a value is found among the sample_b
# values of its group, otherwise "no". ungroup() drops the grouping afterwards
# so later operations on a_b_df are not silently performed per group.
a_b_df <- a_b_df %>%
  group_by(group_of_4_index) %>%
  mutate(match = if_else(sample_a %in% sample_b, "yes", "no")) %>%
  ungroup()
head(a_b_df)
#> # A tibble: 6 x 4
#> sample_a group_of_4_index sample_b match
#> <int> <dbl> <int> <chr>
#> 1 29 0 99 no
#> 2 79 0 99 no
#> 3 41 0 99 no
#> 4 89 0 99 no
#> 5 95 1 14 no
#> 6 5 1 14 no
# Tabulate how many rows were flagged "no" versus "yes".
table(a_b_df[["match"]])
#>
#> no yes
#> 392 8