Question

我想比较两个数据框。一个数据框有 400k 个观测值，另一个有 100k 个。我想将较短数据框中的每个观测值与较长数据框中对应的 4 个观测值进行比较。换句话说，将 b（较短的数据框）中的第一个观测值与 a（较长的数据框）中的前 4 个观测值比较，将 b 中的第二个观测值与 a 中的第二组 4 个观测值比较，依此类推。我想统计匹配的数量。

# Count the rows of `a` (the long data frame) whose value equals the value in
# the corresponding row of `b` (the short data frame): row i of `b` is compared
# with rows (4 * i - 3) to (4 * i) of `a`.
# NOTE(review): assumes the values to compare are in the first column of both
# `a` and `b` -- confirm against the real data.
c <- 0 # running count of matches
d <- 1 # first row of the current 4-row window in `a`
e <- 4 # last row of the current 4-row window in `a`

# Loop over the values of b's first column; `for (x in b)` would iterate over
# whole columns of the data frame instead of over observations.
for (x in b[[1]]) {
  # `==` compares (a single `=` is assignment and does not parse here);
  # sum() counts how many of the 4 rows match, and the result has to be
  # assigned back to `c`, otherwise the count is thrown away.
  c <- c + sum(a[[1]][d:e] == x, na.rm = TRUE)
  # move the window to the next 4 rows of `a`
  d <- d + 4
  e <- e + 4
}

Answer 1

我在下面尝试解决你的问题，但这有点困难，因为你的问题描述得有些模糊。请查看有关如何撰写好问题的指南。

我希望这段代码可以帮助您走上正确的轨道！

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union

# build the two example data frames: 400 rows for the long one and 100 rows
# for the short one, each holding random integers drawn from 1..100
set.seed(123)
a_large_df <- data.frame(
  sample_a = sample.int(100, size = 400, replace = TRUE)
)
head(a_large_df)
#>   sample_a
#> 1       29
#> 2       79
#> 3       41
#> 4       89
#> 5       95
#> 6        5
b_small_df <- data.frame(
  sample_b = sample.int(100, size = 100, replace = TRUE)
)
head(b_small_df)
#>   sample_b
#> 1       99
#> 2       14
#> 3       91
#> 4       58
#> 5       40
#> 6       45

# give every block of 4 consecutive rows the same 0-based group index
# (rows 1-4 -> 0, rows 5-8 -> 1, ...).
# row_number() numbers the rows of the data being piped in, so the code does
# not refer back to `a_large_df` by name and also works on a zero-row data
# frame, where seq(nrow(df)) would yield c(1, 0).
# The literals 1 and 4 are doubles, so the index column is a double.
a_large_df <- a_large_df %>%
  mutate(group_of_4_index = (row_number() - 1) %/% 4)
head(a_large_df)
#>   sample_a group_of_4_index
#> 1       29                0
#> 2       79                0
#> 3       41                0
#> 4       89                0
#> 5       95                1
#> 6        5                1

# give every row a 0-based index so that row i of the short data frame lines
# up with the i-th group of 4 rows in the long data frame.
# row_number() numbers the rows of the data being piped in, so the code does
# not refer back to `b_small_df` by name and also works on a zero-row data
# frame, where seq(nrow(df)) would yield c(1, 0).
# Subtracting the double 1 makes the index column a double.
b_small_df <- b_small_df %>%
  mutate(group_of_4_index = row_number() - 1)
head(b_small_df)
#>   sample_b group_of_4_index
#> 1       99                0
#> 2       14                1
#> 3       91                2
#> 4       58                3
#> 5       40                4
#> 6       45                5

# attach to every row of the long data frame the sample_b value that shares
# its group index
a_b_df <- a_large_df %>%
  left_join(b_small_df, by = "group_of_4_index")
head(a_b_df)
#>   sample_a group_of_4_index sample_b
#> 1       29                0       99
#> 2       79                0       99
#> 3       41                0       99
#> 4       89                0       99
#> 5       95                1       14
#> 6        5                1       14

# flag each row: "yes" when its sample_a value is found among the sample_b
# values of the same group, "no" otherwise.
# The grouping is only needed for the per-group %in% test, so it is dropped
# again with ungroup(); otherwise later verbs on a_b_df would silently keep
# operating per group.
a_b_df <- a_b_df %>%
  group_by(group_of_4_index) %>%
  mutate(match = if_else(sample_a %in% sample_b, "yes", "no")) %>%
  ungroup()
head(a_b_df)
#> # A tibble: 6 x 4
#>   sample_a group_of_4_index sample_b match
#>      <int>            <dbl>    <int> <chr>
#> 1       29                0       99    no
#> 2       79                0       99    no
#> 3       41                0       99    no
#> 4       89                0       99    no
#> 5       95                1       14    no
#> 6        5                1       14    no

# count how many rows matched and how many did not
table(a_b_df$match)
#> 
#>  no yes 
#> 392   8

在 R 中将每个观测值与每 n 个观测值进行比较

1 个答案: