根据R中的循环内两个数据帧中的匹配值进行过滤

时间:2020-09-14 09:43:40

标签: r for-loop

我有两个数据帧:df1 和 df2。

# df1: sample data with one row per (Speaker, age, word).
# In every row below, ID is the Speaker and age joined as "<Speaker>_<age>".
df1 <- tribble(~Speaker, ~age, ~word, ~ID,
               "Alex", 10, "cat", "Alex_10",
               "Alex", 10, "dog", "Alex_10",
               "Alex", 10, "car", "Alex_10",
               "Alex", 11, "sheep", "Alex_11",
               "Alex", 11, "box", "Alex_11",
               "Alex", 11, "cup", "Alex_11",
               "Bob", 10, "cat",  "Bob_10",
               "Bob", 10, "dog",  "Bob_10",
               "Bob", 10, "car",  "Bob_10",
               "Bob", 11, "sheep",  "Bob_11",
               "Bob", 11, "box",  "Bob_11",
               "Bob", 11, "cup", "Bob_11")

# df2: sample data with one row per ordered (word1, word2) pair within a
# Speaker/age. In the rows below, word_pair is the two words in alphabetical
# order joined by "_", so both orderings of a pair share the same word_pair
# and the same value.
df2 <- tribble(~Speaker, ~age, ~word1, ~word2, ~word_pair, ~ID, ~value,
               "Alex", 10, "cat", "dog", "cat_dog", "Alex_10", 23,
               "Alex", 10, "cat", "car", "car_cat", "Alex_10", 12,
               "Alex", 10, "dog", "cat", "cat_dog", "Alex_10", 23,
               "Alex", 10, "dog", "car", "car_dog", "Alex_10", 25,
               "Alex", 10, "car", "dog", "car_dog", "Alex_10", 25,
               "Alex", 10, "car", "cat", "car_cat", "Alex_10", 12,
               "Alex", 11, "box", "sheep", "box_sheep", "Alex_11", 56,
               "Alex", 11, "box", "cup", "box_cup", "Alex_11", 34, 
               "Alex", 11, "sheep", "box", "box_sheep", "Alex_11", 56,
               "Alex", 11, "sheep", "cup", "cup_sheep", "Alex_11", 21,
               "Alex", 11, "cup", "box", "box_cup", "Alex_11", 34,
               "Alex", 11, "cup", "sheep", "cup_sheep", "Alex_11", 21,
               "Bob", 10, "cat", "dog", "cat_dog", "Bob_10", 11,
               "Bob", 10, "cat", "car", "car_cat", "Bob_10", 87,
               "Bob", 10, "dog", "cat", "cat_dog", "Bob_10", 11,
               "Bob", 10, "dog", "car", "car_dog", "Bob_10", 45,
               "Bob", 10, "car", "cat", "car_cat", "Bob_10", 87,
               "Bob", 10, "car", "dog", "car_dog", "Bob_10", 45,
               "Bob", 11, "sheep", "box", "box_sheep", "Bob_11", 32,
               "Bob", 11, "sheep", "cup", "cup_sheep", "Bob_11", 24,
               "Bob", 11, "box", "cup", "box_cup", "Bob_11", 65,
               "Bob", 11, "box", "sheep", "box_sheep", "Bob_11", 32,
               "Bob", 11, "cup", "box", "box_cup", "Bob_11", 65,
               "Bob", 11, "cup", "sheep", "cup_sheep", "Bob_11", 24)

我想循环遍历 df1,为每个 ID(即每个发言人在每个时间点)创建一个新的数据框并保存到列表中:提取该发言人在该时间点产生的每个单词,并将其与 df2 中的 word_pair 值进行匹配。我创建了如下循环:

 # Attempt to build one filtered tibble per ID and store it in a list.
 # NOTE(review): length(df1) on a data frame is its number of columns
 # (4 here), not the number of unique IDs.
 value_list <- vector("list", length(df1)) 

      # i takes each distinct ID string in turn (e.g. "Alex_10").
      for (i in unique(df1$ID)) {
       value_words <- df2 %>%                             
       group_by(Speaker, age) %>%
       # NOTE(review): df1$Speaker[which(df1$ID == i)] has one element per
       # df1 row with this ID, not a single value. The `==` result then has
       # the length of the longer operand, while filter() needs a condition
       # of the group size or 1 -- this appears to be the source of the
       # size error quoted in the question.
       filter(Speaker == df1$Speaker[which(df1$ID == i)] & 
              age == df1$age[which(df1$ID == i)]) %>%
       # Tests membership against every word in df1, not only the words
       # belonging to the current ID.
       filter((word1 %in% df1$word |                 
               word2 %in% df1$word) &
               value <= 50) %>%                 
      # Keep the first row for each word_pair (within the current grouping).
      distinct(word_pair, .keep_all = T)
      # i is a character name, so this assigns a named element; it does not
      # fill the unnamed slots preallocated above.
      value_list[[i]] <- value_words 
        }

预期输出如下:

 value_list[["Alex_10"]]

 # A tibble: 1 x 9
 # Groups:   Speaker, age [1]
   word1 word2 value Speaker age  word_pair ID
  <chr>  <chr> <dbl> <chr>  <dbl> <chr>     <chr>      
   cat    dog    23  Alex   10    cat_dog   Alex_10
   cat    car    12  Alex   10    car_cat   Alex_10
   car    dog    25  Alex   10    car_dog   Alex_10

也就是说,每一对单词组合只列出一次。

但是出现以下错误:

错误:`filter()` 的输入 `..1` 出现问题。x 输入 `..1` 的大小必须为 4 或 1,而不是 5。i 输入 `..1` 为 `&...`。i 错误发生在第 1 组:Speaker = "Alex",age = 10。

显然,问题出在filter()上,但我不知道如何更改它并仍然获得所需的输出。

1 个答案:

答案 0 :(得分:0)

以下代码使用 left_join、filter(来自 dplyr)以及最后的 split(来自 base)即可满足您的要求:

library(dplyr)

# Pair every df1 word with the df2 rows that share its Speaker, age and ID,
# keep the pairs that involve that word and have value <= 50, and retain
# one row per word pair within each ID.
df <- df1 %>%
  # Each df1 row matches several df2 rows on the shared keys.
  left_join(df2, by = c("Speaker", "age", "ID")) %>%
  filter(
    value <= 50,
    word == word1 | word == word2
  ) %>%
  # df2 lists every pair in both orders; keep the first row per ID/word_pair.
  # Spelling out TRUE avoids relying on `T`, which can be reassigned.
  distinct(ID, word_pair, .keep_all = TRUE)

# One tibble per ID, returned as a named list.
split(df, df$ID)

其输出为:

$Alex_10
# A tibble: 3 x 8
  Speaker   age word  ID      word1 word2 word_pair value
  <chr>   <dbl> <chr> <chr>   <chr> <chr> <chr>     <dbl>
1 Alex       10 cat   Alex_10 cat   dog   cat_dog      23
2 Alex       10 cat   Alex_10 cat   car   car_cat      12
3 Alex       10 dog   Alex_10 dog   car   car_dog      25

$Alex_11
# A tibble: 2 x 8
  Speaker   age word  ID      word1 word2 word_pair value
  <chr>   <dbl> <chr> <chr>   <chr> <chr> <chr>     <dbl>
1 Alex       11 sheep Alex_11 sheep cup   cup_sheep    21
2 Alex       11 box   Alex_11 box   cup   box_cup      34

$Bob_10
# A tibble: 2 x 8
  Speaker   age word  ID     word1 word2 word_pair value
  <chr>   <dbl> <chr> <chr>  <chr> <chr> <chr>     <dbl>
1 Bob        10 cat   Bob_10 cat   dog   cat_dog      11
2 Bob        10 dog   Bob_10 dog   car   car_dog      45

$Bob_11
# A tibble: 2 x 8
  Speaker   age word  ID     word1 word2 word_pair value
  <chr>   <dbl> <chr> <chr>  <chr> <chr> <chr>     <dbl>
1 Bob        11 sheep Bob_11 sheep box   box_sheep    32
2 Bob        11 sheep Bob_11 sheep cup   cup_sheep    24