使用一个Dataframe列的元素选择另一个Dataframe中的值以使用Tidyverse创建第三个数据帧

时间:2018-01-01 21:38:44

标签: r tidyverse

library(tidyverse)

下面提供的代码会创建三个数据框 - Main,LookUp和Final。我正在尝试使用Main和LookUp数据帧来创建Final数据帧。

例如,Final表仅保留具有LookUp表的Section_Lookup中提供的数字的“Sections”,同时还保留相应的“Title”变量。

我想尽可能多地使用tidyverse。我的大多数尝试都遵循下面的代码。我认为使用两个循环或purrr将允许我循环遍历Main和LookUp表。这比我通常尝试的更先进,所以我想在如何继续和处理这种情况方面提供一些帮助。

New<-map(Main, function(x) {
map(LookUp, function(y) if_else(x$Title1==y$Title_Lookup & ...x$Section1 CONTAINS Y SECTION_LOOKUP... ) )}),

示例代码如下:

主数据框:

    # Sample "Main" table: 29 rows pairing a Title1 code with a Section1 label.
    # In this sample every Section1 label is a digit followed by a letter.
    Title1 <- c(
      "101A", "101A", "101A", "101A", "101A", "101A",
      "203S", "203S", "203S", "203S", "203S", "203S", "203S",
      "203S", "203S", "203S", "203S", "203S", "203S",
      "400B", "400B", "400B", "400B",
      "200A", "200A",
      "250D", "250D", "250D", "250D"
    )
    Section1 <- c(
      "2A", "2A", "2B", "2B", "2B", "2C",
      "2A", "2A", "4A", "4A", "4A", "4B", "4B",
      "4C", "4C", "4C", "4C", "4D", "4D",
      "2A", "2A", "2B", "2B",
      "2A", "6A",
      "1A", "1B", "2A", "2A"
    )
    # tibble() is the supported constructor; data_frame() is deprecated.
    Main <- tibble(Title1, Section1)

LookUp表:

# Sample "LookUp" table: each row names a title and one numeric section
# prefix that should be kept for that title.
Title_Lookup <- c("101A", "203S", "203S", "400B", "200A", "200A", "250D")
Section_Lookup <- c(2, 2, 4, 2, 2, 6, 2)
# tibble() is the supported constructor; data_frame() is deprecated.
LookUp <- tibble(Title_Lookup, Section_Lookup)

最终数据框:

# Expected "Final" table: the 13 unique title/section pairs from Main whose
# section number is listed for that title in LookUp.
Section_Final <- c(
  "2A", "2B", "2C",
  "2A", "4A", "4B", "4C", "4D",
  "2A", "2B",
  "2A", "6A",
  "2A"
)
Title_Final <- c(
  "101A", "101A", "101A",
  "203S", "203S", "203S", "203S", "203S",
  "400B", "400B",
  "200A", "200A",
  "250D"
)
# tibble() is the supported constructor; data_frame() is deprecated.
Final <- tibble(Title_Final, Section_Final)

3 个答案:

答案 0 :(得分:1)

使用dplyr的解决方案。str_replace函数来自stringr包,它是tidyverse的一部分。如果您只想加载dplyr包,则可以使用sub("\\D+$", "", Section1)代替str_replace。

library(tidyverse)
# Derive each section's numeric part, keep only the Main rows whose
# (title, number) pair is listed in LookUp, and reduce the result to the
# unique title/section combinations under the Final column names.
Main2 <- Main %>%
  mutate(
    Number = Section1 %>%
      str_replace("\\D+$", "") %>%  # drop the trailing non-digit suffix
      as.numeric()
  ) %>%
  semi_join(
    LookUp,
    by = c(Title1 = "Title_Lookup", Number = "Section_Lookup")
  ) %>%
  transmute(Title_Final = Title1, Section_Final = Section1) %>%
  distinct()
Main2
# # A tibble: 13 x 2
#    Title_Final Section_Final
#    <chr>       <chr>        
#  1 101A        2A           
#  2 101A        2B           
#  3 101A        2C           
#  4 203S        2A           
#  5 203S        4A           
#  6 203S        4B           
#  7 203S        4C           
#  8 203S        4D           
#  9 400B        2A           
# 10 400B        2B           
# 11 200A        2A           
# 12 200A        6A           
# 13 250D        2A  

答案 1 :(得分:0)

这是一个基于sqldf包的解决方案,利用charindex()查看Section_Lookup中的字符串是否出现在Section1中。

library(tidyverse)
library(sqldf)  # provides sqldf(); the snippet calls it, so it must be loaded

# Sample "Main" table: 29 rows pairing a Title1 code with a Section1 label.
Title1 <- c(
  "101A", "101A", "101A", "101A", "101A", "101A",
  "203S", "203S", "203S", "203S", "203S", "203S", "203S",
  "203S", "203S", "203S", "203S", "203S", "203S",
  "400B", "400B", "400B", "400B",
  "200A", "200A",
  "250D", "250D", "250D", "250D"
)
Section1 <- c(
  "2A", "2A", "2B", "2B", "2B", "2C",
  "2A", "2A", "4A", "4A", "4A", "4B", "4B",
  "4C", "4C", "4C", "4C", "4D", "4D",
  "2A", "2A", "2B", "2B",
  "2A", "6A",
  "1A", "1B", "2A", "2A"
)
# tibble() is the supported constructor; data_frame() is deprecated.
Main <- tibble(Title1, Section1)

# Sample "LookUp" table. Section_Lookup is stored as character here because
# the query below searches for it as a substring of Section1.
Title_Lookup <- c("101A", "203S", "203S", "400B", "200A", "200A", "250D")
Section_Lookup <- as.character(c(2, 2, 4, 2, 2, 6, 2))
LookUp <- tibble(Title_Lookup, Section_Lookup)

# Select the distinct title/section pairs from Main for which LookUp has a row
# with the same title and a Section_Lookup string occurring inside Section1.
# The join has no ON clause; the WHERE conditions do the matching.
# NOTE(review): charindex() matches the substring at any position, so a
# lookup value of "2" would also match a label such as "12A".
sqlQuery <- "select distinct a.Title1 as Title, a.Section1 as Section
             from Main as a
             left join LookUp as b
             where
             a.Title1 = b.Title_Lookup and
             charindex(b.Section_Lookup,a.Section1) > 0"
sqldf(sqlQuery)

...和输出。

> sqldf(sqlQuery)
   Title Section
1   101A      2A
2   101A      2B
3   101A      2C
4   203S      2A
5   203S      4A
6   203S      4B
7   203S      4C
8   203S      4D
9   400B      2A
10  400B      2B
11  200A      2A
12  200A      6A
13  250D      2A
>

答案 2 :(得分:0)

另一种方法可能只基于Section列。

library(dplyr)

# Sample "Main" table: 29 rows with a name, a numeric code, a title and a
# section label.
Name1 <- paste0("Name", seq_len(29))
Code <- c(
  10123, 13432, 34554, 45563, 43666, 54444, 55322, 52111, 33443, 88998, 54554,
  33455, 65889, 88888, 22344, 54455, 66655, 22222, 65564, 77677, 65545, 67765,
  34334, 88789, 76776, 67765, 55555, 65445, 65665
)
Title1 <- c(
  "101A", "101A", "101A", "101A", "101A", "101A",
  "203S", "203S", "203S", "203S", "203S", "203S", "203S",
  "203S", "203S", "203S", "203S", "203S", "203S",
  "400B", "400B", "400B", "400B",
  "200A", "200A",
  "250D", "250D", "250D", "250D"
)
Section1 <- c(
  "2A", "2A", "2B", "2B", "2B", "2C",
  "2A", "2A", "4A", "4A", "4A", "4B", "4B",
  "4C", "4C", "4C", "4C", "4D", "4D",
  "2A", "2A", "2B", "2B",
  "2A", "6A",
  "1A", "1B", "2A", "2A"
)
# tibble() (re-exported by dplyr) replaces the deprecated data_frame().
Main <- tibble(Name1, Code, Title1, Section1)

# Sample "LookUp" table of title / numeric section pairs.
Title_Lookup <- c("101A", "203S", "203S", "400B", "200A", "200A", "250D")
Section_Lookup <- c(2, 2, 4, 2, 2, 6, 2)
LookUp <- tibble(Title_Lookup, Section_Lookup)

# Distinct section numbers that appear anywhere in LookUp.
df_sections <- distinct(LookUp, Section_Lookup)

# Keep the Main rows whose leading section number is one of those values, then
# reduce to the unique title/section pairs.
# The whole leading digit run is captured, so a label like "12A" yields 12.
# The filter refers to df_sections; `df` is not defined by this script.
# NOTE(review): this matches on the section number only and ignores the title,
# so it agrees with the per-title lookup only for data like this sample.
Main %>%
  filter(
    as.numeric(sub("^([0-9]+).*$", "\\1", Section1)) %in%
      df_sections$Section_Lookup
  ) %>%
  distinct(Title1, Section1)

#The result:
# A tibble: 13 x 2
   Title1 Section1
    <chr>    <chr>
 1   101A       2A
 2   101A       2B
 3   101A       2C
 4   203S       2A
 5   203S       4A
 6   203S       4B
 7   203S       4C
 8   203S       4D
 9   400B       2A
10   400B       2B
11   200A       2A
12   200A       6A
13   250D       2A