Question

我有两个数据框Table_1和Table_2，我需要向Table_2添加一列"Index"：与Table_1匹配的行取值1，其余行取值0。

基本上，我需要将Table_1中的"Pol"、"Cty"、"STATE"和"CRP"列与Table_2中的"Pol_No"、"CTY"、"STATE"和"CRP"列进行匹配。

我更喜欢data.table方法。

 Table_1:

 Pol      Cty    Avg    STATE   CRP
 85010    23     1123    MO     11
 75022    23     1123    MO     11
 35014   143     450     MO     11
.
.

Table_2:

STATE   CTY    CRP   Pol_No   Plan   Price 
AL      1      11    150410   90     4563
AL      1      21    45023    90     5402
MO    143      11    85010    90     2522
.
.

所需的输出如下。

Table_2:

STATE   CTY    CRP   Pol_No   Plan   Price  Index
AL      1      11    150410   90     4563     0
AL      1      21    45023    90     5402     0
MO    143      11    85010    90     2522     1
.
.

我该如何在R中实现这一点？

感谢您的帮助。

谢谢。

Answer 1

我认为这是直接的多列联接：

library(dplyr)

# Flag each row of t2 with Index = 1 when its (CTY, STATE, CRP) combination
# also occurs in t1, and 0 otherwise.
#
# - distinct() keeps one lookup row per key combination, so repeated keys in
#   t1 (e.g. the two Cty 23 / MO / 11 rows) cannot duplicate rows of t2.
# - Pol/Pol_No is intentionally not part of the key: in the sample data
#   policy 85010 is listed under Cty 23 in t1 but under CTY 143 in t2, so
#   keying on it would not reproduce the expected output.
t2 %>%
  left_join(
    t1 %>%
      transmute(CTY = Cty, STATE, CRP, Index = 1L) %>%
      distinct(),
    by = c("CTY", "STATE", "CRP")
  ) %>%
  # Rows of t2 without a partner in t1 come back with Index = NA; map to 0.
  mutate(Index = coalesce(Index, 0L))
#   STATE CTY CRP Pol_No Plan Price Index
# 1    AL   1  11 150410   90  4563     0
# 2    AL   1  21  45023   90  5402     0
# 3    MO 143  11  85010   90  2522     1

编辑

我一直在尝试学习data.table，以为我可以尝试一下。在我看来，有点笨拙，我敢肯定有一种方法可以简化它。

library(data.table)
# setDT() converts a data.frame to a data.table in place (by reference),
# so the result does not need to be assigned back.
setDT(t1)
setDT(t2)

为方便起见，将列名设置为相同（否则我不知道如何轻松实现）...一个是"Cty"，另一个是"CTY"。使它们相同。

# Rename by name instead of by position so the code keeps working if the
# column order changes; setnames() updates the table by reference.
setnames(t1, "Cty", "CTY")

现在，合并。

# Build the lookup (three key columns plus Index = 1) and join it to t2.
# unique() collapses repeated key combinations in t1 so a t2 row can match at
# most once; X[Y, on = ...] returns one row per row of Y (here: t2).
unique(t1[, .(CTY, STATE, CRP, Index = 1L)])[t2, on = c("CTY", "STATE", "CRP")]
#    CTY STATE CRP Index Pol_No Plan Price
# 1:   1    AL  11    NA 150410   90  4563
# 2:   1    AL  21    NA  45023   90  5402
# 3: 143    MO  11     1  85010   90  2522

注意：

第一个方括号仅选择三个连接列，并分配第四个Index；
实际联接在第二个括号中，第一个只是一个选择
data.table的许多操作（例如 := 和 set* 系列函数）会就地修改对象，但像这样的联接或选择不会：它只返回结果而不改变原始表；因此我们需要把结果保存下来（比如重新赋值给t2）

联接已完成……现在只需更新Index字段：在t1中找到匹配的行其值为1，否则为NA。

# Join the de-duplicated t1 keys onto t2 and store the result back in t2.
# unique() prevents repeated key combinations in t1 from duplicating t2 rows.
t2 <- unique(t1[, .(CTY, STATE, CRP, Index = 1L)])[
  t2,
  on = c("CTY", "STATE", "CRP")
]
# After the join Index is 1 for matched rows and NA otherwise; turn it into 1/0.
t2[, Index := as.integer(!is.na(Index))]
t2
#    CTY STATE CRP Index Pol_No Plan Price
# 1:   1    AL  11     0 150410   90  4563
# 2:   1    AL  21     0  45023   90  5402
# 3: 143    MO  11     1  85010   90  2522

数据：

# Sample tables from the question, parsed from whitespace-separated text.
# as.is = TRUE keeps STATE as character instead of converting it to a factor.
t1 <- utils::read.table(
  text = '
 Pol      Cty    Avg    STATE   CRP
 85010    23     1123    MO     11
 75022    23     1123    MO     11
 35014   143     450     MO     11',
  header = TRUE,
  as.is = TRUE
)
t2 <- utils::read.table(
  text = '
STATE   CTY    CRP   Pol_No   Plan   Price 
AL      1      11    150410   90     4563
AL      1      21    45023    90     5402
MO    143      11    85010    90     2522',
  header = TRUE,
  as.is = TRUE
)

Answer 2

这不是一个很好的解决方案，但它适用于data.table。您需要sqldf包，它对数据框（data.frame）和data.table都适用。

library(data.table)
library(sqldf)

# Sample data from the question.
df1 <- data.table(
  Pol   = c(85010, 75022, 35014),
  Cty   = c(23, 23, 143),
  Avg   = c(1123, 1123, 450),
  STATE = c("MO", "MO", "MO"),
  CRP   = c(11, 11, 11)
)

df2 <- data.table(
  STATE  = c("AL", "AL", "MO"),
  CTY    = c(1, 1, 143),
  CRP    = c(11, 21, 11),
  Pol_No = c(150410, 45023, 85010),
  Plan   = c(90, 90, 90),
  Price  = c(4563, 5402, 2522)
)

# Left join df2 against the DISTINCT policy numbers of df1, so that a policy
# appearing several times in df1 cannot duplicate rows of df2.
df <- sqldf("
  select df2.STATE, df2.CTY, df2.CRP, df2.Pol_No, df2.Plan, df2.Price, p.Pol
  from df2
  left join (select distinct Pol from df1) as p on p.Pol = df2.Pol_No
")

# Pol is NA for df2 rows whose policy number is absent from df1: index 0,
# otherwise index 1.
df$index <- as.numeric(!is.na(df$Pol))

# Drop the helper column that was only needed to detect matches.
df$Pol <- NULL

> df
  STATE CTY CRP Pol_No Plan Price index
1    AL   1  11 150410   90  4563     0
2    AL   1  21  45023   90  5402     0
3    MO 143  11  85010   90  2522     1

Answer 3

这是一个完全基于data.table的解决方案：

 # Merge on policy number, keeping every row of t2 (all.y = TRUE), then show
 # only the columns of interest. Cty (from t1) is kept to reveal the matches.
 merge(t1, t2, by.x = "Pol", by.y = "Pol_No", all.y = TRUE)[
   ,
   c("STATE.y", "CTY", "Cty", "CRP.y", "Pol", "Plan", "Price")
 ]
#-----
   STATE.y CTY Cty CRP.y    Pol Plan Price
1:      AL   1  NA    21  45023   90  5402
2:      MO 143  23    11  85010   90  2522
3:      AL   1  NA    11 150410   90  4563
#--------
# Same merge as above, stored in t3: every t2 row is kept and the t1 columns
# are attached where the policy number matches.
t3 <- merge(t1, t2, by.x = "Pol", by.y = "Pol_No", all.y = TRUE)[
  ,
  c("STATE.y", "CTY", "Cty", "CRP.y", "Pol", "Plan", "Price")
]
# Cty comes from t1, so it is NA for t2 rows that found no match there.
t3[, index := ifelse(is.na(Cty), 0, 1)]
t3
#--------
   STATE.y CTY Cty CRP.y    Pol Plan Price index
1:      AL   1  NA    21  45023   90  5402     0
2:      MO 143  23    11  85010   90  2522     1
3:      AL   1  NA    11 150410   90  4563     0

为了确定merge(...)之后各列的名称，我先查看了完整的合并结果：

# Full merge result, printed to inspect the suffixed column names (.x / .y).
merge(x = t1, y = t2, by.x = "Pol", by.y = "Pol_No", all.y = TRUE)
      Pol Cty  Avg STATE.x CRP.x STATE.y CTY CRP.y Plan Price
1:  45023  NA   NA    <NA>    NA      AL   1    21   90  5402
2:  85010  23 1123      MO    11      MO 143    11   90  2522
3: 150410  NA   NA    <NA>    NA      AL   1    11   90  4563

在R中使用两个数据帧建立索引的问题

3 个答案: