匹配两个不同大小的数据集中的列,并按条件删除行

时间:2018-08-31 20:50:02

标签: r

我有两个行数不同的数据集。对于 ID 相同的记录，我想比较某一列的值，并删除其中值较小的行。例如，对于下面的 dataset_1 和 dataset_2，我想按 case.id 比较 time 列的值，并从 dataset_2 中删除那些 time 小于 dataset_1 中对应 time 的行。

dataset_1 <-    case.id time
             1    xxx1    1
             2    xxx2    2
             3    xxx3    3
dataset_2 <-    case.id distance time
             1    xxx1      100  0.8
             2    xxx1       50  1.2
             3    xxx1       40  2.0
             4    xxx2       50  3.0
             5    xxx2       40  4.0
             6    xxx3      100  2.5
             7    xxx3       50  3.0
             8    xxx3      100  3.5
             9    xxx3       50  5.0

我的预期结果应该是这样

new_dataset_2  <-   case.id distance time
                  1    xxx1       50  1.2
                  2    xxx1       40  2.0
                  3    xxx2       50  3.0
                  4    xxx2       40  4.0
                  5    xxx3       50  3.0
                  6    xxx3      100  3.5
                  7    xxx3       50  5.0

数据

# dataset_1: one row per case.id with an integer time.
dataset_1 <- data.frame(
  case.id = c("xxx1", "xxx2", "xxx3"),
  time = 1:3,
  row.names = c("1", "2", "3"),
  stringsAsFactors = FALSE
)

# dataset_2: several rows per case.id, each with an integer distance and a
# numeric time.
dataset_2 <- data.frame(
  case.id = rep(c("xxx1", "xxx2", "xxx3"), times = c(3, 2, 4)),
  distance = c(100L, 50L, 40L, 50L, 40L, 100L, 50L, 100L, 50L),
  time = c(0.8, 1.2, 2, 3, 4, 2.5, 3, 3.5, 5),
  row.names = as.character(1:9),
  stringsAsFactors = FALSE
)

2 个答案:

答案 0 :(得分:1)

您可以先用 merge 合并两个数据框，然后根据您的条件对合并结果取子集。

# Inner-join the two data frames on case.id. Both inputs have a "time" column,
# so merge() suffixes them: time.x comes from dataset_2, time.y from dataset_1.
df_out <- merge(dataset_2, dataset_1, by = "case.id")

# Logical vector: TRUE where the dataset_2 time is at least the dataset_1 time.
idx <- df_out$time.x >= df_out$time.y

# Keep the qualifying rows and only the columns that came from dataset_2.
df_out <- df_out[idx, c("case.id", "distance", "time.x")]

# Give the columns dataset_2's names again (time.x becomes time).
names(df_out) <- names(dataset_2)
df_out
#  case.id distance time
#2    xxx1       50  1.2
#3    xxx1       40  2.0
#4    xxx2       50  3.0
#5    xxx2       40  4.0
#7    xxx3       50  3.0
#8    xxx3      100  3.5
#9    xxx3       50  5.0

答案 1 :(得分:0)

merge 始终是一个不错的选择。另一个选择是 match，可以用它构造逻辑向量来取子集。

# Reference time from dataset_1 for every row of dataset_2, looked up by
# case.id. match() returns NA for a case.id that is absent from dataset_1, so
# the reference time is NA for such rows.
ref_time <- dataset_1$time[match(dataset_2$case.id, dataset_1$case.id)]

# Rows to remove: time strictly below the reference time. The comparison is NA
# when either side is missing.
too_early <- dataset_2$time < ref_time

# An NA in a logical row index makes `[` return an all-NA row, so rows that
# cannot be compared are kept rather than letting NA reach the subscript.
dataset_2[is.na(too_early) | !too_early, ]

  case.id distance time
2    xxx1       50  1.2
3    xxx1       40  2.0
4    xxx2       50  3.0
5    xxx2       40  4.0
7    xxx3       50  3.0
8    xxx3      100  3.5
9    xxx3       50  5.0

下面逐步拆解这行代码的执行过程：

# Console transcript: each expression is followed by the output it printed.
# Step 1: look up where each dataset_2 case.id occurs in dataset_1$case.id.
match(dataset_2$case.id, dataset_1$case.id) # position in dataset_1$case.id of each case.id from dataset_2
[1] 1 1 1 2 2 3 3 3 3

# Step 2: index dataset_1$time by those positions to get one reference time per
# row of dataset_2. In this data the positions and the times are the same
# numbers (time is 1:3), so the output looks like step 1; the dataset_3 /
# dataset_4 example further down uses different numbers.
dataset_1$time[match(dataset_2$case.id, dataset_1$case.id)] # reference time for each row of dataset_2
[1] 1 1 1 2 2 3 3 3 3

# Step 3: compare each row's own time with its reference time.
dataset_2$time >= dataset_1$time[match(dataset_2$case.id, dataset_1$case.id)] # logical vector, TRUE where the row's time is at least the reference time
[1] FALSE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE

最后，用这个逻辑向量对 dataset_2 取子集。

下面用一组不同的数值再举一个例子，可以更清楚地说明这一点。

# Second example: here the times in dataset_3 differ from the row positions,
# so the effect of the lookup is easier to see.
dataset_3 <- data.frame(
  case.id = c("xxx1", "xxx2", "xxx3"),
  time = c(16, 0.2, 2.15)
)

dataset_4 <- data.frame(
  case.id = rep(c("xxx1", "xxx2", "xxx3"), each = 3),
  distance = c(100, 50, 40, 50, 40, 100, 50, 100, 50),
  time = c(17, 0.1, 1.2, 17, 4.0, 2.5, 1.3, 8, 0.1)
)

# Time from dataset_3 for each row of dataset_4, looked up by case.id.
threshold <- dataset_3$time[match(dataset_4$case.id, dataset_3$case.id)]

# Keep only the rows whose time is at least that looked-up time.
dataset_4[dataset_4$time >= threshold, ]

  case.id distance time
1    xxx1      100 17.0
4    xxx2       50 17.0
5    xxx2       40  4.0
6    xxx2      100  2.5
8    xxx3      100  8.0