根据最接近的较早日期合并数据集

时间:2018-05-31 08:55:55

标签: r date merge

# Maintenance log: one row per machine, the week it was serviced and the
# manual that was used. Weeks are strings (presumably year + week number).
Machines <- c("A", "A", "B", "C", "D", "A", "B")
Week <- c("201651", "201635", "201714", "201516", "201438", "201704", "201651")
Manual.Code <- c("M123", "M467", "M123", "M846", "M898", "QQQQ", "M898")
# The vector is called Manual.Code; passing an undefined `Manual` object
# makes data.frame() fail with "object 'Manual' not found".
DF1 <- data.frame(Machines, Week, Manual.Code)

# Manual releases: one row per manual version and the week it was released.
# Manual.Code is reassigned on purpose; DF1 already holds its own copy.
Manual.Code <- c("M123", "M123", "M123", "CTTY", "M467", "M.99", "M846", "M898")
Version <- as.numeric(c("2.0", "1.1", "1.0", "2.0", "1.6", "1.67", "3.2", "2.4"))
Release_week <- c(
  "201711", "201638", "201612", "201424", "201552", "201345",
  "201719", "201647"
)
# Base data.frame() (not the deprecated data_frame()) so that no package has
# to be attached and DF2 has the same class as DF1.
DF2 <- data.frame(Manual.Code, Version, Release_week)

我已经使用R几个月了,但我仍然认为自己是初学者。我的问题是:机器在某个时间点按照手册进行维护,我想知道当时所用手册的版本。DF1是我的机器(含维护周和所用手册),DF2是各手册的版本及其发布周。

DF1:

Type  Week    Manual.Code
A     201651  M123
A     201635  M467
B     201714  M123
C     201516  M846
D     201438  M898
A     201704  QQQQ
B     201651  M898

DF2: 

Manual.Code Version Release_week
M123        2.0     201711
M123        1.1     201638
M123        1.0     201612
CTTY        2.0     201424
M467        1.6     201552
M.99        1.67    201345
M846        3.2     201719 
M898        2.4     201647

Desired output DF3: 
Type  Week    Manual.Code  Used.Version
A     201651  M123         1.1 
A     201635  M467         1.6
B     201714  M123         2.0
C     201516  M846         NA
D     201438  M898         NA
A     201704  QQQQ         NA
B     201651  M898         2.4

我已经看过其他帖子了,我认为解决方案可能是滚动连接(rolling join)、data.table 的非等值连接,或者 fuzzyjoin 包提供的模糊连接,但我还没能让其中任何一种方案成功运行。合并这两个数据集的最佳方法是什么?

P.S. 真实数据集的大小:

DF1: 35000 x 43
DF2: 217000 x 14

3 个答案:

答案 0 :(得分:3)

使用 dplyr 的 left_join 连接两个数据集,然后按两个周数之差的绝对值( abs )排序( arrange ),取每组的第一行( slice ),最后在 Release_week 晚于 Week 的行中将 Version 设为 NA。

library(dplyr)

# For every DF1 row, attach the manual version that applied in that week:
# the most recent release that is not later than Week (NA if there is none).
DF1 %>%
  # Row id so that each original DF1 row forms its own group after the join
  mutate(rn = row_number()) %>%
  left_join(DF2, by = "Manual.Code") %>%
  #if you have different column names use this:
  #left_join(DF2, by = c("Manual" = "Manual.Code")) %>%
  group_by(rn) %>%
  # Releases that are not later than Week sort first (FALSE before TRUE);
  # among those, the smallest absolute gap is the latest applicable release.
  # Sorting on the absolute gap alone could rank a future release ahead of
  # an applicable earlier one, which the last step would then blank to NA.
  arrange(rn, Release_week > Week, abs(Release_week - Week)) %>%
  slice(1) %>%
  ungroup() %>%
  select(-rn) %>%
  # Only a future release (or no release at all) was found: no version
  mutate(Version = ifelse(Release_week > Week, NA, Version))

# # A tibble: 7 x 5
#   Type    Week Manual.Code Version Release_week
#   <chr>  <int> <chr>         <dbl>        <int>
# 1 A     201651 M123            1.1       201638
# 2 A     201635 M467            1.6       201552
# 3 B     201714 M123            2         201711
# 4 C     201516 M846           NA         201719
# 5 D     201438 M898           NA         201647
# 6 A     201704 QQQQ           NA             NA
# 7 B     201651 M898            2.4       201647

示例输入数据:

# Machine usage records for the worked example, parsed from an inline
# whitespace-separated table; text columns are kept as character.
DF1 <- read.table(
  header = TRUE,
  stringsAsFactors = FALSE,
  text = "
Type  Week    Manual.Code
A     201651  M123
A     201635  M467
B     201714  M123
C     201516  M846
D     201438  M898
A     201704  QQQQ
B     201651  M898
"
)

# Manual release records for the worked example, parsed from an inline
# whitespace-separated table; text columns are kept as character.
DF2 <- read.table(
  header = TRUE,
  stringsAsFactors = FALSE,
  text = "
Manual.Code Version Release_week
M123        2.0     201711
M123        1.1     201638
M123        1.0     201612
CTTY        2.0     201424
M467        1.6     201552
M.99        1.67    201345
M846        3.2     201719 
M898        2.4     201647
"
)

答案 1 :(得分:1)

data.table包允许滚动连接

library(data.table)
# Convert both tables to data.table by reference, then look up each DF1 row
# in DF2: exact match on Manual.Code, and Week matched against Release_week
# with roll = Inf so the last release at or before Week is carried forward.
setDT(DF1)
setDT(DF2)
DF2[DF1, roll = Inf, on = .(Manual.Code, Release_week = Week)]
   Manual.Code Version Release_week Machines
1:        M123     1.1       201651        A
2:        M467     1.6       201635        A
3:        M123     2.0       201714        B
4:        M846      NA       201516        C
5:        M898      NA       201438        D
6:        QQQQ      NA       201704        A
7:        M898     2.4       201651        B

请注意,只有整数(integer)、双精度(double)或字符(character)列可以用于滚动连接。因此,请务必在调用 data.frame() 时使用 stringsAsFactors = FALSE 参数。

可以对结果进行微调,以符合OP的预期结果:

library(data.table)
# Rolling join of the usage rows (DF1) onto the manual releases (DF2); both
# tables are converted to data.table by reference first.
setDT(DF1)
setDT(DF2)
result <- DF2[DF1, roll = Inf, on = .(Manual.Code, Release_week = Week)]
# The join column carries DF1's Week values under DF2's column name, so
# rename it back, and label the looked-up version as the one in use.
setnames(
  result,
  old = c("Release_week", "Version"),
  new = c("Week", "Used.Version")
)
# DF1's columns first, in DF1's order; the remaining column follows.
setcolorder(result, neworder = names(DF1))
result
   Machines   Week Manual.Code Used.Version
1:        A 201651        M123          1.1
2:        A 201635        M467          1.6
3:        B 201714        M123          2.0
4:        C 201516        M846           NA
5:        D 201438        M898           NA
6:        A 201704        QQQQ           NA
7:        B 201651        M898          2.4

数据

请注意,这里使用的是OP提供的数据,但加上了 stringsAsFactors = FALSE

# Usage records: machine, week of use and manual code, all kept as character.
Machines <- c("A", "A", "B", "C", "D", "A", "B")
Week <- c("201651", "201635", "201714", "201516", "201438", "201704", "201651")
Manual.Code <- c("M123", "M467", "M123", "M846", "M898", "QQQQ", "M898")
DF1 <- data.frame(Machines, Week, Manual.Code, stringsAsFactors = FALSE)

# Manual releases: code, numeric version and release week (character).
# Manual.Code is reassigned here; DF1 already holds its own copy.
Manual.Code <- c("M123", "M123", "M123", "CTTY", "M467", "M.99", "M846", "M898")
Version <- as.numeric(c("2.0", "1.1", "1.0", "2.0", "1.6", "1.67", "3.2", "2.4"))
Release_week <- c(
  "201711", "201638", "201612", "201424", "201552", "201345",
  "201719", "201647"
)
DF2 <- data.frame(Manual.Code, Version, Release_week, stringsAsFactors = FALSE)

答案 2 :(得分:0)

我想我找到了解决问题的方法。不过,在只取之前的周时,我得到的输出与预期不同。我的做法是先用一个函数找出最接近且不晚于使用周的发布周,然后用该列和手册代码一起做 LEFT JOIN。

# Keep strings as character vectors when building the data frames below.
options(stringsAsFactors = FALSE)

# Usage records: machine type, week of use and the manual that was used.
df1 <- data.frame(
  type = c("A", "A", "B", "C", "D", "A", "B"),
  week = c(201651, 201635, 201714, 201516, 201438, 201704, 201651),
  manual.code = c("M123", "M467", "M123", "M846", "M898", "QQQQ", "M898")
)

# Manual releases: manual code, version label and release week.
df2 <- data.frame(
  manual.code = c(
    "M123", "M123", "M123", "CTTY", "M467", "M.99", "M846", "M898"
  ),
  version = c("2.0", "1.1", "1.0", "2.0", "1.6", "1.67", "3.2", "2.4"),
  # M467 is released in week 201552, as in the question's data. With 201652
  # the release would fall after its only use (201635) and that row would
  # wrongly end up without a version.
  release_week = c(
    201711, 201638, 201612, 201424, 201552, 201345, 201719, 201647
  )
)

# Working copy of the usage records with a placeholder column that will
# receive the matching release week for each row.
df3 <- df1
df3$closest_week <- NA

#' Find the latest release week of a manual that is not after a given week
#'
#' @param manual Manual code to look up (compared with `releases$manual.code`).
#' @param week Week of use; only releases with `release_week <= week` qualify.
#' @param releases Data frame with columns `manual.code` and `release_week`.
#'   Defaults to the global `df2`, so two-argument calls keep working.
#' @return A single release week, or `NA` when no release qualifies.
get_clostest_week <- function(manual, week, releases = df2) {
  eligible <- releases$manual.code == manual & releases$release_week <= week
  # which() drops NA comparisons, so incomplete rows never qualify
  manual_weeks <- releases$release_week[which(eligible)]
  if (length(manual_weeks) == 0) {
    return(NA)
  }
  # Every candidate is <= week, so the closest one is simply the largest.
  # max() always yields exactly one value, even if the same release week is
  # listed more than once; selecting on the minimal distance returned every
  # tied entry and broke the scalar assignment in the caller.
  max(manual_weeks)
}

# Fill in, row by row, the release week returned by get_clostest_week() for
# that row's manual and week (NA when the helper finds none).
for (i in seq_len(nrow(df3))) {
  df3$closest_week[i] <- get_clostest_week(df3$manual.code[i], df3$week[i])
}

# Left join on (manual code, matched release week) to pull in the version;
# all.x = TRUE keeps usage rows that have no matching release.
# The original call ended in ")))", two stray parentheses, and did not parse.
df3 <- merge(
  df3, df2,
  by.x = c("manual.code", "closest_week"),
  by.y = c("manual.code", "release_week"),
  all.x = TRUE
)

(由于对问题的误解,答案已被编辑。)