# Question data: DF1 lists machines with the manual used in a given week,
# DF2 lists every manual version with its release week.
# Week values are kept as character strings, exactly as posted.
Machines <- c("A", "A", "B", "C", "D", "A", "B")
Week <- c(
  "201651", "201635", "201714", "201516", "201438", "201704", "201651"
)
Manual.Code <- c("M123", "M467", "M123", "M846", "M898", "QQQQ", "M898")
# The third column is Manual.Code; there is no object called `Manual`.
DF1 <- data.frame(Machines, Week, Manual.Code, stringsAsFactors = FALSE)

# Manual.Code is deliberately re-assigned: it now holds DF2's key column.
Manual.Code <- c(
  "M123", "M123", "M123", "CTTY", "M467", "M.99", "M846", "M898"
)
Version <- as.numeric(
  c("2.0", "1.1", "1.0", "2.0", "1.6", "1.67", "3.2", "2.4")
)
Release_week <- c(
  "201711", "201638", "201612", "201424", "201552", "201345",
  "201719", "201647"
)
# Base data.frame() instead of the deprecated dplyr::data_frame(), so the
# snippet runs without loading any package.
DF2 <- data.frame(Manual.Code, Version, Release_week, stringsAsFactors = FALSE)
我已经使用 R 几个月了,但仍然认为自己是初学者。我的问题是:如何确定在某个时间点维护某类机器时所使用的手册版本。DF1 是我的机器数据,DF2 是各手册的版本及其发布周。
DF1:
Type Week Manual.Code
A 201651 M123
A 201635 M467
B 201714 M123
C 201516 M846
D 201438 M898
A 201704 QQQQ
B 201651 M898
DF2:
Manual.Code Version Release_week
M123 2.0 201711
M123 1.1 201638
M123 1.0 201612
CTTY 2.0 201424
M467 1.6 201552
M.99 1.67 201345
M846 3.2 201719
M898 2.4 201647
Desired output DF3:
Type Week Manual.Code Used.Version
A 201651 M123 1.1
A 201635 M467 1.6
B 201714 M123 2.0
C 201516 M846 NA
D 201438 M898 NA
A 201704 QQQQ NA
B 201651 M898 2.4
我已经看过其他帖子,认为解决方案可能是滚动连接(rolling join)、data.table 的非等值连接,或者 fuzzyjoin 包的模糊连接,但这几种方法我都没能成功实现。合并这两个数据集的最佳方法是什么?
P.S. 真实数据集的大小:
DF1: 35000 x 43
DF2: 217000 x 14
答案 0 :(得分:3)
使用 dplyr:先用 left_join 连接两个表,再按两个周之差的绝对值( abs )排序( arrange ),取每组的第一行( slice ),最后把 Release_week 晚于 Week 的行的 Version 设为 NA:
library(dplyr)

# For every DF1 row, pick the DF2 release of the same manual that was in use
# during `Week`: the nearest release that is not later than `Week`.
# Version is NA when the manual has no such release (or no release at all).
DF1 %>%
  # Row id, so each machine/week record forms its own group after the join.
  mutate(rn = row_number()) %>%
  left_join(DF2, by = "Manual.Code") %>%
  # if you have different column names use this:
  # left_join(DF2, by = c("Manual" = "Manual.Code")) %>%
  group_by(rn) %>%
  # Releases not later than Week sort first (FALSE < TRUE), nearest on top.
  # Ranking on the absolute gap alone would let a nearer *future* release win
  # over an older release that was actually in use, producing a wrong NA.
  arrange(rn, Release_week > Week, abs(Release_week - Week)) %>%
  slice(1) %>%
  ungroup() %>%
  select(-rn) %>%
  # If the best candidate is still a future release, nothing was available yet.
  mutate(Version = ifelse(Release_week > Week, NA, Version))
# # A tibble: 7 x 5
# Type Week Manual.Code Version Release_week
# <chr> <int> <chr> <dbl> <int>
# 1 A 201651 M123 1.1 201638
# 2 A 201635 M467 1.6 201552
# 3 B 201714 M123 2 201711
# 4 C 201516 M846 NA 201719
# 5 D 201438 M898 NA 201647
# 6 A 201704 QQQQ NA NA
# 7 B 201651 M898 2.4 201647
示例输入数据:
# Example machines table: machine type, week, and the manual used that week.
DF1 <- read.table(
  header = TRUE,
  stringsAsFactors = FALSE,
  text = "
Type Week Manual.Code
A 201651 M123
A 201635 M467
B 201714 M123
C 201516 M846
D 201438 M898
A 201704 QQQQ
B 201651 M898
"
)

# Example manuals table: manual code, version, and the week it was released.
DF2 <- read.table(
  header = TRUE,
  stringsAsFactors = FALSE,
  text = "
Manual.Code Version Release_week
M123 2.0 201711
M123 1.1 201638
M123 1.0 201612
CTTY 2.0 201424
M467 1.6 201552
M.99 1.67 201345
M846 3.2 201719
M898 2.4 201647
"
)
答案 1 :(得分:1)
data.table 包支持滚动连接:
library(data.table)
# Convert both tables to data.table by reference, then roll each DF1 week
# onto the prevailing (most recent earlier or equal) release of the same
# manual in DF2.
setDT(DF2)
setDT(DF1)
DF2[DF1, on = .(Manual.Code, Release_week = Week), roll = TRUE]
   Manual.Code Version Release_week Machines
1:        M123     1.1       201651        A
2:        M467     1.6       201635        A
3:        M123     2.0       201714        B
4:        M846      NA       201516        C
5:        M898      NA       201438        D
6:        QQQQ      NA       201704        A
7:        M898     2.4       201651        B
请注意,只有整数(integer)、双精度(double)或字符(character)类型的列可以进行滚动连接。因此,在调用 data.frame() 时请务必使用 stringsAsFactors = FALSE 参数。
可以对结果进行微调,以符合OP的预期结果:
library(data.table)
# Same rolling join as before, stored so it can be reshaped afterwards.
setDT(DF2)
setDT(DF1)
result <- DF2[DF1, on = .(Manual.Code, Release_week = Week), roll = TRUE]
# After the join, Release_week carries DF1's Week values, so rename it back
# and label the matched version as the one in use.
setnames(
  result,
  old = c("Release_week", "Version"),
  new = c("Week", "Used.Version")
)
# Put DF1's columns first, in DF1's order.
setcolorder(result, neworder = names(DF1))
result
   Machines   Week Manual.Code Used.Version
1:        A 201651        M123          1.1
2:        A 201635        M467          1.6
3:        B 201714        M123          2.0
4:        C 201516        M846           NA
5:        D 201438        M898           NA
6:        A 201704        QQQQ           NA
7:        B 201651        M898          2.4
请注意,这里使用的是 OP 提供的数据,但加上了 stringsAsFactors = FALSE:
# OP's data, built with stringsAsFactors = FALSE so the key and week columns
# stay character.
Machines <- c("A", "A", "B", "C", "D", "A", "B")
Week <- c(
  "201651", "201635", "201714", "201516", "201438", "201704", "201651"
)
Manual.Code <- c("M123", "M467", "M123", "M846", "M898", "QQQQ", "M898")
DF1 <- data.frame(
  Machines = Machines,
  Week = Week,
  Manual.Code = Manual.Code,
  stringsAsFactors = FALSE
)

# Manual.Code is re-assigned here: it now holds the manuals table's key column.
Manual.Code <- c(
  "M123", "M123", "M123", "CTTY", "M467", "M.99", "M846", "M898"
)
Version <- as.numeric(
  c("2.0", "1.1", "1.0", "2.0", "1.6", "1.67", "3.2", "2.4")
)
Release_week <- c(
  "201711", "201638", "201612", "201424", "201552", "201345",
  "201719", "201647"
)
DF2 <- data.frame(
  Manual.Code = Manual.Code,
  Version = Version,
  Release_week = Release_week,
  stringsAsFactors = FALSE
)
答案 2 :(得分:0)
我想我找到了解决你问题的方法。不过,由于只取之前的周,我得到的输出与你的有所不同。我的做法是:先用一个函数找出最接近的前一个发布周,然后以该列和手册代码为键做 LEFT JOIN。
options(stringsAsFactors = FALSE)

# Machines and the manual each one used in a given week.
df1 <- data.frame(
  type = c("A", "A", "B", "C", "D", "A", "B"),
  week = c(201651, 201635, 201714, 201516, 201438, 201704, 201651),
  manual.code = c("M123", "M467", "M123", "M846", "M898", "QQQQ", "M898")
)

# Manual versions and the week each one was released.
df2 <- data.frame(
  manual.code = c(
    "M123", "M123", "M123", "CTTY", "M467", "M.99", "M846", "M898"
  ),
  version = c("2.0", "1.1", "1.0", "2.0", "1.6", "1.67", "3.2", "2.4"),
  # M467 is 201552, as in the question's DF2 (it had been mistyped as 201652).
  release_week = c(
    201711, 201638, 201612, 201424, 201552, 201345, 201719, 201647
  )
)

# Return the latest release week of `manual` that is not after `week`,
# or NA when the manual has no release by then.
#
# manual   - manual code to look up (single string).
# week     - week in which the manual was used (single number).
# releases - data frame with `manual.code` and `release_week` columns;
#            defaults to the global `df2`.
get_clostest_week <- function(manual, week, releases = df2) {
  is_candidate <- releases$manual.code == manual &
    releases$release_week <= week
  manual_weeks <- releases$release_week[which(is_candidate)]
  if (length(manual_weeks) == 0) {
    return(NA_real_)
  }
  # Every candidate is <= week, so the closest one is simply the largest.
  # max() also stays scalar when the same release week is listed twice.
  max(manual_weeks)
}

# Attach the matching release week to every machine record.
df3 <- df1
df3$closest_week <- vapply(
  seq_len(nrow(df3)),
  function(i) get_clostest_week(df3$manual.code[i], df3$week[i]),
  numeric(1)
)

# Left join on (manual code, release week) to pull in the version; records
# without an earlier release keep NA in `version`.
df3 <- merge(
  df3, df2,
  by.x = c("manual.code", "closest_week"),
  by.y = c("manual.code", "release_week"),
  all.x = TRUE
)
(由于对问题的误解,答案已被编辑。)