我试图通过引用列名从R中的data.table中提取特定值
require(data.table)
# Build the example data as a data.frame first, then convert it to a data.table.
cohort = c("cohort1", "cohort2", "cohort3")
year = c(2019, 2018, 2020)
# One item_<year> column per year; each row should pick the column matching its own `year`.
item_2018 = c("alpha", "beta", "gamma")
item_2019 = c("banana", "apples", "oranges")
item_2020 = c("Tim", "Daniel","Simon")
# Expected outcome: for each row, the value found in the item_<year> column named by that row's `year`.
desired_result = c("banana", "beta", "Simon")
# NOTE(review): on R versions before 4.0, data.frame() defaults to stringsAsFactors = TRUE,
# so the character vectors above presumably become factors here - confirm with str(cohorts).
cohorts <- data.frame(cohort,year, item_2018, item_2019, item_2020, desired_result)
setDT(cohorts) # convert the data.frame to a data.table by reference
setkey(cohorts, year) # key on year; this also sorts the rows by year (2018, 2019, 2020). The attempts below do not appear to rely on the key.
# COMPUTE THE NEW COLUMN (attempts - none of them gives the expected result)
# Goal: populate a new column with the same values as desired_result.
# Attempt 1: group by year so paste0() yields a single column name per group, then get() that column.
# Reported result: c("Simon", "Simon", "Simon") instead of the expected values.
# NOTE(review): presumably caused by the item_* columns being factors - TODO confirm.
cohorts[, result_attempt_1 := get(paste0("item_", year)), by = year]
# Attempt 2: the inner .SD[..., by = year] is a grouped query, so it yields a table rather than a plain vector.
# NOTE(review): presumably why the assignment is wrong - confirm.
cohorts[, result_attempt_2 := .SD[, get(paste0("item_", year)), by = year]] # very wrong
# Attempt 3: in the `by` expression, `year` is the whole column, so paste0() builds several names at once.
# NOTE(review): get() expects a single name - confirm how this fails.
cohorts[, result_attempt_3 := .SD[, get(paste0("item_", year)), by = get(paste0("item_", year))]] # very wrong
我希望“desired_result”列中的值最终出现在“result_attempt”列中。我得到的最接近的结果是:每一行都重复了最后一个正确的值。
有什么办法可以实现这一目标吗?非常感谢。
答案 0 :(得分:1)
一个选项是行/列索引
# For every row, pick the value of the item_<year> column via (row, column)
# matrix indexing on the item columns.
# .SDcols selects the item_* columns by name pattern instead of by position
# (previously 3:5), so the lookup keeps working if columns are added or
# reordered.
cohorts[, result := {
  # Year suffix of each item column, e.g. "item_2018" -> "2018".
  item_years <- sub("^item_", "", names(.SD))
  # Two-column index matrix: row number, position of the matching item column.
  # A year without a matching item_* column gives NA for that row.
  as.data.frame(.SD)[cbind(seq_len(.N), match(year, item_years))]
}, .SDcols = patterns("^item_")]
cohorts
# cohort year item_2018 item_2019 item_2020 desired_result result
#1: cohort2 2018 beta apples Daniel beta beta
#2: cohort1 2019 alpha banana Tim banana banana
#3: cohort3 2020 gamma oranges Simon Simon Simon
答案 1 :(得分:1)
这是一个使用 for 循环和 set() 的方案:
# Fill "newcol" one row at a time: for each row, read the value stored in the
# item_<year> column named by that row's year and write it by reference.
n_rows <- nrow(cohorts)
for (row_idx in seq_len(n_rows)) {
  source_col <- paste0("item_", cohorts[["year"]][row_idx])
  picked <- cohorts[[source_col]][row_idx]
  set(cohorts, i = row_idx, j = "newcol", value = picked)
}
> cohorts
cohort year item_2018 item_2019 item_2020 desired_result newcol
1: cohort2 2018 beta apples Daniel beta beta
2: cohort1 2019 alpha banana Tim banana banana
3: cohort3 2020 gamma oranges Simon Simon Simon
答案 2 :(得分:1)
另一个选项是使用 data.table::melt:先匹配 item_<year>,然后查找并通过引用进行更新:
# Step 1: reshape the item_* columns to long form, keeping the source column
# name in `variable` as a character (not a factor).
item_long <- melt(
  cohorts,
  measure.vars = patterns("^item"),
  variable.factor = FALSE
)
# Step 2: per cohort, keep the value whose source column is item_<year>.
# The unnamed result column is called V1.
item_lookup <- item_long[
  , value[variable == paste0("item_", year)],
  by = .(cohort)
]
# Step 3: update join on cohort, writing V1 into cohorts by reference.
# NOTE(review): this overwrites the existing desired_result column; assign to
# a different column name if the expected values should be kept for checking.
cohorts[item_lookup, on = .(cohort), desired_result := V1]
答案 3 :(得分:0)
非常感谢您的有用回复。
似乎我面临的基本问题是data.table中列的数据类型。
似乎“item_xxxx”列被强制转换为因子(factor)类型,而不是字符(character)类型。
如果我们从头开始定义data.table(而不是通过data.frame阶段),那么我最初提供的代码确实可以正常工作。
# library() stops with an error if data.table is not installed, whereas
# require() only returns FALSE and lets the script fail later.
library(data.table)

# Input vectors; the item_* vectors are plain character vectors.
cohort <- c("cohort1", "cohort2", "cohort3")
year <- c(2019, 2018, 2020)
item_2018 <- c("alpha", "beta", "gamma")
item_2019 <- c("banana", "apples", "oranges")
item_2020 <- c("Tim", "Daniel", "Simon")
# Expected outcome: for each row, the value of the item_<year> column named by
# that row's `year`.
desired_result <- c("banana", "beta", "Simon")

# Build the data.table directly instead of going through data.frame(), so the
# strings are not converted to factors on the way; no setDT() call is needed.
cohorts <- data.table(
  cohort, year, item_2018, item_2019, item_2020, desired_result
)
# Inspect the column types: the item_* columns should be character.
str(cohorts)

# Grouping by year makes `year` a single value inside each group, so paste0()
# yields exactly one column name for get() to look up.
# Reported result: c("banana", "beta", "Simon"), as desired.
cohorts[, result_attempt_1 := get(paste0("item_", year)), by = year]
cohorts
所以,这里的要点是:如果代码不起作用,请运行 str(data_name) 检查各列的数据类型。
感谢大家的帮助和支持。