# Example input: six rows, one per ID. Every column is character. The
# `2005`..`2016` columns hold either NA or one or more comma-separated values
# for that year; `Cut off Year` holds the year used to split each row.
df <- structure(
  list(
    ID = c("1", "2", "3", "4", "5", "6"),
    `ID without mask` = rep(NA_character_, 6L),
    `Other Years` = c(
      "2011", "2015", "2015",
      "2006, 2006, 2005, 2005, 2007",
      "2014, 2011", "2007"
    ),
    `Cut off Year` = c("2011", "2015", "2015", "2005", "2011", "2007"),
    `2005` = c(NA, NA, NA, "30", "18", NA),
    `2006` = rep(NA_character_, 6L),
    `2007` = c("15", NA, "18", NA, "30, 18", NA),
    `2008` = c("16", NA, NA, "30, 27", "18, 30", NA),
    `2009` = c("15", NA, NA, "20", "30, 18", NA),
    `2010` = c(NA, NA, NA, "30, 20", NA, NA),
    `2011` = rep(NA_character_, 6L),
    `2012` = c(NA, NA, NA, "20, 30", NA, "26"),
    `2013` = c("15", NA, "19", NA, NA, NA),
    `2014` = c(NA, NA, "18", NA, NA, NA),
    `2015` = c(NA, NA, "18", NA, "18", NA),
    `2016` = rep(NA_character_, 6L)
  ),
  row.names = c(NA, 6L),
  class = "data.frame"
)
给定上述数据框,我希望R跨各年份列统计每一行中的元素个数,并将结果输出到新列中。
基于“Cut off Year”(截止年份)列,我希望R将截止年份之前的元素计数汇总到一列中,并将截止年份及之后(包含截止年份)的元素计数汇总到第二列中。
例如,第一行的截止年份为2011年,其2007、2008和2009年的值分别为15、16、15,共3个元素,因此R应在“之前”列中输出数字3。2011年及之后,只有2013年有条目,因此“之后”列的值为1。
诸如“30, 27”之类的元素计为两个,依此类推。
这是所需的输出:
# Desired result: the input columns unchanged, plus two character columns,
# `Before cut` and `After cut`. The column names come from the list itself, so
# they match the input data frame; the earlier `.Names` override renamed
# columns 2 and 3 to names the input does not have. The value is bound to
# `expected` so it can be compared against; the outer parentheses keep it
# printing as before.
(expected <- structure(
  list(
    ID = c("1", "2", "3", "4", "5", "6"),
    `ID without mask` = rep(NA_character_, 6L),
    `Other Years` = c(
      "2011", "2015", "2015",
      "2006, 2006, 2005, 2005, 2007",
      "2014, 2011", "2007"
    ),
    `Cut off Year` = c("2011", "2015", "2015", "2005", "2011", "2007"),
    `2005` = c(NA, NA, NA, "30", "18", NA),
    `2006` = rep(NA_character_, 6L),
    `2007` = c("15", NA, "18", NA, "30, 18", NA),
    `2008` = c("16", NA, NA, "30, 27", "18, 30", NA),
    `2009` = c("15", NA, NA, "20", "30, 18", NA),
    `2010` = c(NA, NA, NA, "30, 20", NA, NA),
    `2011` = rep(NA_character_, 6L),
    `2012` = c(NA, NA, NA, "20, 30", NA, "26"),
    `2013` = c("15", NA, "19", NA, NA, NA),
    `2014` = c(NA, NA, "18", NA, NA, NA),
    `2015` = c(NA, NA, "18", NA, "18", NA),
    `2016` = rep(NA_character_, 6L),
    `Before cut` = c("3", "0", "3", "0", "7", "0"),
    `After cut` = c("1", "0", "1", "8", "1", "1")
  ),
  row.names = c(NA, 6L),
  class = "data.frame"
))
答案 0 :(得分:1)
我们先将数据转换为长格式,对值进行计数,然后与原始数据框重新连接:
library(tidyr)
library(dplyr)
library(stringr)
# For each ID, count the comma-separated values in the year columns, split
# into the years before the row's cut-off year (`Before`) and the years from
# the cut-off year onward (`After`).
df %>%
  # Long format: one row per (ID, year) pair. pivot_longer() replaces the
  # superseded gather().
  pivot_longer(
    cols = `2005`:`2016`,
    names_to = "Year",
    values_to = "value"
  ) %>%
  mutate(
    # Compare years as numbers rather than as strings.
    Year = as.integer(Year),
    cut_off = as.integer(`Cut off Year`),
    # Values in a cell = number of commas + 1. NA cells yield NA and are
    # dropped by na.rm = TRUE below.
    val_count = str_count(value, pattern = ",") + 1
  ) %>%
  group_by(ID) %>%
  summarize(
    # Multiplying by the logical mask keeps only the counts on that side of
    # the cut-off year.
    Before = sum(val_count * (Year < cut_off), na.rm = TRUE),
    After = sum(val_count * (Year >= cut_off), na.rm = TRUE)
  ) %>%
  # Re-attach the original columns; the explicit key avoids an implicit
  # natural join.
  right_join(df, by = "ID") %>%
  # The first three columns are ID, Before and After.
  select(1:3)
# Output (no "Joining, by" message, because the join key is explicit):
# A tibble: 6 x 3
ID Before After
<chr> <dbl> <dbl>
1 1 3 1
2 2 0 0
3 3 3 1
4 4 0 8
5 5 7 1
6 6 0 1
我使用`select(1:3)`仅显示结果的相关部分——省略该行即可保留所有其他列。要得到每个单元格中值的个数,只需将逗号的个数加1。