我在 R 中有一个数据框,想把同名(name 相同)的行的各个数值相加,并为每一行记录对应的 fake_name。有没有高效的方法可以做到这一点?我写了一个 for 循环(见下文),但它在大数据集上效率非常低。
提前致谢!
关于这个数据集有两个已知条件:(1)共享同一个名称的站点不超过 2 个;(2)共享名称的两个站点总是位于相邻的两行。我也接受非 R 的解决方案,不过我对其他框架并不熟悉。
初始数据示例:
name, fake_name, value1, value2, value3
siteX, siteX, 4, 2, 0.5
siteX, siteX2, 1, 4, 0.2
siteY, siteY, 2, 1, 0.4
siteZ, siteZ, 8, 3, 0.2
转换为:
name, value1, value2, value3, fake_name, dup_fake_name
siteX, 5, 6, 0.7, siteX, siteX2
siteY, 2, 1, 0.4, siteY, NA
siteZ, 8, 3, 0.2, siteZ, NA
For-loop版本:
# Merge consecutive rows of `data` that share a `name`: sum their value
# columns and record the second row's fake_name in `dup_fake_name`.
# `data` is expected to have the columns name, fake_name, value1..value3
# (in that order); the merged result is left in `d2`.
d2 <- data[, c(1, 3:5, 2)]
d2$dup_fake_name <- NA
value_cols <- c("value1", "value2", "value3")
# Marks the second row of each same-name pair so it can be dropped afterwards.
is_dup <- logical(nrow(d2))
# seq_len() gives an empty sequence for 0- or 1-row input, unlike 1:(n - 1).
for (i in seq_len(max(nrow(d2) - 1L, 0L))) {
  # isTRUE() keeps a missing name from aborting the loop.
  if (isTRUE(d2$name[i] == d2$name[i + 1])) {
    for (col in value_cols) {
      d2[[col]][i] <- d2[[col]][i] + d2[[col]][i + 1]
    }
    # as.character() avoids storing factor codes when fake_name is a factor.
    d2$dup_fake_name[i] <- as.character(d2$fake_name[i + 1])
    is_dup[i + 1] <- TRUE
  }
}
# Keep one row per name, as in the desired output.
d2 <- d2[!is_dup, , drop = FALSE]
答案 0 :(得分:4)
如果您不关心转换后数据中的最后两列,可以直接用 aggregate 对条目进行聚合:
# Example input: rows that share a `name` are the ones to be merged.
df <- data.frame(
  name = c("siteX", "siteX", "siteY", "siteZ"),
  fake_name = c("siteX", "siteX2", "siteY", "siteZ"),
  value1 = c(4L, 1L, 2L, 8L),
  value2 = c(2L, 4L, 1L, 3L),
  value3 = c(0.5, 0.2, 0.4, 0.2),
  stringsAsFactors = FALSE
)
# Sum every value column within each `name` group.
df.agg <- aggregate(cbind(value1, value2, value3) ~ name, data = df, FUN = sum)
#name value1 value2 value3
#1 siteX 5 6 0.7
#2 siteY 2 1 0.4
#3 siteZ 8 3 0.2
添加 ID 列需要多做一些(略显丑陋的)工作:
# Build one fake_name_<k> column per position within a `name` group and
# append those columns to the aggregated table `df.agg`.
# Collect the fake names of each group (as.character() guards against factors).
fake_names <- split(as.character(df$fake_name), df$name)
# The widest group decides how many columns are needed; computed once.
n_cols <- max(lengths(fake_names))
# Indexing past the end of a vector yields NA, which pads the shorter groups.
ID <- do.call(rbind, lapply(fake_names, function(x) x[seq_len(n_cols)]))
ID <- as.data.frame(ID, stringsAsFactors = FALSE)
colnames(ID) <- paste0("fake_name_", seq_len(ncol(ID)))
# Align rows with df.agg by name instead of relying on matching sort order.
ID <- ID[match(df.agg$name, names(fake_names)), , drop = FALSE]
rownames(ID) <- NULL
# Add ID columns to df.agg
df.agg <- cbind.data.frame(df.agg, ID)
df.agg
# name value1 value2 value3 fake_name_1 fake_name_2
#1 siteX 5 6 0.7 siteX siteX2
#2 siteY 2 1 0.4 siteY <NA>
#3 siteZ 8 3 0.2 siteZ <NA>
答案 1 :(得分:3)
以下是使用 data.table 的一种做法。先将 'data.frame' 转换为 'data.table'(`setDT(df1)`),按 'name' 分组,对各个 'value' 列求 `sum`;再用 `dcast` 将 'fake_name' 转换为宽格式,并通过 `on` 按 'name' 列进行连接:
library(data.table)
# Convert df1 to a data.table by reference.
setDT(df1)
# Per-name totals of the value columns.
value_sums <- df1[, lapply(.SD, sum), by = name, .SDcols = value1:value3]
# One row per name, with its fake names spread into fake_name1, fake_name2, ...
fake_wide <- dcast(df1, name ~ paste0("fake_name", rowid(name)),
                   value.var = 'fake_name')
# Attach the totals to the wide table, matching rows on name.
value_sums[fake_wide, on = .(name)]
# name value1 value2 value3 fake_name1 fake_name2
#1: siteX 5 6 0.7 siteX siteX2
#2: siteY 2 1 0.4 siteY NA
#3: siteZ 8 3 0.2 siteZ NA
答案 2 :(得分:0)
如果需要,可以拆分 fake_name 列,但这里有一个版本:
library(tidyverse)
# Example data: siteX appears twice under two different fake names.
df <- data.frame(
  name = c("siteX", "siteX", "siteY", "siteZ"),
  fake_name = c("siteX", "siteX2", "siteY", "siteZ"),
  value1 = c(4, 1, 2, 8),
  value2 = c(2, 4, 1, 3),
  value3 = c(0.5, 0.2, 0.4, 0.2)
)
# One row per name: value columns are summed and the group's fake names are
# joined into a single comma-separated string by toString().
# No mutate() step is needed first: summarise() keeps only the grouping
# column and the columns it defines, so helper columns would be discarded.
df %>%
  group_by(name) %>%
  summarise(
    value1 = sum(value1),
    value2 = sum(value2),
    value3 = sum(value3),
    fake_name = toString(fake_name)
  )
(上面的代码用 toString() 把同名行的 fake_name 合并成一个以逗号分隔的字符串;如果需要,可以再把该列拆分成多列。)
compiler.plugin('done',callback)