我试图加入给定列中具有相等值的行的非空值。这是一个例子:
Date x y
2017-06-01 5 NA <- to merge
2017-06-01 NA 8 <- to merge
2017-05-02 55 33
我需要的是将以上内容转换为:
Date x y
2017-06-01 5 8 <- merged lines
2017-05-02 55 33
如何使用列表推导或lambda表达式来完成此操作?
答案 0 :(得分:1)
使用与使用in this answer非常相似的方法,我们可以使用 dplyr
包执行您想要的操作。请注意,我创建了一个与您的非常相似的示例数据集。
library(dplyr)
# generate sample data
dat <- data.frame(grp = c('a','a','b'),
x = c(5, NA, 55),
y = c(NA, 8, 33), stringsAsFactors = FALSE)
dat
# grp x y
# 1 a 5 NA
# 2 a NA 8
# 3 b 55 33
my_fun <- function(x) x[!is.na(x)]
dat %>%
group_by(grp) %>%
summarise_all(funs(my_fun))
# A tibble: 2 × 3
# grp x y
# <chr> <dbl> <dbl>
# 1 a 5 8
# 2 b 55 33
我生成了一些在组的列中具有多个非缺失值的数据。我们可以使用expand.grid
,unique
和complete.cases
来查找非缺失值的所有组合。请注意,这就是我认为您希望问题得到解决的方式,但如果没有更多细节,很难确定。
dat <- structure(list(grp = c("a", "a", "a", "b", "b"), x = c(5, NA,
3, 8, NA), y = c(8, 9, NA, NA, 3)), .Names = c("grp", "x", "y"
), row.names = c(NA, -5L), class = "data.frame")
# grp x y
# 1 a 5 8
# 2 a NA 9
# 3 a 3 NA
# 4 b 8 NA
# 5 b NA 3
do.call('rbind', by(dat, dat$grp, function(d){
new_d <- unique(expand.grid(d, stringsAsFactors = FALSE))
new_d[complete.cases(new_d), ]
}))
# grp x y
# a.1 a 5 8
# a.7 a 3 8
# a.10 a 5 9
# a.16 a 3 9
# b b 8 3
答案 1 :(得分:1)
只要你在每一行中都有相同的数据,并且只缺少其中几个,这应该可行。
# sample data
myData <- data.frame(date = c(Sys.Date(), Sys.Date(), Sys.Date(), Sys.Date()-2),
x = c(5, NA, 5, 55),
y = c(NA, 8, 8, 33), stringsAsFactors = FALSE)
#myData
# date x y
# 2017-12-13 5 NA
# 2017-12-13 NA 8
# 2017-12-13 5 8
# 2017-12-11 55 33
# merge the lines
myData <- aggregate(myData[-1], list(myData$date), FUN = mean, na.rm = TRUE)
#> myData
# Group.1 x y
# 2017-12-11 55 33
# 2017-12-13 5 8
答案 2 :(得分:0)
如果它不适合您的使用案例,请告诉我。我做了一些有趣的事情,并希望它是通过排除令牌来按行分组合并问题的一般解决方案,或者是由用户提供的函数指定的值绑定。如果在独占合并上没有剩余值,则可以给出替换值。应该成功处理NA值作为替换值和data.frame的一部分。
一切都是基础R
#' Token in data.frame?
#'
#' Looks through all elements of data.frame to see if token is there.
#' Will only look in columns type compatible with token
#'
#' @param token The token to look for
#' @param df, data.frame to look in
#' @return Scalar boolean
contains <- function(token, df) {
mask <- class(token) == lapply(df, class)
all(
unlist(
lapply(df[, mask], function(col) {
found <- token %in% col
if(any(found)) return(TRUE)
found
})
)
)
}
#' Matrix Mask
#'
#' Calculate a boolean matrix mask of where
#' token is in data.frame. Token match is TRUE
#' in matrix, everything else FALSE
#'
#' @param token, token to look for
#' @param df, data.frame to look in
#' @return matrix of boolean type
matrix.mask <- function(token, df){
as.matrix(
lapply(df, function(c){
if(is.na(token)) {
sapply(c, is.na) #NA need special handling
} else {
token == c
}
})
)
}
#' Is Equal
#'
#' Test for equality by value and accounts for special handling of NA
#' All value sin lhs and rhs need to be of same type. If one contains any NA, value is FALSE
#'
#' @param lhs symbol to test for value equality against rhs
#' @param rhs symbol to test for value equality agains lhs
#' @return boolean vector or scalar depending on lhs and rhs
is.equal <- function(lhs, rhs) {
if(all(is.na(lhs)) && all(is.na(rhs))) {
return(rep(TRUE, max(length(lhs), length(rhs))))
} else if (any(is.na(lhs)) || any(is.na(rhs)))
{
return(rep(FALSE, max(length(lhs), length(rhs))))
}
else
{
lhs == rhs
}
}
#' Merge rows grouped by index column with exclusion of token
#'
#' Merge rows in a data.frame by excluding token.
#' Rows are considered by grouping as given by factor 'ind'
#' If exclusion of token does not render the group of rows
#' at that particular column value to only have one value
#' left, the vector of values left in that column will be
#' transformed to scalar by means of supplied tie function.
#'
#' @param df data.frame to do row-wise merge in
#' @param ind name or index of column to apply grouping of rows against
#' @param token a value to exclude during merge of rows
#' @param tie a function that takes a vector and returns a scalar
#' @param replace value to substitute with if column becomes efter merge, ie the group
#' rows contained only token
#' @param return merged data.frame
merge.rows = function(df, ind, token, tie, replace) {
#check invariants
col.names <- names(df)
stopifnot(ind %in% col.names | ind <= length(col.names))
if(is.character(ind)) {ind <- which(ind %in% col.names)} #make sure ind is numeric
if(!(is.factor(df[, ind]))) {
ind.type <- lapply(df[ind], class)
df[ind] <- lapply(df[ind], as.factor)
}
ind.name <- col.names[ind]
#fast return
if(
! contains(token, df[, -ind])
) {
return(df)
}
#list for each group with values for each column
merged.outer <-
by(df, as.list(df[ind]), simplify = F, FUN = function(rows) {
#short circuit with same return type
if(nrow(rows) == 1) {return(as.list(rows))}
index.inner <- rows[1, ind]
rows.data <- rows[-ind] #calculate without index factor
rows.mask <- matrix.mask(token, rows.data)
#list with value for each column in index group
merged.inner <-
mapply(function(rs, mask){
keep <- rs[! mask]
if(length(keep) > 1){ #put back index
tie(keep) #tie if many values
} else {
keep
}
}, rows.data, rows.mask, SIMPLIFY = F, USE.NAMES = T)
should.replace <- lapply(merged.inner, is.equal, token)
merged.inner.replaced <-
mapply(function(val, should) {
if(should) {val <- replace} else {val}
}, merged.inner, should.replace)
c(as.list(rows[ind]), merged.inner.replaced)
})
#make df again
df.merged <- Reduce(rbind, lapply(merged.outer, as.data.frame))
df.merged
}
d1 = data.frame(Date = as.Date(c('2017-06-01', '2017-06-01', '2017-05-02'), format='%Y-%m-%d'),
X = c(5, NA, 55),
Y = c(NA, 8, 33))
a1 <- merge.rows(d1, "Date", NA)
# Date X Y
# 1 2017-05-02 55 33
# 2 2017-06-01 5 8
d2 = data.frame(Date = as.Date(c('2017-06-01', '2017-06-01', '2017-05-02'), format='%Y-%m-%d'),
X = c(5, 6, 55),
Y = c(6, 8, 33))
a2 <- merge.rows(d2, "Date", 6)
# Date X Y
# 1 2017-05-02 55 33
# 2 2017-06-01 5 8
#data from bouncyball for better comparison
dat <- structure(list(grp = c("a", "a", "a", "b", "b"),
x = c(5, NA, 3, 8, NA), y = c(8, 9, NA, NA, 3)),
.Names = c("grp", "x", "y"), row.names = c(NA, -5L), class = "data.frame")
#apply insert the mean if more than one value in grouped column
merge.rows(dat, "grp", NA, mean)
# grp x y
# 1 a 4 8.5
# 2 b 8 3.0
#apply insert the max if more than one value in grouped column
merge.rows(dat, "grp", NA, max)
# grp x y
# 1 a 5 9
# 2 b 8 3
dat2 <- structure(list(grp1 = rep(c("a", "b"), each=4),
grp2 = rep(c("c", "d"), times = 2, each=2),
x = c(5, NA, 3, 8, NA, 8, 3, 9), y = c(8, 9, NA, NA, 3, 6, 2, NA)),
.Names = c("grp1", "grp2", "x", "y"), row.names = c(NA, 8L), class = "data.frame")
#merge on group by two columns
merge.rows(dat2, c("grp1", "grp2"), NA, mean, 999)
# grp1 grp2 x y
# 1 a c 5.0 8.5
# 2 a c 5.0 8.5
# 3 b c 8.0 4.5
# 4 b c 8.0 4.5
# 5 a d 5.5 999.0
# 6 a d 5.5 999.0
# 7 b d 6.0 2.0
# 8 b d 6.0 2.0
答案 3 :(得分:0)
对于每个变量和组只有一个非NA值的给定样本数据,这可以通过简洁的&#34;单行&#34;来解决:
library(data.table)
# coerce to class data.table, group by column Date,
# for each of the other columns remove any NA values
setDT(DF)[, lapply(.SD, na.omit), Date]
Date x y
1: 2017-06-01 5 8
2: 2017-05-02 55 33
DF <- structure(list(Date = c("2017-06-01", "2017-06-01", "2017-05-02"
), x = c(5L, NA, 55L), y = c(NA, 8L, 33L)), .Names = c("Date",
"x", "y"), row.names = c(NA, -3L), class = "data.frame")