我查看了下面发布的相关问题:“How can I calculate the slope of multiple subsets of a data frame more efficiently?”我的初学者状态不允许我直接评论该线程(不知道该怎么做),所以我在这里问:
如何使用dplyr& broom包解决方案避免数据集中的NA处理数据框中多个类别的斜率计算?以下是脚本和结果的示例?
示例数据:
DOY<-c(102,102,102,102,102,102,102,102,102,102,212,212,212,212,212,212, 212,212,212,212)
LOCATION <- c(1,1,1,1,1,2,2,2,2,2,1,1,1,1,1,3,3,3,3,3)
response <-c(NA,NA,NA,NA,NA,7,10,15,20,30,2,4,6,NA,8,10,15,20,30,NA)
ts <- c(0,10,20,30, 40,0,10,20,30,40,0,10,20,30,40,0,10,20,30,40)
test.data <- data.frame(cbind(DOY, LOCATION, response, ts))
library(dplyr)
library(broom)
test.data2 <- test.data %>% group_by(DOY) %>% do(tidy(lm(response ~ ts, data = .)))
test.data2 %>% filter(term == "ts")
一个条件工作的结果(因为没有NA的每行有足够的数据):
# A tibble: 2 x 6
# Groups: DOY [2]
# DOY term estimate std.error statistic p.value
# <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
# 1 102. ts 0.560 0.0721 7.77 0.00444
# 2 212. ts 0.278 0.247 1.13 0.303
但是如果将多个类别用于组,则不是:
test.dataX <- test.data %>% group_by(LOCATION, DOY) %>% do(tidy(lm(response ~ ts, data = .)))
错误结果:
# Error in lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
# 0 (non-NA) cases
test.dataX %>% filter(term == "ts")
# Error in eval(lhs, parent, parent) : object 'test.dataX' not found
ATTEMPT 2:我在lm()中尝试过na.omit,但这也行不通:
test.dataX <- test.data %>% group_by(LOCATION, DOY) %>% do(tidy(lm(response ~ ts, data = ., na.action=na.omit)))
# Error in lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
# 0 (non-NA) cases
理想的是我想要像这样(如果可能的话,还有R2一起 - 如何将其添加到上面的输出中)?
# DOY LOCATION slope R2
# 102 1 NA NA
# 102 2 0.560 0.953
# 212 1 0.149 0.966
# 212 3 0.650 0.966
########################
请建议。谢谢!
答案 0 :(得分:0)
如果我们想要返回NA,请使用possibly
library(tidyverse)
library(broom)
pos1 <- possibly(lm, otherwise = NULL)
prsq <- possibly(pull, otherwise = NA)
test.data %>%
group_by(DOY, LOCATION) %>%
nest %>%
mutate(model = map(data, ~ pos1(response~ ts, data = .x)),
slope = map_dbl(model, ~
.x %>%
tidy %>%
filter(term == 'ts') %>%
prsq(estimate)),
R2 = map_dbl(model, ~
.x %>%
glance %>%
prsq(r.squared))) %>%
select(-data, -model)
# A tibble: 4 x 4
# DOY LOCATION slope R2
# <dbl> <dbl> <dbl> <dbl>
#1 102 1 NA NA
#2 102 2 0.56 0.953
#3 212 1 0.149 0.966
#4 212 3 0.650 0.966