R:返回每列第一个和最后一个非NA值所对应的年份

时间:2020-05-19 18:55:57

标签: r dplyr

我有一个看起来像这样的数据框:

# A tibble: 9 x 5
# Groups:   group [3]
      group   year    value1  value2  value3
      <int>   <dbl>   <int>   <int>   <int>
1     1       2000    NA      3       4
2     1       2001    8       3       4
3     1       2002    4       3       NA
4     2       2000    NA      NA      1
5     2       2001    9       NA      1
6     2       2002    1       NA      NA
7     3       2000    NA      5       NA
8     3       2001    9       5       NA
9     3       2002    NA      5       NA

我需要一个脚本,该脚本返回每列的第一个和最后一个非na值的年份,而与组无关。理想情况下,输出应如下所示。注意实际的数据集会更大。

          start   end
value 1   2001    2002
value 2   2000    2002
value 3   2000    2001

3 个答案:

答案 0 :(得分:2)

我们可以重塑为'long'格式,然后按'name'进行分组,并使用 summarise 获取'year'的 min 和 max:

library(dplyr)
library(tidyr)
library(tibble)
df1 %>%
  select(-group) %>%
  pivot_longer(cols = starts_with('value'), values_drop_na = TRUE) %>%
  group_by(name) %>%
  summarise(start = min(year), end = max(year)) %>%
  column_to_rownames('name')
#       start  end
#value1  2001 2002
#value2  2000 2002
#value3  2000 2001

或者使用 data.table 中的 melt:

library(data.table)
melt(setDT(df1), id.var = c('year', 'group'), na.rm = TRUE)[, .(start = min(year), end = max(year)), .(variable)]

或者我们也可以使用 summarise_at:

数据

# For every column whose name starts with "value", compute the range
# (min, max) of `year` over the rows where that column is not NA, then
# reshape to long form: two rows per column (first the min, then the max).
#
# The tibble shown in the question is grouped by `group`; without
# `ungroup()` the summary would be computed once per group and `group`
# would be swept into the `everything()` selections further down.
# `ungroup()` is a no-op on data that is not grouped.
#
# NOTE(review): a column that is NA in every row makes `range()` operate on
# an empty vector (warning, Inf / -Inf) -- filter such columns beforehand if
# they can occur.
df1 %>%
  ungroup() %>%
  summarise_at(
    vars(starts_with("value")),
    ~ list(range(year[!is.na(.)]))
  ) %>%
  unnest(everything()) %>%
  pivot_longer(everything())

答案 1 :(得分:2)

一个 base R 的解决方案:对 value1 到 value3 的每一列,找出其非 NA 值所对应的 year

# Base R: for each `value*` column of `df1`, report the earliest (`start`)
# and latest (`end`) `year` among the rows where the column is not NA.
# The result is a data frame with one row per value column (row names are
# the column names) and numeric columns `start` and `end`.
#
# The years are compared by value rather than by row position: the rows are
# ordered by group first, so the first / last non-NA *row* does not
# necessarily hold the smallest / largest year.
value_cols <- names(df1)[startsWith(names(df1), "value")]

year_span <- vapply(
  value_cols,
  function(col) {
    # `[[` extracts a plain vector for both data.frames and tibbles.
    observed <- df1$year[!is.na(df1[[col]])]
    if (length(observed) == 0L) {
      # Column is NA everywhere: no year to report.
      return(c(start = NA_real_, end = NA_real_))
    }
    c(start = min(observed), end = max(observed))
  },
  c(start = 0, end = 0)
)

# `vapply` yields a 2 x n matrix (rows start/end); transpose to n x 2.
data.frame(t(year_span))

答案 2 :(得分:1)

另一个data.table选项。不确定是否建议使用names(.SD),但扩展性应该很好

library(data.table)

# Convert `df1` to a data.table by reference, then build one row per
# `value*` column: the column name (`val`) and the smallest / largest `year`
# among the rows where that column is not NA. Because `lapply` is used,
# `start` and `end` come back as list columns.
setDT(df1)
df1[
  ,
  list(
    val   = names(.SD),
    start = lapply(.SD, function(col) min(year[!is.na(col)])),
    end   = lapply(.SD, function(col) max(year[!is.na(col)]))
  ),
  .SDcols = startsWith(names(df1), "value")
]

      val start  end
1: value1  2001 2002
2: value2  2000 2002
3: value3  2000 2001