我把下面的数据框保存成了 .csv 文件,它是我之前代码的输出。VALUE1_CUM 是每个 SAMPLE 内 VALUE1 的累积和。该文件也可以在这里下载:https://dl.dropboxusercontent.com/u/16277659/SAMPLE_rs.csv
SAMPLE; ID; REFERENCE_YEAR; VALUE1; VALUE1_CUM
SAMPLE1; 112; 1956; 0; 0
SAMPLE1; 112; 1957; 200; 200
SAMPLE1; 112; 1958; NA; NA
SAMPLE1; 112; 1959; NA; NA
SAMPLE1; 112; 1960; NA; NA
SAMPLE1; 112; 1963; NA; NA
SAMPLE2; 137; 1878; 0; 0
SAMPLE2; 137; 1879; -7900; -7900
SAMPLE2; 137; 1880; NA; NA
SAMPLE2; 137; 1881; NA; NA
SAMPLE2; 137; 1882; NA; NA
SAMPLE2; 137; 1890; NA; NA
SAMPLE2; 137; 1891; NA; NA
SAMPLE2; 137; 1892; -4300; -12200
SAMPLE2; 137; 1893; NA; NA
SAMPLE2; 137; 1894; NA; NA
SAMPLE3; 136; 1971; 0; 0
SAMPLE3; 136; 1972; NA; NA
SAMPLE3; 136; 1973; NA; NA
SAMPLE3; 136; 1974; NA; NA
SAMPLE3; 136; 1975; NA; NA
SAMPLE3; 136; 1976; NA; NA
SAMPLE3; 136; 1980; NA; NA
SAMPLE4; 0; 2000; 0; 0
SAMPLE4; 0; 2001; NA; NA
SAMPLE4; 0; 2002; NA; NA
SAMPLE4; 0; 2003; NA; NA
SAMPLE4; 0; 2004; 500; 500
SAMPLE4; 0; 2005; NA; NA
SAMPLE4; 0; 2006; NA; NA
为了进一步处理数据,我需要删除符合“VALUE1 以 0 开头、其后全部为 NA”这一模式的整个样本的所有行(在此示例中应删除 SAMPLE3)。如果某个 SAMPLE 的 NA 值之间还夹有数值(如 SAMPLE2),则应保留该样本的所有行。有谁知道该怎么做吗?期望的结果如下:
SAMPLE; ID; REFERENCE_YEAR; VALUE1; VALUE1_CUM
SAMPLE1; 112; 1956; 0; 0
SAMPLE1; 112; 1957; 200; 200
SAMPLE1; 112; 1958; NA; NA
SAMPLE1; 112; 1959; NA; NA
SAMPLE1; 112; 1960; NA; NA
SAMPLE1; 112; 1963; NA; NA
SAMPLE2; 137; 1878; 0; 0
SAMPLE2; 137; 1879; -7900; -7900
SAMPLE2; 137; 1880; NA; NA
SAMPLE2; 137; 1881; NA; NA
SAMPLE2; 137; 1882; NA; NA
SAMPLE2; 137; 1890; NA; NA
SAMPLE2; 137; 1891; NA; NA
SAMPLE2; 137; 1892; -4300; -12200
SAMPLE2; 137; 1893; NA; NA
SAMPLE2; 137; 1894; NA; NA
SAMPLE4; 0; 2000; 0; 0
SAMPLE4; 0; 2001; NA; NA
SAMPLE4; 0; 2002; NA; NA
SAMPLE4; 0; 2003; NA; NA
SAMPLE4; 0; 2004; 500; 500
SAMPLE4; 0; 2005; NA; NA
SAMPLE4; 0; 2006; NA; NA
答案 0 :(得分:1)
这是一个 data.table 解决方案。
您的数据集:
# Example data set from the question: four samples with yearly VALUE1
# readings and their running total VALUE1_CUM.
# VALUE1_CUM is stored as a plain numeric column: the previous hand-built
# factor listed the level " 0" twice (duplicated levels are invalid) and
# represented missing values as the text level " NA" instead of a real NA.
df <- data.frame(
  SAMPLE = factor(
    rep(
      c("SAMPLE1", "SAMPLE2", "SAMPLE3", "SAMPLE4"),
      times = c(6L, 10L, 7L, 7L)
    ),
    levels = c("SAMPLE1", "SAMPLE2", "SAMPLE3", "SAMPLE4")
  ),
  ID = rep(c(112L, 137L, 136L, 0L), times = c(6L, 10L, 7L, 7L)),
  REFERENCE_YEAR = c(
    1956L, 1957L, 1958L, 1959L, 1960L, 1963L,
    1878L, 1879L, 1880L, 1881L, 1882L, 1890L, 1891L, 1892L, 1893L, 1894L,
    1971L, 1972L, 1973L, 1974L, 1975L, 1976L, 1980L,
    2000L, 2001L, 2002L, 2003L, 2004L, 2005L, 2006L
  ),
  VALUE1 = c(
    0, 200, NA, NA, NA, NA,
    0, -7900, NA, NA, NA, NA, NA, -4300, NA, NA,
    0, NA, NA, NA, NA, NA, NA,
    0, NA, NA, NA, 500, NA, NA
  ),
  VALUE1_CUM = c(
    0, 200, NA, NA, NA, NA,
    0, -7900, NA, NA, NA, NA, NA, -12200, NA, NA,
    0, NA, NA, NA, NA, NA, NA,
    0, NA, NA, NA, 500, NA, NA
  )
)
代码
library(data.table)
# Keep each SAMPLE group unless its VALUE1 is a leading 0 followed only by
# NA values. isTRUE() plus && collapses the test to a single TRUE/FALSE, so
# a missing first value cannot turn the condition into NA (subsetting .SD
# with a lone NA would otherwise produce a spurious all-NA row). Returning
# .SD (or nothing) per group keeps or drops the group as a whole.
as.data.table(df)[
  , if (!(isTRUE(VALUE1[1L] == 0) && all(is.na(VALUE1[-1L])))) .SD,
  by = SAMPLE
]
结果
# SAMPLE ID REFERENCE_YEAR VALUE1 VALUE1_CUM
# 1: SAMPLE1 112 1956 0 0
# 2: SAMPLE1 112 1957 200 200
# 3: SAMPLE1 112 1958 NA NA
# 4: SAMPLE1 112 1959 NA NA
# 5: SAMPLE1 112 1960 NA NA
# 6: SAMPLE1 112 1963 NA NA
# 7: SAMPLE2 137 1878 0 0
# 8: SAMPLE2 137 1879 -7900 -7900
# 9: SAMPLE2 137 1880 NA NA
#10: SAMPLE2 137 1881 NA NA
#11: SAMPLE2 137 1882 NA NA
#12: SAMPLE2 137 1890 NA NA
#13: SAMPLE2 137 1891 NA NA
#14: SAMPLE2 137 1892 -4300 -12200
#15: SAMPLE2 137 1893 NA NA
#16: SAMPLE2 137 1894 NA NA
#17: SAMPLE4 0 2000 0 0
#18: SAMPLE4 0 2001 NA NA
#19: SAMPLE4 0 2002 NA NA
#20: SAMPLE4 0 2003 NA NA
#21: SAMPLE4 0 2004 500 500
#22: SAMPLE4 0 2005 NA NA
#23: SAMPLE4 0 2006 NA NA
#SAMPLE ID REFERENCE_YEAR VALUE1 VALUE1_CUM
答案 1 :(得分:0)
得到下面的输出之后,您可以用多种方法轻松地对数据集取子集。我们先在向量的第 1 到 n-1 个位置(即 `1:(n-1)`)中找出值为 0 的位置,然后检查紧随其后的位置(0 的位置 + 1)是否为 NA。(为了便于读取数据,我在表头中手动添加了一些 `;`。)
# For every SAMPLE, report whether some 0 in VALUE1 (searching all positions
# except the last) is immediately followed by an NA.
by(
  data = df,
  INDICES = df$SAMPLE,
  FUN = function(grp) {
    values <- grp$VALUE1
    # Positions of zeros among the first n - 1 entries.
    zero_pos <- which(values[1:(length(values) - 1)] == 0)
    # TRUE when the entry right after any of those zeros is missing.
    any(is.na(values[zero_pos + 1]))
  }
)
df$SAMPLE: SAMPLE1
[1] FALSE
----------------------------------------------------------------------------
df$SAMPLE: SAMPLE2
[1] FALSE
----------------------------------------------------------------------------
df$SAMPLE: SAMPLE3
[1] TRUE
# Semicolon-separated copy of the example data (SAMPLE1 to SAMPLE3), kept as
# one text block so it can be parsed without an external file.
x <- "SAMPLE; ID; REFERENCE_YEAR; VALUE1; VALUE1_CUM
SAMPLE1; 112; 1956; 0; 0
SAMPLE1; 112; 1957; 200; 200
SAMPLE1; 112; 1958; NA; NA
SAMPLE1; 112; 1959; NA; NA
SAMPLE1; 112; 1960; NA; NA
SAMPLE1; 112; 1963; NA; NA
SAMPLE2; 137; 1878; 0; 0
SAMPLE2; 137; 1879; -7900; -7900
SAMPLE2; 137; 1880; NA; NA
SAMPLE2; 137; 1881; NA; NA
SAMPLE2; 137; 1882; NA; NA
SAMPLE2; 137; 1890; NA; NA
SAMPLE2; 137; 1891; NA; NA
SAMPLE2; 137; 1892; -4300; -12200
SAMPLE2; 137; 1893; NA; NA
SAMPLE2; 137; 1894; NA; NA
SAMPLE3; 136; 1971; 0; -500
SAMPLE3; 136; 1972; NA; NA
SAMPLE3; 136; 1973; NA; NA
SAMPLE3; 136; 1974; NA; NA
SAMPLE3; 136; 1975; NA; NA
SAMPLE3; 136; 1976; NA; NA
SAMPLE3; 136; 1980; NA; NA"
# Parse the text: the first column is read as character, the remaining four
# columns as numeric.
df <- read.table(
  text = x,
  sep = ";",
  header = TRUE,
  colClasses = c("character", rep("numeric", 4))
)
编辑:加入 SAMPLE4 数据后的输出:
df$SAMPLE: SAMPLE1
[1] FALSE
-----------------------------------------------------------------------------------------------
df$SAMPLE: SAMPLE2
[1] FALSE
-----------------------------------------------------------------------------------------------
df$SAMPLE: SAMPLE3
[1] TRUE
-----------------------------------------------------------------------------------------------
df$SAMPLE: SAMPLE4
[1] TRUE
答案 2 :(得分:0)
以下是 dplyr 解决方案,使用 @David 提供的 dput() 数据:
library(dplyr)
# Drop every SAMPLE whose VALUE1 is a leading 0 followed only by NA values;
# all other samples are kept in full.
df %>%
  group_by(SAMPLE) %>%
  # isTRUE() and && give exactly one TRUE/FALSE per group, so a missing
  # first value cannot make the condition NA (filter() discards rows whose
  # condition evaluates to NA, which would silently drop such a group).
  filter(!(isTRUE(VALUE1[1L] == 0) && all(is.na(VALUE1[-1L])))) %>%
  # Return an ungrouped result so later verbs are not applied per SAMPLE.
  ungroup()
获得
SAMPLE ID REFERENCE_YEAR VALUE1 VALUE1_CUM
1 SAMPLE1 112 1956 0 0
2 SAMPLE1 112 1957 200 200
3 SAMPLE1 112 1958 NA NA
4 SAMPLE1 112 1959 NA NA
5 SAMPLE1 112 1960 NA NA
6 SAMPLE1 112 1963 NA NA
7 SAMPLE2 137 1878 0 0
8 SAMPLE2 137 1879 -7900 -7900
9 SAMPLE2 137 1880 NA NA
10 SAMPLE2 137 1881 NA NA
11 SAMPLE2 137 1882 NA NA
12 SAMPLE2 137 1890 NA NA
13 SAMPLE2 137 1891 NA NA
14 SAMPLE2 137 1892 -4300 -12200
15 SAMPLE2 137 1893 NA NA
16 SAMPLE2 137 1894 NA NA
17 SAMPLE4 0 2000 0 0
18 SAMPLE4 0 2001 NA NA
19 SAMPLE4 0 2002 NA NA
20 SAMPLE4 0 2003 NA NA
21 SAMPLE4 0 2004 500 500
22 SAMPLE4 0 2005 NA NA
23 SAMPLE4 0 2006 NA NA