我的数据如下:
BLOCK1 0 1 Locus_540_Transcript_32_Length_8324_genewise_newlength_8215__CDS__3870__6491 513 C A 0/1:23,12:35:99:262,0,691 19,10:-40.6,-28.8,-78.7:-11.9:6.0
2 3
BLOCK1 0 1 Locus_540_Transcript_32_Length_8324_genewise_newlength_8215__CDS__3870__6491 1095 G A 0/1:35,12:47:99:328,0,1157 30,11:-61.1,-63.4,-134.7:2.2:12.0
3 4
BLOCK1 0 1 Locus_540_Transcript_32_Length_8324_genewise_newlength_8215__CDS__3870__6491 1217 G A 0/1:22,12:34:99:314,0,730 20,10:-68.4,-54.2,-109.0:-14.2:6.0
4 5
BLOCK1 0 1 Locus_540_Transcript_32_Length_8324_genewise_newlength_8215__CDS__3870__6491 1219 A C 0/1:22,12:34:99:308,0,715 20,10:-69.9,-54.2,-107.7:-15.7:6.0
5 6
BLOCK1 0 1 Locus_540_Transcript_32_Length_8324_genewise_newlength_8215__CDS__3870__6491 1721 G C 0/1:15,6:21:99:141,0,464 7,5:-21.8,-18.5,-30.1:-3.3:4.0
6 8
BLOCK2 0 1 Locus_540_Transcript_32_Length_8324_genewise_newlength_8215__CDS__3870__6491 2171 G C 0/1:14,13:27:99:388,0,369 9,5:-35.3,-26.5,-46.7:-8.7:3.0
7 10
BLOCK3 1 0 Locus_540_Transcript_32_Length_8324_genewise_newlength_8215__CDS__3870__6491 3661 G A 0/1:148,55:203:99:1070,0,4008 107,39:-163.0,-160.9,-438.4:-2.1:33.0
8 11
BLOCK3 1 0 Locus_540_Transcript_32_Length_8324_genewise_newlength_8215__CDS__3870__6491 3700 C T 0/1:124,124:249:99:3271,0,3667 117,107:-510.2,-163.3,-565.9:-346.9:4.0
9 12
BLOCK3 1 0 Locus_540_Transcript_32_Length_8324_genewise_newlength_8215__CDS__3870__6491 3754 T C 0/1:140,107:248:99:2786,0,3946 133,101:-436.9,-85.9,-558.8:-351.0:2.0
10
我想要的只是一个R命令,用来计算第2列和第3列(由0和1组成的两列)的各种属性,并且按每个块(第1列)分别计算。例如,上面的数据中BLOCK1有5行,BLOCK2有1行,依此类推。我想知道的一个基本问题是:对于每个块,第2列中有多少个零,第3列中又有多少个零?
有人可以帮忙吗?我尝试过各种形式的aggregate(),但问题是FUN参数似乎不允许我执行上述操作。也许它其实可以做到,只是我没有弄明白......
答案 0 :(得分:2)
您可以使用base R中的aggregate:
# Count the zeros in Col2 and Col3 for each block: comparing against 0 gives a
# logical matrix, and summing logicals within each Col1 group counts the zeros.
is_zero <- df[, c("Col2", "Col3")] == 0
aggregate(is_zero, by = list(Col1 = df[, "Col1"]), FUN = sum)
# Col1 Col2 Col3
#1 BLOCK1 5 0
#2 BLOCK2 1 0
#3 BLOCK3 0 3
或使用data.table
library(data.table)
# setDT() converts df to a data.table by reference; every non-grouping column
# is then reduced to its number of zero entries (!x is TRUE where x is 0).
count_zeros <- function(x) sum(!x)
setDT(df)[, lapply(.SD, count_zeros), by = Col1]
# Col1 Col2 Col3
#1: BLOCK1 5 0
#2: BLOCK2 1 0
#3: BLOCK3 0 3
如果要统计每个块中两列取值组合的出现次数,也许你需要:
# Label every row with its combination of the remaining columns (e.g. "0.1"),
# then cross-tabulate block against combination. Converting the labels to
# character keeps only the combinations that actually occur.
combo <- as.character(interaction(df[, -1]))
as.data.frame.matrix(table(df[, 1], combo, deparse.level = 0))
# 0.1 1.0
#BLOCK1 5 0
#BLOCK2 1 0
#BLOCK3 0 3
如果你只想保留这样的块:块内的每一行都满足(Col2=0 且 Col3=1)或(Col2=1 且 Col3=0),可以这样做:
更改示例数据集以显示某些变化(在当前数据集中,条件将选择所有行)
# Perturb two rows so that the filter below has something to reject.
df$Col3[4] <- 0  # row 4 (BLOCK1) becomes Col2 = 0, Col3 = 0
df$Col2[8] <- 0  # row 8 (BLOCK3) becomes Col2 = 0, Col3 = 1
df$Col3[8] <- 1
# A row qualifies when its (Col2, Col3) pair is (0, 1) or (1, 0). ave() with
# FUN = all spreads the per-block verdict back onto every row of that block,
# so a block is kept only if all of its rows qualify.
row_ok <- (df$Col2 == 0 & df$Col3 == 1) | (df$Col2 == 1 & df$Col3 == 0)
block_ok <- ave(row_ok, df$Col1, FUN = all)
df[block_ok, ]
# Col1 Col2 Col3
#6 BLOCK2 0 1
#7 BLOCK3 1 0
#8 BLOCK3 0 1
#9 BLOCK3 1 0
# Example data: nine rows in three blocks. BLOCK1 (5 rows) and BLOCK2 (1 row)
# carry Col2 = 0 / Col3 = 1; BLOCK3 (3 rows) carries Col2 = 1 / Col3 = 0.
df <- data.frame(
  Col1 = rep(c("BLOCK1", "BLOCK2", "BLOCK3"), times = c(5L, 1L, 3L)),
  Col2 = rep(c(0L, 1L), times = c(6L, 3L)),
  Col3 = rep(c(1L, 0L), times = c(6L, 3L)),
  stringsAsFactors = FALSE
)
答案 1 :(得分:1)
使用dplyr
:
# library() stops with an error when dplyr is not installed; require() would
# only return FALSE and let the pipeline below fail with a less obvious error.
library(dplyr)
# Dummy data: the sample from the question, parsed from an inline string.
# fill = TRUE lets the short continuation lines (e.g. "2 3") be read by
# implicitly adding blank fields to them.
d <- read.table(text = "
BLOCK1 0 1 Locus_540_Transcript_32_Length_8324_genewise_newlength_8215__CDS__3870__6491 513 C A 0/1:23,12:35:99:262,0,691 19,10:-40.6,-28.8,-78.7:-11.9:6.0
2 3
BLOCK1 0 1 Locus_540_Transcript_32_Length_8324_genewise_newlength_8215__CDS__3870__6491 1095 G A 0/1:35,12:47:99:328,0,1157 30,11:-61.1,-63.4,-134.7:2.2:12.0
3 4
BLOCK1 0 1 Locus_540_Transcript_32_Length_8324_genewise_newlength_8215__CDS__3870__6491 1217 G A 0/1:22,12:34:99:314,0,730 20,10:-68.4,-54.2,-109.0:-14.2:6.0
4 5
BLOCK1 0 1 Locus_540_Transcript_32_Length_8324_genewise_newlength_8215__CDS__3870__6491 1219 A C 0/1:22,12:34:99:308,0,715 20,10:-69.9,-54.2,-107.7:-15.7:6.0
5 6
BLOCK1 0 1 Locus_540_Transcript_32_Length_8324_genewise_newlength_8215__CDS__3870__6491 1721 G C 0/1:15,6:21:99:141,0,464 7,5:-21.8,-18.5,-30.1:-3.3:4.0
6 8
BLOCK2 0 1 Locus_540_Transcript_32_Length_8324_genewise_newlength_8215__CDS__3870__6491 2171 G C 0/1:14,13:27:99:388,0,369 9,5:-35.3,-26.5,-46.7:-8.7:3.0
7 10
BLOCK3 1 0 Locus_540_Transcript_32_Length_8324_genewise_newlength_8215__CDS__3870__6491 3661 G A 0/1:148,55:203:99:1070,0,4008 107,39:-163.0,-160.9,-438.4:-2.1:33.0
8 11
BLOCK3 1 0 Locus_540_Transcript_32_Length_8324_genewise_newlength_8215__CDS__3870__6491 3700 C T 0/1:124,124:249:99:3271,0,3667 117,107:-510.2,-163.3,-565.9:-346.9:4.0
9 12
BLOCK3 1 0 Locus_540_Transcript_32_Length_8324_genewise_newlength_8215__CDS__3870__6491 3754 T C 0/1:140,107:248:99:2786,0,3946 133,101:-436.9,-85.9,-558.8:-351.0:2.0
10 ", fill = TRUE)
# Keep only the rows whose first field contains "BLOCK", then count the zeros
# in column 2 within each block.
d %>%
  filter(grepl("BLOCK", V1)) %>%
  group_by(BLOCK = V1) %>%
  summarise(ZeroCountInCol2 = sum(V2 == 0))
# BLOCK ZeroCountInCol2
# 1 BLOCK1 5
# 2 BLOCK2 1
# 3 BLOCK3 0
答案 2 :(得分:0)
如果您的data.frame名称是dataframe:
# For every distinct block label in column 1, count the zeros in columns 2 and
# 3 of that block's rows, ignoring NA. The result is an integer matrix with one
# column per block.
# NOTE: the row names are kept as in the original snippet; despite the names,
# nb0_col1 counts zeros in column 2 and nb0_col2 counts zeros in column 3.
vapply(
  unique(dataframe[, 1]),
  function(block) {
    # Compute the row mask for this block once and reuse it for both columns.
    in_block <- dataframe[, 1] == block
    c(
      nb0_col1 = sum(dataframe[in_block, 2] == 0, na.rm = TRUE),
      nb0_col2 = sum(dataframe[in_block, 3] == 0, na.rm = TRUE)
    )
  },
  FUN.VALUE = c(nb0_col1 = 0L, nb0_col2 = 0L)
)
答案 3 :(得分:0)
上面的解决方案并没有真正给出我想要的结果,不过我想出了如何在bash中实现。我想解析文件,只保留这样的块:块内所有行都是col2=0且col3=1,或者所有行都是col2=1且col3=0;同时我还想统计这些块的数量。以下命令可以做到:
# Step 1 (sed): read the whole file into the pattern space, insert a blank line
# before every line that starts with BLOCK (except the first line of the file),
# and append a trailing newline so the last group is also followed by a blank
# line.
# Step 2 (awk): collect BLOCK lines between blank lines. "type" remembers the
# "$2 $3" pair of the first line of a group; fg is 0 before a group starts,
# 1 while all lines seen so far share that pair, and 2 once a line with a
# different pair was seen. A group is printed, and num incremented, only when
# a blank line is reached with fg == 1. total counts every BLOCK line.
# NOTE(review): each BLOCK line is appended to "block" once unconditionally and
# once more in the matching branch, so retained lines appear twice in the
# output; confirm whether that is intended.
sed ':a;N;$!ba;s/\nBLOCK/\n\nBLOCK/g;s/$/\n/' input.file |
awk '
BEGIN { fg = 0; num = 0; block = ""; type = "" }
{
  if (/^$/ && fg == 1) {
    print block; num++; fg = 0; block = ""
  } else if (/^$/) {
    block = ""; type = ""; fg = 0
  } else if (/^BLOCK/) {
    block = block "\n" $0; total++
    if (fg == 0) {
      type = $2 " " $3; fg = 1; block = block "\n" $0
    } else if (fg == 1 && type == $2 " " $3) {
      block = block "\n" $0
    } else if (fg == 1 && type != $2 " " $3) {
      fg = 2; block = ""
    }
  }
}
END { print "perfect:\t", num, "\tTotal:\t", total }
' > output.file