我需要对某个项目中一些生态样本的α多样性(样本内多样性)计算输出表进行一些分析。
由于原始数据表太大,我只在这里显示部分数据,分别对应于下面脚本中提到的chao1和par数据框:
list(sequence.depth = c(10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
217, 217, 217, 217, 217, 217, 217, 217, 217, 217, 424, 424, 424,
424, 424, 424, 424, 424, 424, 424), iteration = c(0, 1, 2, 3,
4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4,
5, 6, 7, 8, 9), sample1 = c(13, 55, 55, 23, 13, 55, 23, 29, 55,
23, 623.5, 440, 545.7142857, 353, 515.4615385, 767.8, 602.2,
354.3333333, 349.25, 508.7692308, 782.7142857, 648.3571429, 555.0344828,
662.4166667, 651.037037, 491.0285714, 621.2857143, 600.7, 536.5714286,
683.5517241), sample2 = c(23, 23, 55, 55, 55, 16, 23, 55, 13,
55, 603, 408.5, 422.0769231, 328.4, 304.5, 379.2142857, 375,
469, 483.1111111, 384.75, 807.8333333, 793, 545.75, 542.8, 599.1153846,
555.1363636, 688, 655.5652174, 491.2758621, 673.1363636), sample3 = c(55,
55, 55, 55, 55, 55, 55, 55, 55, 55, 942.6470588, 763.1818182,
757.7142857, 817.5652174, 951.1176471, 622.3846154, 1065.75,
1436.1, 771.4545455, 1240.428571, 1091.622222, 1305.571429, 1029.6,
1349, 1282.357143, 1120.139535, 1362.153846, 1324.447368, 1151.923077,
990.3958333), sample4 = c(55, 55, 55, 23, 55, 55, 55, 23, 55,
55, 1134.4, 1245.2, 1233.4, 1112.066667, 680.2916667, 910.25,
1216.071429, 1168.5, 853.4736842, 1180.5625, 1142.130435, 1438.157895,
1074.222222, 1223.976744, 1062.163265, 1346.25, 1245.434783,
1330.073171, 1134.022222, 1395.833333), sample5 = c(55, 55, 55,
55, 55, 55, 29, 55, 55, 55, 711.55, 713.8947368, 656.3157895,
695.8947368, 679.1052632, 880.0588235, 812.5555556, 959.5714286,
551.4347826, 619.6363636, 895.5227273, 876.9772727, 884.2790698,
895.8139535, 840.7659574, 882.1363636, 922.0238095, 894.5227273,
859, 895.8139535), sample6 = c(55, 55, 55, 55, 55, 55, 55, 55,
55, 55, 508.6071429, 852.0588235, 867, 773.0526316, 786.5, 1064.2,
699.6363636, 892.1764706, 702.25, 948.8333333, 1139.4, 892.5294118,
1331.684211, 1068, 1297.676471, 1024.27907, 1096.142857, 1148.227273,
1031.046512, 935.5957447), sample7 = c(23, 13, 9, 29, 55, 8,
29, 22, 8.5, 7, 241.6, 576.5, 140.3333333, 412.5, 370, 192.5454545,
123.1111111, 263, 164.3333333, 223.5, 326.5, 442.3333333, 297.6875,
448.2727273, 345.5, 640.3333333, 439.1, 359.6, 356.0714286, 292.8333333
), sample8 = c(22, 23, 23, 29, 23, 23, 55, 23, 29, 23, 582.0909091,
333.5263158, 657, 337.8666667, 372.0714286, 470.5, 594.3, 553.0909091,
368.55, 544.8, 689.3571429, 626.24, 941.5, 649.9642857, 651.5,
632.4347826, 653.7777778, 767.7727273, 674.3913043, 713.7727273
), sample9 = c(55, 23, 23, 55, 23, 23, 55, 55, 55, 55, 368, 445.1666667,
669.0909091, 339.6521739, 335.3, 635.4615385, 382.1578947, 538.6666667,
425.3684211, 459.3125, 583.3636364, 624.1935484, 590.0909091,
600, 731.2222222, 677.2758621, 619.1818182, 592, 747.7307692,
619.5), sample10 = c(55, 23, 55, 55, 55, 23, 55, 13, 23, 55,
1184.153846, 820.5, 971.0769231, 793.6470588, 1051.4, 811.5,
540.64, 705.5555556, 880.0588235, 906.0666667, 1263.6, 956.2380952,
1036.108108, 1013.029412, 1228.794118, 1005.263158, 937.4186047,
1104.333333, 898.1162791, 1071.135135), sample11 = c(23, 23,
55, 55, 23, 55, 55, 55, 55, 55, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA))
structure(list(plate = structure(c(2L, 2L, 5L, 5L, 5L, 5L, 4L,
2L, 3L, 1L, 1L), .Label = c("Plate1", "Plate2", "Plate3", "Plate4",
"Plate5"), class = "factor"), sequence_run = structure(c(8L,
10L, 6L, 5L, 4L, 3L, 2L, 9L, 11L, 1L, 7L), .Label = c("Run1",
"Run10", "Run11", "Run12", "Run13", "Run14", "Run2", "Run4",
"Run5", "Run6", "Run7"), class = "factor"), type = structure(c(2L,
2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("flower",
"leaf"), class = "factor"), environment = structure(c(1L, 1L,
2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L), .Label = c("field1", "field2",
"field3"), class = "factor"), sample = structure(c(1L, 4L, 5L,
6L, 7L, 8L, 9L, 10L, 11L, 2L, 3L), .Label = c("sample1", "sample10",
"sample11", "sample2", "sample3", "sample4", "sample5", "sample6",
"sample7", "sample8", "sample9"), class = "factor")), .Names = c("plate",
"sequence_run", "type", "environment", "sample"), class = "data.frame", row.names = c(NA,
-11L))
我想要做的分析是将alpha多样性的变异按变量plate,sequence_run,sample_type(即数据中的type列)和environment进行分解,我计划在R中使用基于置换(permutation)的多元回归。
我原来的问题:我不知道应该如何为R准备这张数据表以便进行分析。所以我的问题是,我应该如何组织这个输出表以便在R中进行分析?有没有办法在'aovp'函数中处理NA?
请注意,在第N个序列深度处从0到9的迭代,是指从该样本中随机抽取N条序列,由程序重复计算10次α多样性。如果该样本中没有足够的序列,则程序生成“NA”。
感谢您的帮助!
更新:谢谢,格雷。当我第一次提出这个问题时,我毫无头绪。之后我尝试用下面的脚本把数据整理成一个非常长的数据框,每一行对应某个样本在某一序列深度,某一次迭代下的alpha多样性度量,并附带其他四个参数(sequence_run,plate,type和environment)。但分析无法完成,因为数据量太大了。所以我怀疑这是否是为我计划的分析组织数据的正确方法:
library(lmPerm)
library(reshape2)
library(plyr)
# Reshape the wide Chao1 table (one column per sample) into a long data frame
# with one row per sample x sequencing depth x iteration, attach the
# per-sample metadata from `par`, and fit a permutation ANOVA with aovp().

# Both files are chosen interactively: first the tab-separated diversity
# table, then the whitespace-separated sample metadata table.
chao1 <- read.table(file.choose(), header = TRUE, sep = '\t')
par <- read.table(file.choose(), header = TRUE)

# Columns 1-2 of chao1 are the identifiers (sequence.depth, iteration); every
# remaining column is one sample. As in the original script, sample columns
# are paired with the rows of `par` by position, so the counts must agree.
id_cols <- colnames(chao1)[1:2]
sample_cols <- seq_len(ncol(chao1))[-(1:2)]
stopifnot(length(sample_cols) == nrow(par))

# Label each sample column with the index of its metadata row. This replaces
# the paste0()/strsplit() round trip on 'x', which breaks as soon as any
# plate/run/type/environment/sample label itself contains the letter "x".
colnames(chao1)[sample_cols] <- paste0('row', seq_len(nrow(par)))

# Keep BOTH identifier columns as id.vars. Melting on 'sequence.depth' alone
# stacks the iteration column as if it were a sample, which previously had to
# be removed with a hard-coded row offset (31) that is only correct for a
# 30-row table. na.rm = TRUE drops the NA cells produced when a sample has too
# few sequences at a given depth.
long <- melt(chao1, id.vars = id_cols, variable.name = 'par_row',
             value.name = 'div', na.rm = TRUE)
par_row <- as.integer(sub('^row', '', as.character(long$par_row)))

# One row per diversity estimate plus its sample's metadata. Column names and
# order match the original `alpha`; `iteration` is appended as an extra column.
alpha <- data.frame(
  depth = long[['sequence.depth']],
  div = long$div,
  plate = factor(par$plate[par_row]),
  run = factor(par$sequence_run[par_row]),
  type = factor(par$type[par_row]),
  env = factor(par$environment[par_row]),
  sample = factor(par$sample[par_row]),
  iteration = long[[id_cols[2]]]
)

# Fit the permutation ANOVA. Passing `data = alpha` keeps the term names clean
# (env, plate, ...) instead of alpha$env, alpha$plate, ...
mod1 <- aovp(div ~ env + plate + run + type, data = alpha)
# With the full original table (not the excerpt shown here) the original call
# failed with: Error: cannot allocate vector of size 76.4 Gb
# NOTE(review): this rewrite has not been checked against the full table, so
# it is not known whether it avoids that error.
再次更新:上面的脚本使用文中给出的示例数据可以正常运行。但是,使用所有实际数据时会出现内存错误。所以我认为我将alpha多样性数据组织成这种长数据框的方式效率不高,而且我在处理迭代和NA方面可能也有错误。有人可以帮我吗?谢谢。