Question

我需要对来自计划的一些生态样本的α多样性（样本内多样性）计算的输出表进行一些分析。

由于原始数据表太大，我只在这里显示部分数据，分别对应于下面脚本中提到的chao1和par数据帧：

list(sequence.depth = c(10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 
217, 217, 217, 217, 217, 217, 217, 217, 217, 217, 424, 424, 424, 
424, 424, 424, 424, 424, 424, 424), iteration = c(0, 1, 2, 3, 
4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 
5, 6, 7, 8, 9), sample1 = c(13, 55, 55, 23, 13, 55, 23, 29, 55, 
23, 623.5, 440, 545.7142857, 353, 515.4615385, 767.8, 602.2, 
354.3333333, 349.25, 508.7692308, 782.7142857, 648.3571429, 555.0344828, 
662.4166667, 651.037037, 491.0285714, 621.2857143, 600.7, 536.5714286, 
683.5517241), sample2 = c(23, 23, 55, 55, 55, 16, 23, 55, 13, 
55, 603, 408.5, 422.0769231, 328.4, 304.5, 379.2142857, 375, 
469, 483.1111111, 384.75, 807.8333333, 793, 545.75, 542.8, 599.1153846, 
555.1363636, 688, 655.5652174, 491.2758621, 673.1363636), sample3 = c(55, 
55, 55, 55, 55, 55, 55, 55, 55, 55, 942.6470588, 763.1818182, 
757.7142857, 817.5652174, 951.1176471, 622.3846154, 1065.75, 
1436.1, 771.4545455, 1240.428571, 1091.622222, 1305.571429, 1029.6, 
1349, 1282.357143, 1120.139535, 1362.153846, 1324.447368, 1151.923077, 
990.3958333), sample4 = c(55, 55, 55, 23, 55, 55, 55, 23, 55, 
55, 1134.4, 1245.2, 1233.4, 1112.066667, 680.2916667, 910.25, 
1216.071429, 1168.5, 853.4736842, 1180.5625, 1142.130435, 1438.157895, 
1074.222222, 1223.976744, 1062.163265, 1346.25, 1245.434783, 
1330.073171, 1134.022222, 1395.833333), sample5 = c(55, 55, 55, 
55, 55, 55, 29, 55, 55, 55, 711.55, 713.8947368, 656.3157895, 
695.8947368, 679.1052632, 880.0588235, 812.5555556, 959.5714286, 
551.4347826, 619.6363636, 895.5227273, 876.9772727, 884.2790698, 
895.8139535, 840.7659574, 882.1363636, 922.0238095, 894.5227273, 
859, 895.8139535), sample6 = c(55, 55, 55, 55, 55, 55, 55, 55, 
55, 55, 508.6071429, 852.0588235, 867, 773.0526316, 786.5, 1064.2, 
699.6363636, 892.1764706, 702.25, 948.8333333, 1139.4, 892.5294118, 
1331.684211, 1068, 1297.676471, 1024.27907, 1096.142857, 1148.227273, 
1031.046512, 935.5957447), sample7 = c(23, 13, 9, 29, 55, 8, 
29, 22, 8.5, 7, 241.6, 576.5, 140.3333333, 412.5, 370, 192.5454545, 
123.1111111, 263, 164.3333333, 223.5, 326.5, 442.3333333, 297.6875, 
448.2727273, 345.5, 640.3333333, 439.1, 359.6, 356.0714286, 292.8333333
), sample8 = c(22, 23, 23, 29, 23, 23, 55, 23, 29, 23, 582.0909091, 
333.5263158, 657, 337.8666667, 372.0714286, 470.5, 594.3, 553.0909091, 
368.55, 544.8, 689.3571429, 626.24, 941.5, 649.9642857, 651.5, 
632.4347826, 653.7777778, 767.7727273, 674.3913043, 713.7727273
), sample9 = c(55, 23, 23, 55, 23, 23, 55, 55, 55, 55, 368, 445.1666667, 
669.0909091, 339.6521739, 335.3, 635.4615385, 382.1578947, 538.6666667, 
425.3684211, 459.3125, 583.3636364, 624.1935484, 590.0909091, 
600, 731.2222222, 677.2758621, 619.1818182, 592, 747.7307692, 
619.5), sample10 = c(55, 23, 55, 55, 55, 23, 55, 13, 23, 55, 
1184.153846, 820.5, 971.0769231, 793.6470588, 1051.4, 811.5, 
540.64, 705.5555556, 880.0588235, 906.0666667, 1263.6, 956.2380952, 
1036.108108, 1013.029412, 1228.794118, 1005.263158, 937.4186047, 
1104.333333, 898.1162791, 1071.135135), sample11 = c(23, 23, 
55, 55, 23, 55, 55, 55, 55, 55, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA))

structure(list(plate = structure(c(2L, 2L, 5L, 5L, 5L, 5L, 4L, 
2L, 3L, 1L, 1L), .Label = c("Plate1", "Plate2", "Plate3", "Plate4", 
"Plate5"), class = "factor"), sequence_run = structure(c(8L, 
10L, 6L, 5L, 4L, 3L, 2L, 9L, 11L, 1L, 7L), .Label = c("Run1", 
"Run10", "Run11", "Run12", "Run13", "Run14", "Run2", "Run4", 
"Run5", "Run6", "Run7"), class = "factor"), type = structure(c(2L, 
2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("flower", 
"leaf"), class = "factor"), environment = structure(c(1L, 1L, 
2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L), .Label = c("field1", "field2", 
"field3"), class = "factor"), sample = structure(c(1L, 4L, 5L, 
6L, 7L, 8L, 9L, 10L, 11L, 2L, 3L), .Label = c("sample1", "sample10", 
"sample11", "sample2", "sample3", "sample4", "sample5", "sample6", 
"sample7", "sample8", "sample9"), class = "factor")), .Names = c("plate", 
"sequence_run", "type", "environment", "sample"), class = "data.frame", row.names = c(NA, 
-11L))

我想要做的分析是将alpha多样性的变化划分为变量plate，sequence_run，sample_type和environment，我计划在R中使用基于排列的多元回归。

我原来的问题：我无法想象我应该如何为R准备这张桌子以便我可以进行分析。所以我的问题是，我应该如何组织这个输出表以便在R中进行分析？有没有办法在'aovp'函数中处理NA？

请注意，在第N个序列深度处从0到9的迭代是由于来自该样本的N个序列的随机采样，以由程序计算10次α分集。如果该样本中没有足够的序列，则程序生成“NA”。

感谢您的帮助！

更新：谢谢，格雷。当我第一次提出这个问题时，我没有任何线索。我已经尝试了这里显示的脚本将其组织成一个非常长的数据框，每行代表每个迭代和序列深度的每个样本的alpha多样性度量，以及其他四个参数（sequence_run，plate，type和environment）。但我不能做它，因为它太大了。所以我怀疑这是否是为我提议的分析组织数据的正确方法：

library(lmPerm)
library(reshape2)
library(plyr)
chao1=read.table(file.choose(),head=T,sep='\t')
par=read.table(file.choose(),head=T)

#make a new column in par so that par$all contains 
#all the variable names and sample names
par$all=paste0(par$plate,'x',par$sequence_run,'x',par$type,'x',par$environment,'x',par$sample)
name1=as.vector(par$all)
colnames(chao1)[3:dim(chao1)[2]]=name1

#use melt to make a stacked table
t3=melt(chao1,'sequence.depth' )
t4=t3[31:dim(t3)[1],]
colnames(t4)=c('sequence.depth','all','div')

#use strsplit to get variables for each sample from t4$all
#and make a new data frame named alpha to store the diversity 
#and variables/sample names
list=strsplit(as.character(t4$all),'x')
df=ldply(list)
colnames(df)=c('plate','run','type','env','sample')
alpha=data.frame(t4[,1],t4[,3],df)
colnames(alpha)[1:2]=c('depth','div')


#do aovp on the data frame alpha
mod1=aovp(alpha$div~alpha$env+alpha$plate+alpha$run+alpha$type)
#Error: cannot allocate vector of size 76.4 Gb
#this error was for the original table, not the part of data shown here

再次更新：上面的脚本使用包含的示例数据运行良好。但是，使用所有实际数据时出现内存错误。所以我相信我将alpha多样性数据组织到这个长数据帧的方式效率不高，而且我在处理迭代和NA方面可能是错误的。有人可以帮我吗？谢谢。

更新了R脚本_如何准备此表以便在R中进行分析？

0 个答案: