大量数据集的分布比较

时间:2014-04-14 13:31:28

标签: r distribution

我的数据名为“data”,如下所示:

CENTRE_Blinded  val_list
1   1104    c(-13, -1, 0, 28, -88, 28, -1, -6, -5, -58, 28, 28, 28, 28, 2, 0, 28, -26, 28, 28, 2, 28,    28, -2, -29, 0, 28, -34, -6, 0, 28, 1, 0, 0, -1, 28, 28, 0, 28, 6, 28, 0, 28, 28, 28, 0, -2, -6, -1, 4, 6, 1, -16, -7, 2, 3, 7, 0, 1, 11, 0, 1, -6, -5, 0, 3, 8, 7, 0, 0, 6, -6, 2, 36, -8, 0, -7, -7, -1, -1, -1, 7, -3, 7, 2)
2   1204    c(2, -9, 28, 28, -2, 1, -3, -1, 0, 28, 28, 28, 28, 28, 28, 3, 10, -5, -8, 9, -8, 0, 13, 0, -1, 2, -1, 0, 6, 1, 0, -7, 6, -6, 1)
3   1403    c(0, 2, 0, 2, 28, 0, -1, -35, -36, 2, 1, 1, 28, 28, 28, 0, 0, 28, -7, -35, 28, -3, -18, 28, 28, 28, -5, 0, 28, -2, 4, 5, 0, 56, 1, 0, 1, -7, -20, 0, 0, -3, 0, 1, 3, 0, 4, -2, 42, -13, 7, 10, 7, 56, 0, -5, 10, 56, 8, 56, 84, -4, 1, 0, -14, -7, -1, -48, -6, -3, 0, 7)
4   1110    c(0, 1, 0, -3, 28, 28, 0, -5, 0, 9, 15, 56, -11, -1, -7)

第一列包含中心的ID,第二列包含值列表。我想根据每个中心的这些值建立经验分布,并使用例如 Kolmogorov-Smirnov 检验(R 中的 ks.test)对它们进行两两比较。这样我就可以得到一个 N×N 的 p 值矩阵。我的问题是如何做到这一点,并在每次 K-S 检验中保留对应中心的ID。

我的尝试是:

# Turn the table into a plain list of its columns. The original line had an
# unbalanced closing parenthesis and passed `by =`, which as.list() does not
# use for grouping, so both are removed here.
val_list_temp <- as.list(data)
# One vector of values per centre, taken from the `val_list` column ...
val_list <- val_list_temp[["val_list"]]
# ... labelled with the matching centre ID so the IDs travel with the values.
names(val_list) <- val_list_temp[["CENTRE_Blinded"]]

这里我有每个中心的ID,但是当我使用expand.grid时,我不再知道如何存储它们了:

# Build every ordered pair of centre IDs (not of the value vectors), so that
# each row of val_table records which two centres are being compared.
val_table <- as.data.table(expand.grid(
  centre_1 = names(val_list),
  centre_2 = names(val_list),
  stringsAsFactors = FALSE
))
# Run one two-sample KS test per ID pair, looking the values up by centre ID.
# Map() keeps the result a list of test objects, one per row of val_table.
ks_tests <- Map(
  function(id_1, id_2) ks.test(val_list[[id_1]], val_list[[id_2]]),
  val_table$centre_1,
  val_table$centre_2
)
# Label each test with the pair of centre IDs it belongs to.
names(ks_tests) <- paste(val_table$centre_1, val_table$centre_2, sep = "_vs_")

此外,我以后如何将它放在矩阵中,做一些可视化?

P.S. 也许有比使用 Kolmogorov-Smirnov 检验并构建 p 值矩阵更好的方法?

1 个答案:

答案 0 :(得分:0)

这是您正在寻找的解决方案类型吗?它循环6次(4选2)然后使矩阵对称。

# Data you provided: one numeric vector of observed values per centre.
A1104 <- c(
  -13, -1, 0, 28, -88, 28, -1, -6, -5, -58, 28, 28, 28, 28, 2, 0, 28, -26, 28, 28, 2,
  28, 28, -2, -29, 0, 28, -34, -6, 0, 28, 1, 0, 0, -1, 28, 28, 0, 28, 6, 28, 0, 28,
  28, 28, 0, -2, -6, -1, 4, 6, 1, -16, -7, 2, 3, 7, 0, 1, 11, 0, 1, -6, -5, 0, 3, 8, 7,
  0, 0, 6, -6, 2, 36, -8, 0, -7, -7, -1, -1, -1, 7, -3, 7, 2
)
A1204 <- c(
  2, -9, 28, 28, -2, 1, -3, -1, 0, 28, 28, 28, 28, 28, 28, 3, 10, -5, -8, 9, -8, 0, 13,
  0, -1, 2, -1, 0, 6, 1, 0, -7, 6, -6, 1
)
A1403 <- c(
  0, 2, 0, 2, 28, 0, -1, -35, -36, 2, 1, 1, 28, 28, 28, 0, 0, 28, -7, -35, 28, -3, -18,
  28, 28, 28, -5, 0, 28, -2, 4, 5, 0, 56, 1, 0, 1, -7, -20, 0, 0, -3, 0, 1, 3, 0, 4, -2,
  42, -13, 7, 10, 7, 56, 0, -5, 10, 56, 8, 56, 84, -4, 1, 0, -14, -7, -1, -48, -6, -3, 0, 7
)
A1110 <- c(0, 1, 0, -3, 28, 28, 0, -5, 0, 9, 15, 56, -11, -1, -7)

# Named list: the names are the centre IDs and label the result matrix below.
data <- list(A1104 = A1104, A1204 = A1204, A1403 = A1403, A1110 = A1110)

# Square matrix of pairwise p-values, labelled by centre ID on both axes.
n_centres <- length(data)
Result.Matrix <- matrix(
  NA_real_,
  nrow = n_centres,
  ncol = n_centres,
  dimnames = list(names(data), names(data))
)

# A sample compared with itself has KS statistic 0, i.e. p-value 1. Leaving
# the diagonal at 0 would wrongly read as "significantly different".
diag(Result.Matrix) <- 1

# Each unordered pair of centres is tested once (choose(n, 2) tests) and the
# p-value is written to both [i, j] and [j, i], keeping the matrix symmetric.
# combn() needs at least two centres, hence the guard.
if (n_centres > 1) {
  VarComb <- combn(seq_len(n_centres), 2)
  for (k in seq_len(ncol(VarComb))) {
    i <- VarComb[1, k]
    j <- VarComb[2, k]
    p_value <- ks.test(data[[i]], data[[j]])$p.value
    Result.Matrix[i, j] <- p_value
    Result.Matrix[j, i] <- p_value
  }
}

Result.Matrix