比较列表中的元素

时间:2015-07-12 16:07:26

标签: r dynamic time

我有一个包含两个向量的22个列表的列表。每个矢量在第二列表中具有相等的长度,但是相对于彼此具有不相等的长度。请参见最后输出的输出。

我想通过(i)找到最长的向量来构建data.frame,(ii)将2个向量列表中的所有其他第1个向量与最长向量中的第一个进行比较,并用NA值填充任何遗漏。我想在data.frame中将得到的第二个向量(包括它们的NA)组合在一起。

(数据集是相对保留时间和百分比面积。我想将所有色谱图相互比较)。

我可以使用以下代码回答我的问题的第一部分(i),

max(sapply(lapply(A, function(x) x[1,]),length))

告诉我列表中的最后一个数据集是最长的。

现在我想坚持使用动态时间扭曲(dtw())包并应用这里给出的答案(No 3)dtw link,但我发现它似乎只适用于相同的向量长度。

structure(list(chrom_a = structure(c(0.4663, 11.16, 0.4955, 0.82, 
0.8744, 4.92, 0.9101, 1.55, 1, 73.91, 1.2862, 5, 1.3198, 1.06, 
1.4302, 1.57), .Dim = c(2L, 8L), .Dimnames = list(c("rel.ret", 
"rel.area"), c("2", "3", "4", "5", "6", "7", "8", "9"))), chrom_b =structure(c(0.465, 
8.6, 0.4938, 0.64, 0.8714, 5.1, 0.907, 1.49, 1, 76.22, 1.2813, 
4.92, 1.3144, 1.03, 1.3704, 0.56, 1.4245, 1.45), .Dim = c(2L, 
9L), .Dimnames = list(c("rel.ret", "rel.area"), c("3", "4", "5", 
"6", "7", "8", "9", "10", "11"))), chrom_c = structure(c(0.4654, 
7.58, 0.4943, 0.58, 0.8724, 5.12, 0.9082, 1.51, 1, 77.05, 1.2835, 
4.97, 1.3168, 1.07, 1.3732, 0.57, 1.4272, 1.54), .Dim = c(2L, 
9L), .Dimnames = list(c("rel.ret", "rel.area"), c("2", "3", "4", 
"5", "6", "7", "8", "9", "10"))), chrom_d = structure(c(0.3841, 
0.28, 0.4638, 6.07, 0.4926, 0.48, 0.7903, 0.29, 0.8694, 4.79, 
0.9049, 1.38, 1, 67, 1.2783, 4.62, 1.3116, 0.95, 1.3681, 0.51, 
1.4215, 13.29, 3.5744, 0.35), .Dim = c(2L, 12L), .Dimnames = list(
    c("rel.ret", "rel.area"), c("3", "4", "5", "6", "7", "8", 
    "9", "10", "11", "12", "13", "14"))), chrom_e = structure(c(0.5304, 
4.68, 0.8727, 5.34, 0.9086, 1.57, 1, 79.92, 1.2838, 5.22, 1.3168, 
1.13, 1.3742, 0.58, 1.4279, 1.57), .Dim = c(2L, 8L), .Dimnames = list(
    c("rel.ret", "rel.area"), c("2", "3", "4", "5", "6", "7", 
    "8", "9"))), chrom_f = structure(c(0.3718, 0.36, 0.4628, 
3.38, 0.4915, 0.32, 0.789, 0.42, 0.8685, 6.15, 0.904, 1.72, 0.9399, 
0.34, 1, 74.63, 1.0651, 0.5, 1.1014, 0.35, 1.1266, 0.29, 1.1609, 
0.41, 1.2108, 0.32, 1.2774, 6.06, 1.3103, 1.36, 1.3667, 0.96, 
1.4201, 1.63, 3.5693, 0.48, 3.6497, 0.34), .Dim = c(2L, 19L), .Dimnames = list(
    c("rel.ret", "rel.area"), c("3", "4", "5", "6", "7", "8", 
    "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", 
    "19", "20", "21"))), chrom_g = structure(c(0.465, 1.44, 0.8708, 
5.4, 0.9064, 1.53, 1, 82.64, 1.2805, 5.3, 1.3134, 1.16, 1.37, 
0.55, 1.4239, 1.54, 3.5792, 0.44), .Dim = c(2L, 9L), .Dimnames = list(
    c("rel.ret", "rel.area"), c("3", "4", "5", "6", "7", "8", 
    "9", "10", "11"))), chrom_h = structure(c(0.4644, 7.08, 0.4932, 
0.55, 0.8708, 5.15, 0.9064, 1.49, 1, 77.57, 1.2805, 5.04, 1.3142, 
1.08, 1.3704, 0.53, 1.4245, 1.51), .Dim = c(2L, 9L), .Dimnames = list(
    c("rel.ret", "rel.area"), c("3", "4", "5", "6", "7", "8", 
    "9", "10", "11"))), chrom_i = structure(c(0.4641, 6.82, 0.493, 
0.54, 0.8714, 5.24, 0.9075, 1.46, 1, 78.08, 1.2826, 4.91, 1.3164, 
1.02, 1.3722, 0.51, 1.4266, 1.41), .Dim = c(2L, 9L), .Dimnames = list(
    c("rel.ret", "rel.area"), c("3", "4", "5", "6", "7", "8", 
    "9", "10", "11"))), chrom_j = structure(c(74.173, 31.69, 
0.3729, 0.25, 0.4637, 4.36, 0.4925, 0.35, 0.7904, 0.21, 0.8697, 
3.9, 0.9055, 1.06, 1, 52.76, 1.279, 3.56, 1.3119, 0.69, 1.369, 
0.3, 1.4224, 0.87), .Dim = c(2L, 12L), .Dimnames = list(c("rel.ret", 
"rel.area"), c("2", "3", "4", "5", "6", "7", "8", "9", "10", 
"11", "12", "13"))), chrom_k = structure(c(0.4661, 5.91, 0.8743, 
5.23, 0.9105, 1.53, 1, 79.89, 1.2863, 5.11, 1.3822, 0.99, 1.0169, 
1.33), .Dim = c(2L, 7L), .Dimnames = list(c("rel.ret", "rel.area"
), c("2", "3", "4", "5", "6", "7", "8"))), chrom_l = structure(logical(0), .Dim = c(2L, 
0L), .Dimnames = list(c("rel.ret", "rel.area"), NULL)), chrom_m = structure(logical(0), .Dim = c(2L, 
0L), .Dimnames = list(c("rel.ret", "rel.area"), NULL)), chrom_n = structure(c(0.3793, 
0.37, 0.4604, 6.12, 0.4896, 0.49, 0.8709, 8.41, 0.9072, 1.46, 
1, 76.51, 1.2844, 4.75, 1.3188, 0.83, 1.429, 1.06), .Dim = c(2L, 
9L), .Dimnames = list(c("rel.ret", "rel.area"), c("3", "4", "5", 
"6", "7", "8", "9", "10", "11"))), chrom_o = structure(c(0.1252, 
43.23, 0.4625, 2.06, 0.8728, 3, 0.9094, 0.88, 1, 46.69, 1.2865, 
2.91, 1.3209, 0.54, 1.4306, 0.7), .Dim = c(2L, 8L), .Dimnames = list(
    c("rel.ret", "rel.area"), c("2", "3", "4", "5", "6", "7", 
    "8", "9"))), chrom_p = structure(c(0.1248, 31.52, 0.3796, 
0.23, 0.4608, 1.18, 0.7891, 0.25, 0.8687, 3.88, 0.905, 1.04, 
1, 55.93, 1.2808, 3.72, 1.3149, 0.8, 1.3709, 0.38, 1.4244, 1.06
), .Dim = c(2L, 11L), .Dimnames = list(c("rel.ret", "rel.area"
), c("2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12"
))), chrom_q = structure(c(0.4618, 0.74, 0.8702, 5.49, 0.9062, 
1.44, 1, 83.93, 1.2837, 5.21, 1.3181, 1.13, 1.3738, 0.53, 1.428, 
1.53), .Dim = c(2L, 8L), .Dimnames = list(c("rel.ret", "rel.area"
), c("3", "4", "5", "6", "7", "8", "9", "10"))), chrom_r = structure(c(0.8715, 
5.39, 0.9072, 1.4, 1, 84.41, 1.2821, 5.18, 1.3151, 1.14, 1.3722, 
0.63, 1.4258, 1.46, 3.5918, 0.39), .Dim = c(2L, 8L), .Dimnames = list(
    c("rel.ret", "rel.area"), c("3", "4", "5", "6", "7", "8", 
    "9", "10"))), chrom_s = structure(c(0.3722, 0.38, 0.4636, 
6.12, 0.492, 0.5, 0.7898, 0.42, 0.8687, 5.94, 0.9047, 1.69, 0.9403, 
0.33, 1, 73.12, 1.0642, 0.44, 1.1033, 0.32, 1.1276, 0.25, 1.162, 
0.33, 1.2787, 5.87, 1.3119, 1.32, 1.3685, 0.76, 1.422, 1.5, 3.5817, 
0.42, 3.662, 0.3), .Dim = c(2L, 18L), .Dimnames = list(c("rel.ret", 
"rel.area"), c("3", "4", "5", "6", "7", "8", "9", "10", "11", 
"12", "13", "14", "15", "16", "17", "18", "19", "20"))), chrom_t = structure(c(0.3738, 
0.37, 0.4651, 1.51, 0.7917, 0.34, 0.8711, 5.49, 0.9071, 1.51, 
1, 81.91, 1.2834, 5.33, 1.3166, 1.13, 1.3736, 0.57, 1.4283, 1.41, 
3.6102, 0.43), .Dim = c(2L, 11L), .Dimnames = list(c("rel.ret", 
"rel.area"), c("3", "4", "5", "6", "7", "8", "9", "10", "11", 
"12", "13"))), chrom_u = structure(c(0.3844, 0.38, 0.4647, 2.2, 
0.7909, 0.37, 0.8699, 5.99, 0.906, 1.63, 1, 79.36, 1.2817, 5.86, 
1.3149, 1.23, 1.372, 0.67, 1.4257, 1.56, 3.6037, 0.45, 3.684, 
0.3), .Dim = c(2L, 12L), .Dimnames = list(c("rel.ret", "rel.area"
), c("3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", 
"14"))), chrom_v = structure(c(0.3836, 0.45, 0.4644, 0.62, 0.7894, 
0.58, 0.8685, 7.17, 0.9043, 1.85, 0.941, 0.43, 1, 73.3, 1.0663, 
0.31, 1.0849, 0.23, 1.1035, 0.4, 1.1284, 0.31, 1.1625, 0.5, 1.2133, 
0.36, 1.2381, 0.32, 1.2795, 7.2, 1.3126, 1.64, 1.37, 1.13, 1.423, 
2.05, 3.5963, 0.71, 3.6776, 0.45), .Dim = c(2L, 20L), .Dimnames = list(
    c("rel.ret", "rel.area"), c("3", "4", "5", "6", "7", "8", 
    "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", 
    "19", "20", "21", "22")))), .Names = c("chrom_a", "chrom_b", 
"chrom_c", "chrom_d", "chrom_e", "chrom_f", "chrom_g", "chrom_h", 
"chrom_i", "chrom_j", "chrom_k", "chrom_l", "chrom_m", "chrom_n", 
"chrom_o", "chrom_p", "chrom_q", "chrom_r", "chrom_s", "chrom_t", 
"chrom_u", "chrom_v"))

3 个答案:

答案 0 :(得分:2)

mx <- tail(colnames(A[[which.max(lengths(A))]]),1)
newA <- lapply(A, function(x) apply(x, 1, function(x) {length(x) <- mx;x}))

这会将行的长度扩展到最大行长度。

答案 1 :(得分:2)

这是一个建议,我认为提供了所需的输出(如果我正确理解任务)。

longest_vec <-  A[[which.max(lengths(A))]][1,]
m <- matrix(NA, ncol = as.integer(names(longest_vec[length(longest_vec)])), nrow = length(A))
for (i in 1:nrow(m)) m[i, as.integer(colnames(A[[i]]))] <- unname(A[[i]][2,])
df <- as.data.frame(m)
df <- df[,-1]
colnames(df) <- c(2, names(longest_vec))
#> head(df)
#      2    3    4     5     6     7    8     9    10   11   12    13   14   15   16   17   18   19   20   21 22
#1 11.16 0.82 4.92  1.55 73.91  5.00 1.06  1.57    NA   NA   NA    NA   NA   NA   NA   NA   NA   NA   NA   NA NA
#2    NA 8.60 0.64  5.10  1.49 76.22 4.92  1.03  0.56 1.45   NA    NA   NA   NA   NA   NA   NA   NA   NA   NA NA
#3  7.58 0.58 5.12  1.51 77.05  4.97 1.07  0.57  1.54   NA   NA    NA   NA   NA   NA   NA   NA   NA   NA   NA NA
#4    NA 0.28 6.07  0.48  0.29  4.79 1.38 67.00  4.62 0.95 0.51 13.29 0.35   NA   NA   NA   NA   NA   NA   NA NA
#5  4.68 5.34 1.57 79.92  5.22  1.13 0.58  1.57    NA   NA   NA    NA   NA   NA   NA   NA   NA   NA   NA   NA NA
#6    NA 0.36 3.38  0.32  0.42  6.15 1.72  0.34 74.63 0.50 0.35  0.29 0.41 0.32 6.06 1.36 0.96 1.63 0.48 0.34 NA

希望这有帮助。

PS:我从@Pierre Lafortune的优秀评论中偷走了代码的第一行。

答案 2 :(得分:2)

我以与你最初追求的方式略有不同的方式解决了这个问题,但目标却相同。我的假设是您有一个色谱图列表,其中包含具有保留时间和面积的峰,并且您希望最终能够比较不同色谱图中的峰。

首先转换为长data.table

chroms_long <- rbindlist(lapply(chroms, function(x) {
  as.data.table(t(x))
  }))
chroms_long[, chrom := rep(names(chroms), lengths(chroms) / 2)]

然后使用分层聚类分组。此调用中的截止h需要针对基础数据进行更改,但对于此数据集,0.015似乎是正确的。

setkey(chroms_long, rel.ret, chrom)
chroms_long[, group := cutree(hclust(dist(rel.ret)), h = 0.015)]

如果您现在想要为每个可能的色谱图/组添加一行:

setkey(chroms_long, chrom, group)
chroms_long <- chroms_long[CJ(unique(chrom), unique(group))]

如果您还想知道每组的平均保留时间:

chroms_long[, mean_rel.ret := mean(rel.ret, na.rm = TRUE), by = group]

如果您想要宽屏输出:

dcast.data.table(chroms_long, chrom ~ group, value.var = "rel.area")

如果不是组号,您希望将平均保留作为列标题:

dcast.data.table(chroms_long[, .(chrom, col = sprintf("%0.2f", mean_rel.ret), rel.area)],
  chrom ~ col, value.var = "rel.area")

如果您想恢复原始格式:

sapply(unique(chroms_long$chrom),
  function(x) t(data.matrix(chroms_long[x, .(rel.ret, rel.area)])),
  simplify = FALSE)

请注意,这会将sapply用于USE.NAMES功能,而不是简化列表中的输出。