我正在重新问一个问题,我试图简化我的数据集并举例说明我想要的输出。如果这仍然很复杂,请随意发表评论,以帮助我澄清这一点。
我有一张表,其中我把 rt 和 mz 相近的特征(feature)分到了同一组。
orig_feat mz_mid rt_mid similar_feature
1 f_1 685.4350 466.5 f_1
2 f_2 260.1655 245.0 f_2
185 f_2 260.1665 256.5 f_185
408 f_2 260.1670 239.0 f_408
2334 f_2 260.1650 250.0 f_2334
3 f_3 288.1980 276.0 f_3
7 f_3 288.1990 289.0 f_7
414 f_3 288.1970 275.0 f_414
2181 f_3 288.1980 270.0 f_2181
2969 f_3 288.1965 297.5 f_2969
4 f_4 537.3915 454.5 f_4
2271 f_4 537.3965 435.5 f_2271
5 f_5 439.2990 153.5 f_5
6 f_6 325.0690 210.5 f_6
10 f_6 325.0685 227.0 f_10
747 f_6 325.0685 184.5 f_747
2068 f_6 325.0695 225.0 f_2068
2929 f_6 325.0685 218.0 f_2929
2970 f_6 325.0680 237.0 f_2970
31 f_7 288.1980 276.0 f_3
71 f_7 288.1990 289.0 f_7
4141 f_7 288.1970 275.0 f_414
21811 f_7 288.1980 270.0 f_2181
29691 f_7 288.1965 297.5 f_2969
我想生成一个列表,其中每个组对应一个条目。具有相同 orig_feat 的所有行应归为一“组”,并且对于每个这样的“组”,我想得到一个包含其所有相似特征(similar_feature)的向量。所需输出的示例如下。
$grf_1
[1] "f_1"
$grf_2
[1] "f_2" "f_185" "f_408" "f_2334"
$grf_3
[1] "f_3" "f_7" "f_414" "f_2181" "f_2969"
$grf_4
[1] "f_4" "f_2271"
$grf_5
[1] "f_5"
$grf_6
[1] "f_6" "f_10" "f_747" "f_2068" "f_2929" "f_2970"
但重要的是,我希望结果是非冗余的(例如 grf_3 已包含 f_7、f_414、f_2181、f_2969,因此当处理到 f_7 时,不应再为 f_7 创建一个组,因为 f_7 组中的所有特征都已包含在 f_3 的组中)。
下面是我的代码。目前,生成的输出在grf_3之后停止。 我不确定为什么它似乎过早退出循环。
#' Build a non-redundant list of feature groups.
#'
#' Rows sharing the same `orig_feat` form one group; the group's members are
#' the values of the similar-feature column for those rows. Groups are visited
#' in order of first appearance of `orig_feat`. An `orig_feat` that has already
#' appeared as a member of an earlier group does not get a group of its own.
#'
#' @param simFeatsTab A data frame with an `orig_feat` column and a member
#'   column named `similar_feature` (or `feature`, which the original code
#'   referenced).
#' @return A named list (`grf_1`, `grf_2`, ...) of character vectors, one per
#'   retained group. An empty list when the table has no rows.
mkFeatGroupsList <- function(simFeatsTab) {
  # The posted data names the member column `similar_feature`, while the
  # original code read `feature`; accept either so existing callers keep
  # working.
  memberCol <- intersect(c("similar_feature", "feature"), names(simFeatsTab))
  if (length(memberCol) == 0L) {
    stop("simFeatsTab needs a 'similar_feature' (or 'feature') column",
         call. = FALSE)
  }

  # Compare as character so factor columns behave the same as strings.
  origFeats <- as.character(simFeatsTab$orig_feat)
  members <- as.character(simFeatsTab[[memberCol[1]]])

  features_seen <- character(0)
  GroupingList <- list()
  counter <- 1L

  # Iterate over the distinct orig_feat VALUES. The original looped over
  # 1:length(unique(...)) but indexed rows with it, so only the first few
  # rows of the table were ever examined.
  for (orig_feat2Grp in unique(origFeats[!is.na(origFeats)])) {
    if (orig_feat2Grp %in% features_seen) next

    # which() drops NA comparisons, mirroring subset()'s handling of NA.
    matchingFeats <- members[which(origFeats == orig_feat2Grp)]

    GroupingList[[paste0("grf_", counter)]] <- matchingFeats
    features_seen <- c(features_seen, matchingFeats)
    counter <- counter + 1L
  }

  GroupingList
}
如果你需要测试数据。
> dput(simFeatsTab.10.30.test)
structure(list(orig_feat = structure(c(1L, 2L, 2L, 2L, 2L, 3L,
3L, 3L, 3L, 3L, 4L, 4L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L,
7L, 7L), .Label = c("f_1", "f_2", "f_3", "f_4", "f_5", "f_6",
"f_7"), class = "factor"), mz_mid = c(685.435, 260.1655, 260.1665,
260.167, 260.165, 288.198, 288.199, 288.197, 288.198, 288.1965,
537.3915, 537.3965, 439.299, 325.069, 325.0685, 325.0685, 325.0695,
325.0685, 325.068, 288.198, 288.199, 288.197, 288.198, 288.1965
), rt_mid = c(466.5, 245, 256.5, 239, 250, 276, 289, 275, 270,
297.5, 454.5, 435.5, 153.5, 210.5, 227, 184.5, 225, 218, 237,
276, 289, 275, 270, 297.5), similar_feature = c("f_1", "f_2",
"f_185", "f_408", "f_2334", "f_3", "f_7", "f_414", "f_2181",
"f_2969", "f_4", "f_2271", "f_5", "f_6", "f_10", "f_747", "f_2068",
"f_2929", "f_2970", "f_3", "f_7", "f_414", "f_2181", "f_2969"
)), .Names = c("orig_feat", "mz_mid", "rt_mid", "similar_feature"
), class = "data.frame", row.names = c("1", "2", "185", "408",
"2334", "3", "7", "414", "2181", "2969", "4", "2271", "5", "6",
"10", "747", "2068", "2929", "2970", "31", "71", "4141", "21811",
"29691"))
答案 0 :(得分:3)
另一种解决方案是使用 igraph 包。
(注:原答案此处的 igraph 示例代码在提取过程中丢失;下面这段 CQL 建表语句与本问题无关,应为提取时混入的内容。)
CREATE TABLE ts.mindesc (
ticker text,
time timestamp,
close float,
high float,
low float,
numevents int,
open float,
source text,
value float,
PRIMARY KEY (ticker, time)
) WITH CLUSTERING ORDER BY (time DESC)
答案 1 :(得分:1)
我继续这样做:
1. 将您的数据框(我称之为 feat)按 orig_feat 拆分;
2. 使用 sapply(或更稳妥的 lapply)获取每组的相似特征;
3. 循环遍历各组的相似特征并消除重复。
对应的代码如下:
# Split the data frame (called `feat` in this answer) into one piece per
# orig_feat value. The original split on `my.df$orig_feat`, a name that is
# never defined here.
feat.split <- split(feat, feat$orig_feat)

# Collect the similar features of every group. lapply() always returns a
# list; sapply() would return a matrix when all groups happen to have the
# same size, which breaks the [[i]] indexing below.
sim.feat <- lapply(feat.split, function(x) x$similar_feature)

# Walk the groups from the second one on and drop every feature that already
# occurs in an earlier group. seq_along(...)[-1] is empty when there is only
# one group, so the loop is skipped instead of running over 2:1.
for (i in seq_along(sim.feat)[-1]) {
  # Get all of the previous features
  prev.feat <- unlist(sim.feat[seq_len(i - 1)], use.names = FALSE)
  # Remove features already used; a group whose features were all seen
  # before is left as a zero-length vector rather than being removed.
  sim.feat[[i]] <- sim.feat[[i]][!sim.feat[[i]] %in% prev.feat]
}