可能有一个简单的解决方案,但我无法想到它。
我有一个包含两列的表:NodeID 和 ClusterID。每个集群都由若干节点组成,同一个节点可以出现在多个集群中。我想做的是遍历 ClusterID 列表,把通过共享节点相互关联的集群合并成更大的集群(即对集群再做一次聚类)。
我经历的过程是:
然后我选择下一个起始ID并重复该过程。
这种方法可行,但集群有数千个,运行一次可能需要几个小时。因此,我想把代码改成并行运行,但我想不出怎样才能做到这一点而不引入问题。
假设创建一个静态的主列表,就可以把 clusterID 列表拆分成多个部分,让每个 worker 用同样的流程处理其中一部分,只是找到的 clusterID 从该 worker 自己的工作列表中删除,而不是从主列表中删除。这样可以运行,但当相互关联的 clusterID 落在不同 worker 的列表中时,就会产生重复的结果。
如果我让它们全部运行,那么我将不得不返回并通过查看NodeID是否出现在多个列表中来检查重复项。这是可行的,但似乎效率低下。
有没有更有效的方法来完成这项工作?
# Sample fixture: the master list of cluster IDs to process.
# A data frame with a single character column, ClusterID, and 100 rows.
# NOTE(review): the name suggests each ID appears exactly once; the values
# look distinct but this is not enforced here.
master_UniqueClusterIDs <- structure(list(ClusterID = c("AlterRefRecord_100_100_16_150",
"AlterRefRecord_100_101_16_151", "AlterRefRecord_100_102_16_152",
"AlterRefRecord_100_103_16_153", "AlterRefRecord_100_105_16_155",
"AlterRefRecord_100_106_16_156", "AlterRefRecord_100_107_16_157",
"AlterRefRecord_100_108_16_158", "AlterRefRecord_100_10_16_58",
"AlterRefRecord_100_111_16_161", "AlterRefRecord_100_115_16_165",
"AlterRefRecord_100_119_16_169", "AlterRefRecord_100_120_16_170",
"AlterRefRecord_100_121_16_171", "AlterRefRecord_100_122_16_172",
"AlterRefRecord_100_131_16_181", "AlterRefRecord_100_133_16_183",
"AlterRefRecord_100_136_16_186", "AlterRefRecord_100_139_16_189",
"AlterRefRecord_100_13_16_61", "AlterRefRecord_100_14_16_62",
"AlterRefRecord_100_15_16_63", "AlterRefRecord_100_17_16_65",
"AlterRefRecord_100_1_16_48", "AlterRefRecord_100_20_16_70",
"AlterRefRecord_100_23_16_72", "AlterRefRecord_100_27_16_76",
"AlterRefRecord_100_28_16_77", "AlterRefRecord_100_29_16_78",
"AlterRefRecord_100_2_16_49", "AlterRefRecord_100_30_16_79",
"AlterRefRecord_100_31_16_80", "AlterRefRecord_100_32_16_81",
"AlterRefRecord_100_33_16_82", "AlterRefRecord_100_35_16_84",
"AlterRefRecord_100_38_16_87", "AlterRefRecord_100_39_16_88",
"AlterRefRecord_100_41_16_90", "AlterRefRecord_100_43_16_92",
"AlterRefRecord_100_44_16_93", "AlterRefRecord_100_47_16_96",
"AlterRefRecord_100_48_16_97", "AlterRefRecord_100_49_16_98",
"AlterRefRecord_100_4_16_52", "AlterRefRecord_100_54_16_103",
"AlterRefRecord_100_56_16_105", "AlterRefRecord_100_58_16_107",
"AlterRefRecord_100_59_16_108", "AlterRefRecord_100_62_16_111",
"AlterRefRecord_100_63_16_112", "AlterRefRecord_100_64_16_113",
"AlterRefRecord_100_65_16_114", "AlterRefRecord_100_6_16_54",
"AlterRefRecord_100_71_16_121", "AlterRefRecord_100_72_16_122",
"AlterRefRecord_100_74_16_124", "AlterRefRecord_100_78_16_128",
"AlterRefRecord_100_84_16_134", "AlterRefRecord_100_85_16_135",
"AlterRefRecord_100_86_16_136", "AlterRefRecord_100_87_16_137",
"AlterRefRecord_100_88_16_138", "AlterRefRecord_100_89_16_139",
"AlterRefRecord_100_8_16_56", "AlterRefRecord_100_90_16_140",
"AlterRefRecord_100_91_16_141", "AlterRefRecord_100_92_16_142",
"AlterRefRecord_100_93_16_143", "AlterRefRecord_100_94_16_144",
"AlterRefRecord_100_95_16_145", "AlterRefRecord_100_97_16_147",
"AlterRefRecord_100_99_16_149", "AlterRefRecord_101_101_16_151",
"AlterRefRecord_101_102_16_152", "AlterRefRecord_101_103_16_153",
"AlterRefRecord_101_105_16_155", "AlterRefRecord_101_106_16_156",
"AlterRefRecord_101_108_16_158", "AlterRefRecord_101_10_16_58",
"AlterRefRecord_101_115_16_165", "AlterRefRecord_101_119_16_169",
"AlterRefRecord_101_120_16_170", "AlterRefRecord_101_121_16_171",
"AlterRefRecord_101_122_16_172", "AlterRefRecord_101_131_16_181",
"AlterRefRecord_101_136_16_186", "AlterRefRecord_101_139_16_189",
"AlterRefRecord_101_13_16_61", "AlterRefRecord_101_15_16_63",
"AlterRefRecord_101_17_16_65", "AlterRefRecord_101_1_16_48",
"AlterRefRecord_101_20_16_70", "AlterRefRecord_101_23_16_72",
"AlterRefRecord_101_27_16_76", "AlterRefRecord_101_28_16_77",
"AlterRefRecord_101_30_16_79", "AlterRefRecord_101_31_16_80",
"AlterRefRecord_101_32_16_81", "AlterRefRecord_101_33_16_82",
"AlterRefRecord_101_35_16_84")), .Names = "ClusterID", row.names = c(NA,
100L), class = "data.frame")
# Sample fixture: node-to-cluster membership table.
# A data frame with 112 rows and two columns: NodeID (integer) and
# ClusterID (character). One row per (node, cluster) pair; a cluster may span
# several rows and a node may appear under more than one cluster
# (e.g. 51924L, 85561L and 14671L each occur twice below).
WS_ClusterTable <- structure(list(NodeID = c(14240L, 133399L, 46191L, 15955L, 46531L,
38692L, 36740L, 11536L, 36966L, 43992L, 42118L, 12682L, 133206L,
25687L, 28591L, 129265L, 36848L, 44253L, 26883L, 32346L, 27122L,
23376L, 23432L, 31887L, 39870L, 99938L, 68767L, 37814L, 49133L,
26759L, 15957L, 32725L, 12758L, 45055L, 47234L, 12522L, 14671L,
42296L, 38910L, 46321L, 79613L, 32761L, 21281L, 51924L, 85561L,
16077L, 19069L, 16731L, 25087L, 24225L, 113682L, 27324L, 51568L,
55478L, 16468L, 51924L, 85561L, 18095L, 14734L, 115162L, 20198L,
52842L, 55552L, 41410L, 32734L, 23058L, 18259L, 51752L, 20268L,
11572L, 45063L, 20432L, 55151L, 43490L, 38843L, 89766L, 19283L,
31875L, 12352L, 38773L, 44337L, 31977L, 24609L, 38902L, 32049L,
41152L, 36610L, 20741L, 25882L, 14031L, 22963L, 41342L, 84910L,
37080L, 44297L, 26815L, 38627L, 51102L, 22480L, 39869L, 97999L,
68766L, 37828L, 14671L, 49224L, 15958L, 24890L, 42340L, 12564L,
42988L, 41671L, 36313L), ClusterID = c("AlterRefRecord_100_100_16_150",
"AlterRefRecord_100_100_16_150", "AlterRefRecord_100_101_16_151",
"AlterRefRecord_100_102_16_152", "AlterRefRecord_100_103_16_153",
"AlterRefRecord_100_105_16_155", "AlterRefRecord_100_106_16_156",
"AlterRefRecord_100_107_16_157", "AlterRefRecord_100_108_16_158",
"AlterRefRecord_100_10_16_58", "AlterRefRecord_100_111_16_161",
"AlterRefRecord_100_115_16_165", "AlterRefRecord_100_115_16_165",
"AlterRefRecord_100_119_16_169", "AlterRefRecord_100_120_16_170",
"AlterRefRecord_100_120_16_170", "AlterRefRecord_100_121_16_171",
"AlterRefRecord_100_122_16_172", "AlterRefRecord_100_131_16_181",
"AlterRefRecord_100_133_16_183", "AlterRefRecord_100_136_16_186",
"AlterRefRecord_100_139_16_189", "AlterRefRecord_100_13_16_61",
"AlterRefRecord_100_14_16_62", "AlterRefRecord_100_15_16_63",
"AlterRefRecord_100_15_16_63", "AlterRefRecord_100_17_16_65",
"AlterRefRecord_100_1_16_48", "AlterRefRecord_100_20_16_70",
"AlterRefRecord_100_23_16_72", "AlterRefRecord_100_27_16_76",
"AlterRefRecord_100_28_16_77", "AlterRefRecord_100_29_16_78",
"AlterRefRecord_100_2_16_49", "AlterRefRecord_100_30_16_79",
"AlterRefRecord_100_31_16_80", "AlterRefRecord_100_32_16_81",
"AlterRefRecord_100_33_16_82", "AlterRefRecord_100_35_16_84",
"AlterRefRecord_100_38_16_87", "AlterRefRecord_100_38_16_87",
"AlterRefRecord_100_39_16_88", "AlterRefRecord_100_41_16_90",
"AlterRefRecord_100_43_16_92", "AlterRefRecord_100_43_16_92",
"AlterRefRecord_100_44_16_93", "AlterRefRecord_100_47_16_96",
"AlterRefRecord_100_48_16_97", "AlterRefRecord_100_49_16_98",
"AlterRefRecord_100_4_16_52", "AlterRefRecord_100_4_16_52", "AlterRefRecord_100_54_16_103",
"AlterRefRecord_100_56_16_105", "AlterRefRecord_100_58_16_107",
"AlterRefRecord_100_59_16_108", "AlterRefRecord_100_62_16_111",
"AlterRefRecord_100_62_16_111", "AlterRefRecord_100_63_16_112",
"AlterRefRecord_100_64_16_113", "AlterRefRecord_100_64_16_113",
"AlterRefRecord_100_65_16_114", "AlterRefRecord_100_6_16_54",
"AlterRefRecord_100_71_16_121", "AlterRefRecord_100_72_16_122",
"AlterRefRecord_100_74_16_124", "AlterRefRecord_100_78_16_128",
"AlterRefRecord_100_84_16_134", "AlterRefRecord_100_85_16_135",
"AlterRefRecord_100_86_16_136", "AlterRefRecord_100_87_16_137",
"AlterRefRecord_100_88_16_138", "AlterRefRecord_100_89_16_139",
"AlterRefRecord_100_8_16_56", "AlterRefRecord_100_90_16_140",
"AlterRefRecord_100_91_16_141", "AlterRefRecord_100_91_16_141",
"AlterRefRecord_100_92_16_142", "AlterRefRecord_100_93_16_143",
"AlterRefRecord_100_94_16_144", "AlterRefRecord_100_95_16_145",
"AlterRefRecord_100_97_16_147", "AlterRefRecord_100_99_16_149",
"AlterRefRecord_101_101_16_151", "AlterRefRecord_101_102_16_152",
"AlterRefRecord_101_103_16_153", "AlterRefRecord_101_105_16_155",
"AlterRefRecord_101_106_16_156", "AlterRefRecord_101_108_16_158",
"AlterRefRecord_101_10_16_58", "AlterRefRecord_101_115_16_165",
"AlterRefRecord_101_119_16_169", "AlterRefRecord_101_120_16_170",
"AlterRefRecord_101_120_16_170", "AlterRefRecord_101_121_16_171",
"AlterRefRecord_101_122_16_172", "AlterRefRecord_101_131_16_181",
"AlterRefRecord_101_136_16_186", "AlterRefRecord_101_139_16_189",
"AlterRefRecord_101_13_16_61", "AlterRefRecord_101_15_16_63",
"AlterRefRecord_101_15_16_63", "AlterRefRecord_101_17_16_65",
"AlterRefRecord_101_1_16_48", "AlterRefRecord_101_20_16_70",
"AlterRefRecord_101_23_16_72", "AlterRefRecord_101_27_16_76",
"AlterRefRecord_101_28_16_77", "AlterRefRecord_101_30_16_79",
"AlterRefRecord_101_31_16_80", "AlterRefRecord_101_32_16_81",
"AlterRefRecord_101_33_16_82", "AlterRefRecord_101_35_16_84")), .Names = c("NodeID",
"ClusterID"), row.names = c(NA, -112L), class = "data.frame")
library(doParallel)
library(foreach)
# Merge clusters that are linked through shared nodes, processing one chunk of
# starting cluster IDs per parallel task.
#
# Reads:  master_UniqueClusterIDs (data frame with a ClusterID column) and
#         WS_ClusterTable (data frame with NodeID and ClusterID columns).
# Result: BS_ClusterResults, a list with one data frame per chunk, each with
#         the columns NodeID and BS_ClusterID.
#
# NOTE(review): every task searches the full WS_ClusterTable but removes the
# clusters it found only from its own chunk, so the same merged cluster can be
# reported by several tasks under different BS_ClusterID labels. De-duplicate
# after combining the per-chunk tables.

# split() only creates groups for labels that sample() actually drew, so there
# may be fewer than 32 chunks; the loop below iterates over what exists.
SplitLists <- split(
  master_UniqueClusterIDs,
  sample(1:32, nrow(master_UniqueClusterIDs), replace = TRUE)
)
registerDoParallel(7)
getDoParWorkers()

BS_ClusterResults <- foreach(ggg = seq_along(SplitLists)) %dopar% {
  print(ggg)
  UniqueClusterIDs <- SplitLists[[ggg]][, 1]
  # Incremented before each use, so the first label carries the number 2.
  BS_ClusterID <- 1
  # Collect the rows of each merged cluster in a list and bind them once at
  # the end instead of growing a data frame with rbind() on every iteration.
  ClusterRows <- list()

  while (length(UniqueClusterIDs) > 0) {
    print(paste(ggg, "big", length(UniqueClusterIDs)))
    StartID <- UniqueClusterIDs[1]
    # `$` on a data frame yields a plain vector, so index it with a logical
    # mask and measure it with length(), not `[, 1]` / nrow().
    NodeIDs_start <- unique(
      WS_ClusterTable$NodeID[WS_ClusterTable$ClusterID == StartID]
    )

    # Alternate nodes -> clusters -> nodes until the node set stops growing.
    repeat {
      ClusterIDs <- unique(
        WS_ClusterTable$ClusterID[WS_ClusterTable$NodeID %in% NodeIDs_start]
      )
      NodeIDs_2 <- unique(
        WS_ClusterTable$NodeID[WS_ClusterTable$ClusterID %in% ClusterIDs]
      )
      if (length(NodeIDs_2) <= length(NodeIDs_start)) {
        break
      }
      NodeIDs_start <- NodeIDs_2
    }

    # A start ID with no rows in WS_ClusterTable produces no nodes; skip the
    # insert in that case rather than building a malformed data frame.
    if (length(NodeIDs_2) > 0) {
      BS_ClusterID <- BS_ClusterID + 1
      FinalID <- paste("BS_ClusterID", BS_ClusterID, ggg, sep = "_")
      ClusterRows[[length(ClusterRows) + 1]] <- data.frame(
        NodeID = NodeIDs_2,
        BS_ClusterID = FinalID
      )
    }

    # Always drop the start ID as well, so the loop terminates even when the
    # start ID is absent from WS_ClusterTable.
    UniqueClusterIDs <- UniqueClusterIDs[
      !UniqueClusterIDs %in% c(StartID, ClusterIDs)
    ]
  }

  # The value of the last expression is what foreach collects for this task.
  if (length(ClusterRows) > 0) {
    do.call(rbind, ClusterRows)
  } else {
    data.frame()
  }
}