如何并行聚类值?

时间:2018-02-07 19:37:41

标签: python r algorithm parallel-processing cluster-analysis

可能有一个简单的解决方案,但我无法想到它。

我有一个包含两列的表 - NodeID和ClusterID。每个集群都由节点组成,同一个节点可以出现在多个集群中。我想要做的是遍历ClusterID列表,把通过共同节点相互关联的集群归并到一起(即对集群再做一次聚类)。

我经历的过程是:

  1. 选择第一个ClusterID,然后选择与该ClusterID相关联的所有NodeID
  2. 选择与这些NodeID关联的所有ClusterID。
  3. 我重复此操作,直到NodeID列表的大小没有增加。
  4. 然后我返回到ClusterID的完整列表,我删除了工作列表中出现的所有内容。
  5. 然后我选择下一个起始ID并重复该过程。

    这种做法可行,但是集群有数千个,运行一次可能需要几个小时。因此,我想把代码改成并行运行,但我想不出怎样做才不会出问题。

    假设您创建了一个静态的主列表,可以把clusterID列表拆分成多个部分,让每个worker用同样的流程处理其中一部分,但找到的clusterID只从该worker自己的工作列表中删除,而不是从主列表中删除。这样可行,但是当相互关联的clusterID分散在不同worker的列表中时,就会产生重复的结果。

    如果我让它们全部运行,那么我将不得不返回并通过查看NodeID是否出现在多个列表中来检查重复项。这是可行的,但似乎效率低下。

    有没有更有效的方法来完成这项工作?

            # Example work list: a one-column data.frame (column ClusterID, character)
            # holding the cluster IDs that the script further down splits across
            # parallel tasks. The structure(...) form looks like dput() output;
            # row.names = c(NA, 100L) is R's compact row-name encoding, here for
            # 100 rows.
            master_UniqueClusterIDs <- structure(list(ClusterID = c("AlterRefRecord_100_100_16_150", 
    "AlterRefRecord_100_101_16_151", "AlterRefRecord_100_102_16_152", 
    "AlterRefRecord_100_103_16_153", "AlterRefRecord_100_105_16_155", 
    "AlterRefRecord_100_106_16_156", "AlterRefRecord_100_107_16_157", 
    "AlterRefRecord_100_108_16_158", "AlterRefRecord_100_10_16_58", 
    "AlterRefRecord_100_111_16_161", "AlterRefRecord_100_115_16_165", 
    "AlterRefRecord_100_119_16_169", "AlterRefRecord_100_120_16_170", 
    "AlterRefRecord_100_121_16_171", "AlterRefRecord_100_122_16_172", 
    "AlterRefRecord_100_131_16_181", "AlterRefRecord_100_133_16_183", 
    "AlterRefRecord_100_136_16_186", "AlterRefRecord_100_139_16_189", 
    "AlterRefRecord_100_13_16_61", "AlterRefRecord_100_14_16_62", 
    "AlterRefRecord_100_15_16_63", "AlterRefRecord_100_17_16_65", 
    "AlterRefRecord_100_1_16_48", "AlterRefRecord_100_20_16_70", 
    "AlterRefRecord_100_23_16_72", "AlterRefRecord_100_27_16_76", 
    "AlterRefRecord_100_28_16_77", "AlterRefRecord_100_29_16_78", 
    "AlterRefRecord_100_2_16_49", "AlterRefRecord_100_30_16_79", 
    "AlterRefRecord_100_31_16_80", "AlterRefRecord_100_32_16_81", 
    "AlterRefRecord_100_33_16_82", "AlterRefRecord_100_35_16_84", 
    "AlterRefRecord_100_38_16_87", "AlterRefRecord_100_39_16_88", 
    "AlterRefRecord_100_41_16_90", "AlterRefRecord_100_43_16_92", 
    "AlterRefRecord_100_44_16_93", "AlterRefRecord_100_47_16_96", 
    "AlterRefRecord_100_48_16_97", "AlterRefRecord_100_49_16_98", 
    "AlterRefRecord_100_4_16_52", "AlterRefRecord_100_54_16_103", 
    "AlterRefRecord_100_56_16_105", "AlterRefRecord_100_58_16_107", 
    "AlterRefRecord_100_59_16_108", "AlterRefRecord_100_62_16_111", 
    "AlterRefRecord_100_63_16_112", "AlterRefRecord_100_64_16_113", 
    "AlterRefRecord_100_65_16_114", "AlterRefRecord_100_6_16_54", 
    "AlterRefRecord_100_71_16_121", "AlterRefRecord_100_72_16_122", 
    "AlterRefRecord_100_74_16_124", "AlterRefRecord_100_78_16_128", 
    "AlterRefRecord_100_84_16_134", "AlterRefRecord_100_85_16_135", 
    "AlterRefRecord_100_86_16_136", "AlterRefRecord_100_87_16_137", 
    "AlterRefRecord_100_88_16_138", "AlterRefRecord_100_89_16_139", 
    "AlterRefRecord_100_8_16_56", "AlterRefRecord_100_90_16_140", 
    "AlterRefRecord_100_91_16_141", "AlterRefRecord_100_92_16_142", 
    "AlterRefRecord_100_93_16_143", "AlterRefRecord_100_94_16_144", 
    "AlterRefRecord_100_95_16_145", "AlterRefRecord_100_97_16_147", 
    "AlterRefRecord_100_99_16_149", "AlterRefRecord_101_101_16_151", 
    "AlterRefRecord_101_102_16_152", "AlterRefRecord_101_103_16_153", 
    "AlterRefRecord_101_105_16_155", "AlterRefRecord_101_106_16_156", 
    "AlterRefRecord_101_108_16_158", "AlterRefRecord_101_10_16_58", 
    "AlterRefRecord_101_115_16_165", "AlterRefRecord_101_119_16_169", 
    "AlterRefRecord_101_120_16_170", "AlterRefRecord_101_121_16_171", 
    "AlterRefRecord_101_122_16_172", "AlterRefRecord_101_131_16_181", 
    "AlterRefRecord_101_136_16_186", "AlterRefRecord_101_139_16_189", 
    "AlterRefRecord_101_13_16_61", "AlterRefRecord_101_15_16_63", 
    "AlterRefRecord_101_17_16_65", "AlterRefRecord_101_1_16_48", 
    "AlterRefRecord_101_20_16_70", "AlterRefRecord_101_23_16_72", 
    "AlterRefRecord_101_27_16_76", "AlterRefRecord_101_28_16_77", 
    "AlterRefRecord_101_30_16_79", "AlterRefRecord_101_31_16_80", 
    "AlterRefRecord_101_32_16_81", "AlterRefRecord_101_33_16_82", 
    "AlterRefRecord_101_35_16_84")), .Names = "ClusterID", row.names = c(NA, 
    100L), class = "data.frame")
    
            # Example membership table: a data.frame with columns NodeID (integer) and
            # ClusterID (character); row.names = c(NA, -112L) declares 112 rows. Each
            # row pairs one node with one cluster. A cluster ID can occupy several
            # rows (e.g. "AlterRefRecord_100_100_16_150" has two nodes) and a node ID
            # can recur under different clusters (e.g. 51924L and 14671L each appear
            # twice), which is what links clusters together in the script below.
            WS_ClusterTable <- structure(list(NodeID = c(14240L, 133399L, 46191L, 15955L, 46531L, 
    38692L, 36740L, 11536L, 36966L, 43992L, 42118L, 12682L, 133206L, 
    25687L, 28591L, 129265L, 36848L, 44253L, 26883L, 32346L, 27122L, 
    23376L, 23432L, 31887L, 39870L, 99938L, 68767L, 37814L, 49133L, 
    26759L, 15957L, 32725L, 12758L, 45055L, 47234L, 12522L, 14671L, 
    42296L, 38910L, 46321L, 79613L, 32761L, 21281L, 51924L, 85561L, 
    16077L, 19069L, 16731L, 25087L, 24225L, 113682L, 27324L, 51568L, 
    55478L, 16468L, 51924L, 85561L, 18095L, 14734L, 115162L, 20198L, 
    52842L, 55552L, 41410L, 32734L, 23058L, 18259L, 51752L, 20268L, 
    11572L, 45063L, 20432L, 55151L, 43490L, 38843L, 89766L, 19283L, 
    31875L, 12352L, 38773L, 44337L, 31977L, 24609L, 38902L, 32049L, 
    41152L, 36610L, 20741L, 25882L, 14031L, 22963L, 41342L, 84910L, 
    37080L, 44297L, 26815L, 38627L, 51102L, 22480L, 39869L, 97999L, 
    68766L, 37828L, 14671L, 49224L, 15958L, 24890L, 42340L, 12564L, 
    42988L, 41671L, 36313L), ClusterID = c("AlterRefRecord_100_100_16_150", 
    "AlterRefRecord_100_100_16_150", "AlterRefRecord_100_101_16_151", 
    "AlterRefRecord_100_102_16_152", "AlterRefRecord_100_103_16_153", 
    "AlterRefRecord_100_105_16_155", "AlterRefRecord_100_106_16_156", 
    "AlterRefRecord_100_107_16_157", "AlterRefRecord_100_108_16_158", 
    "AlterRefRecord_100_10_16_58", "AlterRefRecord_100_111_16_161", 
    "AlterRefRecord_100_115_16_165", "AlterRefRecord_100_115_16_165", 
    "AlterRefRecord_100_119_16_169", "AlterRefRecord_100_120_16_170", 
    "AlterRefRecord_100_120_16_170", "AlterRefRecord_100_121_16_171", 
    "AlterRefRecord_100_122_16_172", "AlterRefRecord_100_131_16_181", 
    "AlterRefRecord_100_133_16_183", "AlterRefRecord_100_136_16_186", 
    "AlterRefRecord_100_139_16_189", "AlterRefRecord_100_13_16_61", 
    "AlterRefRecord_100_14_16_62", "AlterRefRecord_100_15_16_63", 
    "AlterRefRecord_100_15_16_63", "AlterRefRecord_100_17_16_65", 
    "AlterRefRecord_100_1_16_48", "AlterRefRecord_100_20_16_70", 
    "AlterRefRecord_100_23_16_72", "AlterRefRecord_100_27_16_76", 
    "AlterRefRecord_100_28_16_77", "AlterRefRecord_100_29_16_78", 
    "AlterRefRecord_100_2_16_49", "AlterRefRecord_100_30_16_79", 
    "AlterRefRecord_100_31_16_80", "AlterRefRecord_100_32_16_81", 
    "AlterRefRecord_100_33_16_82", "AlterRefRecord_100_35_16_84", 
    "AlterRefRecord_100_38_16_87", "AlterRefRecord_100_38_16_87", 
    "AlterRefRecord_100_39_16_88", "AlterRefRecord_100_41_16_90", 
    "AlterRefRecord_100_43_16_92", "AlterRefRecord_100_43_16_92", 
    "AlterRefRecord_100_44_16_93", "AlterRefRecord_100_47_16_96", 
    "AlterRefRecord_100_48_16_97", "AlterRefRecord_100_49_16_98", 
    "AlterRefRecord_100_4_16_52", "AlterRefRecord_100_4_16_52", "AlterRefRecord_100_54_16_103", 
    "AlterRefRecord_100_56_16_105", "AlterRefRecord_100_58_16_107", 
    "AlterRefRecord_100_59_16_108", "AlterRefRecord_100_62_16_111", 
    "AlterRefRecord_100_62_16_111", "AlterRefRecord_100_63_16_112", 
    "AlterRefRecord_100_64_16_113", "AlterRefRecord_100_64_16_113", 
    "AlterRefRecord_100_65_16_114", "AlterRefRecord_100_6_16_54", 
    "AlterRefRecord_100_71_16_121", "AlterRefRecord_100_72_16_122", 
    "AlterRefRecord_100_74_16_124", "AlterRefRecord_100_78_16_128", 
    "AlterRefRecord_100_84_16_134", "AlterRefRecord_100_85_16_135", 
    "AlterRefRecord_100_86_16_136", "AlterRefRecord_100_87_16_137", 
    "AlterRefRecord_100_88_16_138", "AlterRefRecord_100_89_16_139", 
    "AlterRefRecord_100_8_16_56", "AlterRefRecord_100_90_16_140", 
    "AlterRefRecord_100_91_16_141", "AlterRefRecord_100_91_16_141", 
    "AlterRefRecord_100_92_16_142", "AlterRefRecord_100_93_16_143", 
    "AlterRefRecord_100_94_16_144", "AlterRefRecord_100_95_16_145", 
    "AlterRefRecord_100_97_16_147", "AlterRefRecord_100_99_16_149", 
    "AlterRefRecord_101_101_16_151", "AlterRefRecord_101_102_16_152", 
    "AlterRefRecord_101_103_16_153", "AlterRefRecord_101_105_16_155", 
    "AlterRefRecord_101_106_16_156", "AlterRefRecord_101_108_16_158", 
    "AlterRefRecord_101_10_16_58", "AlterRefRecord_101_115_16_165", 
    "AlterRefRecord_101_119_16_169", "AlterRefRecord_101_120_16_170", 
    "AlterRefRecord_101_120_16_170", "AlterRefRecord_101_121_16_171", 
    "AlterRefRecord_101_122_16_172", "AlterRefRecord_101_131_16_181", 
    "AlterRefRecord_101_136_16_186", "AlterRefRecord_101_139_16_189", 
    "AlterRefRecord_101_13_16_61", "AlterRefRecord_101_15_16_63", 
    "AlterRefRecord_101_15_16_63", "AlterRefRecord_101_17_16_65", 
    "AlterRefRecord_101_1_16_48", "AlterRefRecord_101_20_16_70", 
    "AlterRefRecord_101_23_16_72", "AlterRefRecord_101_27_16_76", 
    "AlterRefRecord_101_28_16_77", "AlterRefRecord_101_30_16_79", 
    "AlterRefRecord_101_31_16_80", "AlterRefRecord_101_32_16_81", 
    "AlterRefRecord_101_33_16_82", "AlterRefRecord_101_35_16_84")), .Names = c("NodeID", 
    "ClusterID"), row.names = c(NA, -112L), class = "data.frame")
    
    
        library(doParallel)
        library(foreach)
    
        # Deal the cluster IDs at random into at most 32 work lists. split() only
        # creates a group for labels that were actually drawn, so the list can be
        # shorter than 32; the loop below therefore iterates over
        # seq_along(SplitLists) instead of a fixed 1:32.
        SplitLists <- split(
          master_UniqueClusterIDs,
          sample(1:32, nrow(master_UniqueClusterIDs), replace = TRUE)
        )

        registerDoParallel(7)
        getDoParWorkers()

        # Each task takes one work list and, for every cluster ID still on it,
        # grows the full group of clusters that are linked through shared NodeIDs.
        # The lookups always go against the complete WS_ClusterTable, so a group is
        # complete no matter which task finds it. A task returns a data.frame with
        # columns NodeID and BS_ClusterID, or NULL when it found nothing.
        BS_ClusterTables <- foreach(ggg = seq_along(SplitLists)) %dopar% {
          print(ggg)
          UniqueClusterIDs <- SplitLists[[ggg]][, 1]
          BS_ClusterID <- 1 ### Start
          # One slot per possible group; filled slots are bound together once at
          # the end instead of calling rbind() on every iteration.
          BS_ClusterRows <- vector("list", length(UniqueClusterIDs))
          n_found <- 0L

          while (length(UniqueClusterIDs) > 0) {
            print(paste(ggg, "big", length(UniqueClusterIDs)))

            SeedClusterID <- UniqueClusterIDs[1]
            in_seed <- WS_ClusterTable$ClusterID %in% SeedClusterID
            NodeIDs_start <- unique(WS_ClusterTable$NodeID[in_seed])

            # Alternate node -> cluster -> node lookups until the node set stops
            # growing. The node set can only grow or stay the same, so an equal
            # length means a fixed point has been reached.
            repeat {
              has_node <- WS_ClusterTable$NodeID %in% NodeIDs_start
              ClusterIDs <- unique(WS_ClusterTable$ClusterID[has_node])
              in_group <- WS_ClusterTable$ClusterID %in% ClusterIDs
              NodeIDs_2 <- unique(WS_ClusterTable$NodeID[in_group])
              if (length(NodeIDs_2) <= length(NodeIDs_start)) {
                break
              }
              NodeIDs_start <- NodeIDs_2
            }

            # Record the finished group. A seed with no rows in WS_ClusterTable
            # yields no nodes and is skipped rather than producing an invalid
            # zero-row/one-row data.frame() call.
            if (length(NodeIDs_2) > 0) {
              BS_ClusterID <- BS_ClusterID + 1
              FinalID <- paste("BS_ClusterID", BS_ClusterID, ggg, sep = "_")
              n_found <- n_found + 1L
              BS_ClusterRows[[n_found]] <- data.frame(
                NodeID = NodeIDs_2,
                BS_ClusterID = FinalID,
                stringsAsFactors = FALSE
              )
            }

            # Remove every cluster of this group from the work list. The seed is
            # removed explicitly so the loop always makes progress, even when the
            # seed does not occur in WS_ClusterTable.
            done <- UniqueClusterIDs %in% c(SeedClusterID, ClusterIDs)
            UniqueClusterIDs <- UniqueClusterIDs[!done]
          }

          # Unused (NULL) slots are ignored by rbind(); all-NULL gives NULL.
          do.call(rbind, BS_ClusterRows)
        }

        # Tasks whose work lists hold clusters of the same group each report that
        # group under their own BS_ClusterID. Because every reported group is
        # complete, such copies contain exactly the same NodeIDs and different
        # groups share no NodeID, so keeping the first row seen for each NodeID
        # keeps exactly one copy of every group.
        BS_ClusterTable <- do.call(rbind, BS_ClusterTables)
        if (!is.null(BS_ClusterTable)) {
          first_seen <- !duplicated(BS_ClusterTable$NodeID)
          BS_ClusterTable <- BS_ClusterTable[first_seen, , drop = FALSE]
        }

        # Release the workers started implicitly by registerDoParallel(7).
        stopImplicitCluster()
    

0 个答案:

没有答案