Python multiprocessing:合并来自多个进程的嵌套字典(字典的字典)

时间:2017-07-05 19:03:44

标签: python dictionary multiprocessing shared-memory shared

我试图在多个进程中使用共享内存来更新一个包含字典的字典……我尝试使用 multiprocessing 模块中的 Manager,但是很难向其中添加嵌套的字典。请参阅下面的代码和注释。本质上,这段代码应该在另一个名为 "output" 的字典中创建 input 的副本。一旦这部分能够正常工作,我会再加入逻辑,只从 input 中复制某些"刀片"(blade),但必须保持节点/集群/刀片的层次结构。

from multiprocessing import Process, Lock, Manager

# Example topology used throughout: node -> 'clusters' -> cluster -> 'blades'
# -> blade name -> blade IP. Both nodes share the same cluster/blade layout
# and differ only in their own IP, so the entries are generated per node.
input = {
    node_name: {
        "IP": node_ip,
        "clusters": {
            "cluster_1": {
                "blades": {"blade_0_1": "127.0.1.1", "blade_0_2": "127.0.1.2"},
            },
            "cluster_2": {
                "blades": {"blade_0_3": "127.0.1.3", "blade_0_4": "127.0.1.4"},
            },
        },
    }
    for node_name, node_ip in (("Node_1", "127.0.0.1"), ("Node_2", "127.0.0.2"))
}

def iterate_over_clusters_in_node(input, node, lock, output):
    """Report every blade of one node and add it to the output dictionary.

    Walks the clusters of ``input[node]`` and, for each blade, prints a
    summary line and passes the blade on to ``add_blade_to_output``.

    Args:
        input: Nested dict shaped as
            node -> 'clusters' -> cluster -> 'blades' -> blade -> blade IP,
            with an 'IP' entry on each node.
        node: Key of the node in ``input`` to process.
        lock: Lock shared with the other worker processes; held while
            printing and while updating ``output``.
        output: Dictionary handed through to ``add_blade_to_output``.
    """
    node_info = input[node]
    for cluster, cluster_info in node_info['clusters'].items():
        for blade, blade_ip in cluster_info['blades'].items():
            # Take the lock once per blade so the log line and the matching
            # update are not separated by another process's output.
            with lock:
                # Single-argument print(...) behaves the same on Python 2 and 3.
                print("node: {0}, node_IP: {1}, cluster: {2}, Blade: {3}, cluster_IP: {4}".format(
                    node, node_info['IP'], cluster, blade, blade_ip))
                add_blade_to_output(input, node, cluster, blade, output)

def add_blade_to_output(input, node, cluster, blade, output):
    '''Add one blade to the managed output dictionary.

    Creates the node entry (with its 'IP' and an empty 'clusters' dict) and
    the cluster entry in ``output`` on first use, then copies the blade's
    value from ``input``.

    NOTE(review): this is the failing version discussed in the question.
    When ``output`` is a ``Manager().dict()`` proxy, changes made to a mutable
    value stored inside the proxy are not propagated through the manager;
    only assignments made directly on the proxy are. See the inline comments.
    '''
    if node not in output:
        # Direct assignment on the proxy: this empty dict does reach the manager.
        output[node] = {}
        # These two writes go into a mutable value held by the proxy, so they
        # are not propagated; the stored node entry stays {}.
        output[node]['IP'] = input[node]['IP']
        output[node]['clusters'] = {}
    # At this point, I would expect output[node]['IP'] and output[node]['clusters'] to exist
    # But the following print raises KeyError: 'IP'
    print output[node]['IP']
    if cluster not in output[node]['clusters']:
        # Raises KeyError: 'clusters'
        output[node]['clusters'][cluster] = {}
        output[node]['clusters'][cluster]['blades'] = {}
    # Same limitation: a nested write like this would not be propagated either.
    output[node]['clusters'][cluster]['blades'][blade] = input[node]['clusters'][cluster]['blades'][blade]

if __name__ == "__main__":
    # Manager-backed dictionary: a plain dict would not be shared between the
    # worker processes. It collects the blades reported by every worker.
    manager = Manager()
    output = manager.dict()

    # One lock for all workers so that printing and updates are serialised
    lock = Lock()

    # One worker process per node of the input
    procs = [
        Process(target=iterate_over_clusters_in_node, args=(input, node, lock, output))
        for node in input
    ]
    for worker in procs:
        worker.start()

    # Block until every worker has finished
    for worker in procs:
        worker.join()

    print("The final output is:")
    print(output)
    # Expectation: should print the same dictionary as the input
    # Actual: prints "{'Node_2': {}, 'Node_1': {}}"

我是否需要把 manager.dict()(而不是内置的 dict 类型)赋给 output[node]?还是说我的整个做法都错了?

谢谢!

编辑:我不反对把它改成"线程"(threading)实现而不是"多进程"(multiprocessing)实现。我刚接触并行编程,所以如果线程更适合这种类型的内存共享,请告诉我。

编辑:这是工作代码:

from multiprocessing import Process, Lock, Manager

# Example topology used by this script: node -> 'clusters' -> cluster ->
# 'blades' -> blade name -> blade IP. The two nodes have identical
# cluster/blade contents and differ only in their own IP.
input = {
    node_name: {
        "IP": node_ip,
        "clusters": {
            "cluster_1": {
                "blades": {"blade_0_1": "127.0.1.1", "blade_0_2": "127.0.1.2"},
            },
            "cluster_2": {
                "blades": {"blade_0_3": "127.0.1.3", "blade_0_4": "127.0.1.4"},
            },
        },
    }
    for node_name, node_ip in (("Node_1", "127.0.0.1"), ("Node_2", "127.0.0.2"))
}

# Create dictionary to hold any failed blades so that appropriate measures can be taken
# Must use a Manager so that the dictionary can be shared among processes
# NOTE(review): these are created at import time rather than under the
# `__main__` guard, and the worker functions below read `output` as a module
# global instead of receiving it as an argument. Presumably this relies on
# child processes inheriting module state (fork start method) -- confirm
# before running on a platform that spawns fresh interpreters.
# `manager` is kept in its own name so the manager object stays referenced
# for as long as `output` is in use.
manager = Manager()
output = manager.dict()

def iterate_over_clusters_in_node(input, node, lock):
    """Feed every blade of ``input[node]`` to ``add_blade_to_output``.

    Clusters are visited in dictionary order and each cluster's blades in
    turn; the shared ``lock`` is held around every individual update.
    """
    clusters = input[node]['clusters']
    # Lazily enumerate (cluster, blade) key pairs in the original visiting order
    blade_keys = (
        (cluster_name, blade_name)
        for cluster_name in clusters
        for blade_name in clusters[cluster_name]['blades']
    )
    for cluster_name, blade_name in blade_keys:
        with lock:
            add_blade_to_output(input, node, cluster_name, blade_name)

def add_blade_to_output(input, node, cluster, blade):
    """Add one blade to the module-level managed ``output`` dictionary.

    The node entry (``{'IP': ..., 'clusters': {}}``) and the cluster entry
    (``{'blades': {}}``) are created on first use, then the blade's value is
    copied over from ``input``.

    Only the entry for ``node`` is read from and written back to ``output``,
    instead of copying the whole shared dictionary out and pushing all of it
    back on every call.

    The read-modify-write below is not atomic by itself; the caller in this
    file invokes this function while holding the shared lock.

    Args:
        input: Nested dict shaped as
            node -> 'clusters' -> cluster -> 'blades' -> blade -> blade IP,
            with an 'IP' entry on each node.
        node: Key of the node the blade belongs to.
        cluster: Key of the cluster within that node.
        blade: Key of the blade within that cluster.

    Raises:
        KeyError: If ``node``, ``cluster`` or ``blade`` is missing from ``input``.
    """
    # Work on a local object for this node: changes to mutable values held by
    # a manager dict are not propagated until they are assigned back.
    node_entry = output.get(node)
    if node_entry is None:
        node_entry = {'IP': input[node]['IP'], 'clusters': {}}

    blades = node_entry['clusters'].setdefault(cluster, {'blades': {}})['blades']
    blades[blade] = input[node]['clusters'][cluster]['blades'][blade]

    # Reassigning the modified entry is what makes the update visible to others.
    output[node] = node_entry

if __name__ == "__main__":
    # One lock for all workers so that updates to the output are serialised
    lock = Lock()

    # One worker process per node of the input
    procs = [
        Process(target=iterate_over_clusters_in_node, args=(input, node, lock))
        for node in input
    ]
    for worker in procs:
        worker.start()

    # Block until every worker has finished
    for worker in procs:
        worker.join()

    print("The final output is:")
    print(output)

1 个答案:

答案 0 :(得分:1)

根据python文档,

  

对 dict 和 list 代理中可变值或元素的修改不会通过管理器传播,因为代理无法知道其值或元素何时被修改。要修改这样的元素,可以把修改后的对象重新赋值给容器代理。

有了这些信息,我们可以相应地更新由管理器托管的字典:

# The original sequence
#     output[node] = {}
#     output[node]['IP'] = input[node]['IP']
#     output[node]['clusters'] = {}
# only gets the first (direct) assignment through the manager; the writes into
# output[node][...] are not propagated. Build the complete node entry first
# and store it on the proxy in one step instead.
output[node] = {'IP': input[node]['IP'], 'clusters': {}}



# Writing through the proxy, as in
#     output[node]['clusters'][cluster] = {}
#     output[node]['clusters'][cluster]['blades'] = {}
#     output[node]['clusters'][cluster]['blades'][blade] = ...
# raises KeyError: 'clusters' / is not propagated, because changes to mutable
# values held by the manager dict never reach the manager.
#
# Instead, take this node's entry, modify it locally and assign it back.
# Only the affected node is transferred, so entries of other nodes are not
# rewritten as a side effect.
node_entry = output[node]
# setdefault already covers the "cluster not present yet" case
blades = node_entry['clusters'].setdefault(cluster, {'blades': {}})['blades']
blades[blade] = input[node]['clusters'][cluster]['blades'][blade]
output[node] = node_entry

参考