I am trying to parallelize a for loop in Python using the multiprocessing package. The function I want to run in parallel is this:
def regression_loss(W, k, x, y):
    U = expit(x.dot(W[k].T)) - y
    return np.sum(U*U)
It computes the error of a multi-class regression problem over several data points x. W is the weight matrix and y holds the targets.
At the moment, the parallel for loop is about 200 times slower than the serial one. I would like to know why that is, and how to make the parallel for loop considerably faster than the serial one.

Here is my code, in which I compare a standard for loop against a loop parallelized with the multiprocessing module.
import time
import numpy as np
import multiprocessing as mp
from scipy.special import expit

def regression_loss(W, k, x, y):
    U = expit(x.dot(W[k].T)) - y
    return np.sum(U*U)

def optimizer_seriell(p_size, n_classes, n_input, batch_size, W):
    loss = np.zeros((p_size))
    x, y = np.random.rand(batch_size, n_input), np.random.rand(batch_size, n_classes)
    for k in range(p_size):
        loss[k] = regression_loss(W, k, x, y)

def optimizer_parallel(p_size, n_classes, n_input, batch_size, W):
    pool = mp.Pool(processes = 4)
    x, y = np.random.rand(batch_size, n_input), np.random.rand(batch_size, n_classes)
    loss = [pool.apply(regression_loss, args=(W, k, x, y)) for k in range(p_size)]

if __name__ == "__main__":
    p_size = 32
    n_classes = 10
    n_input = 1000
    batch_size = 8
    W = [np.random.rand(n_classes, n_input) for k in range(p_size)]

    t0 = time.time()
    optimizer_seriell(p_size, n_classes, n_input, batch_size, W)
    print(time.time()-t0)  # 0.00186 on my machine

    t0 = time.time()
    optimizer_parallel(p_size, n_classes, n_input, batch_size, W)
    print(time.time()-t0)  # 0.20029 on my machine
Answer 0 (score: 0)
I ran some tests of my own; here are the results. I think you want to use pool.map rather than apply, especially if you want to keep the results in order.
import time
import numpy as np
import multiprocessing as mp
from scipy.special import expit

def regression_loss(test):  # a little lazy to work out how to pass multiple variables
    W = test[0]
    k = test[1]
    x = test[2]
    y = test[3]
    U = expit(x.dot(W[k].T)) - y
    time.sleep(1)  # simulate lots of hard work
    return np.sum(U*U)

def optimizer_seriell(x, y, p_size, n_classes, n_input, batch_size, W):
    loss = np.zeros((p_size))
    for k in range(p_size):
        loss[k] = regression_loss((W, k, x, y))

def optimizer_parallel(x, y, p_size, n_classes, n_input, batch_size, W):
    with mp.Pool(processes = 4) as pool:
        loss = [pool.map(regression_loss, [(W, k, x, y) for k in range(p_size)])]

if __name__ == "__main__":
    p_size = 32
    n_classes = 10
    n_input = 1000
    batch_size = 8
    W = [np.random.rand(n_classes, n_input) for k in range(p_size)]
    x, y = np.random.rand(batch_size, n_input), np.random.rand(batch_size, n_classes)

    t0 = time.time()
    optimizer_seriell(x, y, p_size, n_classes, n_input, batch_size, W)
    print(time.time()-t0)

    t0 = time.time()
    optimizer_parallel(x, y, p_size, n_classes, n_input, batch_size, W)
    print(time.time()-t0)
This gives:
32.018938064575195 for serial
9.142435073852539 for parallel (a clear saving when every iteration takes 1 second of work)
When I tried apply, as you did, there was no time saving at all. That is because apply blocks until its result is ready, even when it is called inside a list comprehension, so no parallel processing actually happens.
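If you want to keep the apply-style call, a non-blocking alternative is pool.apply_async, which returns an AsyncResult immediately and only blocks when you call .get() on it. A minimal sketch, assuming the imports and the tuple-taking regression_loss from the code above (the name optimizer_parallel_async is made up for this sketch):

def optimizer_parallel_async(x, y, p_size, W):
    with mp.Pool(processes=4) as pool:
        # Submit all tasks first; apply_async returns immediately,
        # so the four workers can run at the same time.
        results = [pool.apply_async(regression_loss, args=((W, k, x, y),))
                   for k in range(p_size)]
        # .get() blocks only until the corresponding task has finished.
        loss = [r.get() for r in results]
    return loss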
What your example shows is not really a problem with multiprocessing itself; rather, the overhead (starting the worker processes and pickling the arguments for every call) overshadows the gains when each call does so little work. One way to shrink the per-call payload is sketched at the end of this answer.
EDIT: timings with apply everywhere:
serial -> 32.02 s
parallel -> 34.04 s  # no actual parallel processing
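On the overhead point: in the original code every task ships the whole W (32 matrices of shape 10x1000) plus x and y to a worker process, only to do a fraction of a millisecond of arithmetic. One possible way to cut that down, sketched here under the same setup as the question (the names regression_loss_single and optimizer_parallel_light are invented for this sketch, and the imports from the code above are assumed), is to send each task only the W[k] it needs and let map batch tasks via chunksize:

def regression_loss_single(args):
    # Each task receives only its own weight matrix, not the whole list W.
    Wk, x, y = args
    U = expit(x.dot(Wk.T)) - y
    return np.sum(U*U)

def optimizer_parallel_light(x, y, p_size, W):
    with mp.Pool(processes=4) as pool:
        # chunksize groups several tasks into one pickling/IPC round trip.
        loss = pool.map(regression_loss_single,
                        [(W[k], x, y) for k in range(p_size)],
                        chunksize=8)
    return loss

For work as small as in the question, though, pickling x, y, and W[k] may still dominate, so the plain serial loop can remain the faster option.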