I am running a program in Python that uses threads to parallelise a task. The task is simple string matching: I match a large number of short strings against a database of long strings. When I tried to parallelise it, I decided to split the list of short strings into a number of sublists equal to the number of cores and run each sublist on a separate core. However, when I run the task on 5 or 10 cores, it is about twice as slow as on a single core. What could the reason be, and how might I fix it?
Edit: my code can be seen below
import sys
import os
import csv
import re
import threading
from Queue import Queue
from time import sleep
from threading import Lock

q_in = Queue()
q_out = Queue()
lock = Lock()

def ceil(nu):
    if int(nu) == nu:
        return int(nu)
    else:
        return int(nu) + 1

def opencsv(csvv):
    # read the peptide column, stripping modification masses such as "(+15.99)"
    with open(csvv) as csvfile:
        peptides = []
        reader = csv.DictReader(csvfile)
        k = 0
        lon = ""
        for row in reader:
            pept = str(row["Peptide"])
            pept = re.sub(r"\((\+\d+\.\d+)\)", "", pept)  # raw string avoids invalid-escape warnings
            peptides.append(pept)
    return peptides

def openfasta(fast):
    # map each FASTA header line (">...") to its concatenated sequence
    with open(fast, "r") as fastafile:
        dic = {}
        for line in fastafile:
            l = line.strip()
            if l.startswith(">"):  # startswith() also tolerates empty lines
                cur = l
                dic[l] = ""
            else:
                dic[cur] = dic[cur] + l
    return dic

def match(text, pattern):
    # find occurrences of pattern in text, allowing at most one mismatch
    text = list(text.upper())
    pattern = list(pattern.upper())
    ans = []
    cur = 0
    mis = 0
    i = 0
    while True:
        if i == len(text):
            break
        if text[i] != pattern[cur]:
            mis += 1
            if mis > 1:
                mis = 0
                cur = 0
                continue
        cur = cur + 1
        i = i + 1
        if cur == len(pattern):
            ans.append(i - len(pattern))
            cur = 0
            mis = 0
            continue
    return ans

def job(pepts, outfile, genes):
    # match every short sequence in pepts against every long sequence in genes
    c = 0
    it = 0
    towrite = []
    for i in pepts:
        # if it % 1000 == 0:
        #     with lock:
        #         print float(it) / float(len(pepts))
        it = it + 1
        found = 0
        for j in genes:
            m = match(genes[j], i)
            if len(m) > 0:
                found = 1
                remb = m[0]
                wh = j
                c = c + len(m)
            if c > 1:
                found = 0
                c = 0
                break
        if found == 1:
            towrite.append("\t".join([i, str(remb), str(wh)]) + "\n")
    return towrite

def worker(outfile, genes):
    s = q_in.qsize()
    while True:
        item = q_in.get()
        print "\r{0:.2f}%".format(1 - float(q_in.qsize()) / float(s))
        if item is None:
            break  # kill thread
        pepts = item
        q_out.put(job(pepts, outfile, genes))
        q_in.task_done()

def main(args):
    num_worker_threads = int(args[4])
    pept = opencsv(args[1])
    l = len(pept)
    howman = num_worker_threads
    ll = ceil(float(l) / float(howman * 100))
    remain = pept
    pepties = []
    while len(remain) > 0:  # split the peptide list into chunks of size ll
        pepties.append(remain[0:ll])
        remain = remain[ll:]
    for i in pepties:
        print len(i)
    print l
    print "Csv file loaded..."
    genes = openfasta(args[2])
    out = args[3]
    print "Fasta file loaded..."
    threads = []
    with open(out, "w") as outfile:
        for pepts in pepties:
            q_in.put(pepts)
        for i in range(num_worker_threads):
            t = threading.Thread(target=worker, args=(outfile, genes, ))
            # t.daemon = True
            t.start()
            threads.append(t)
        q_in.join()  # wait until every queued chunk has been processed
        # stop workers
        for _ in range(num_worker_threads):
            q_in.put(None)
        for t in threads:
            t.join()
        # print(t)
    return 0

if __name__ == "__main__":
    sys.exit(main(sys.argv))
The important part of the code is the worker function, where short sequences from pepts are matched against the long sequences in genes.
Answer 0 (score: 0)
This is most likely because of the GIL (Global Interpreter Lock) in CPython.
In CPython, the global interpreter lock, or GIL, is a mutex that prevents multiple native threads from executing Python bytecode at the same time.
David Beazley's presentation at PyCon 2010 gives a detailed explanation of the GIL. From page 32 to page 34, he explains why the same multithreaded code (doing CPU-bound computation) performs worse when running on multiple cores than when running on a single core:
(single core) Threads alternate execution, but switch far less frequently than you might imagine
With multiple cores, runnable threads get scheduled simultaneously (on different cores) and battle over the GIL
David's this experiment result visualizes how "thread switching becomes more and more rapid as the number of CPUs increases"
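Beazley's point can be reproduced with a minimal sketch (Python 3 syntax here, unlike the Python 2 code above; absolute timings are machine-dependent): a pure CPU-bound loop gains nothing from a second thread under the GIL, and often gets slower.

```python
import time
from threading import Thread

def count(n):
    # pure CPU-bound work: no I/O, so the GIL is never released voluntarily
    while n > 0:
        n -= 1

N = 5_000_000

# run the same work twice sequentially
start = time.time()
count(N)
count(N)
sequential = time.time() - start

# run the same total work in two threads
start = time.time()
t1 = Thread(target=count, args=(N,))
t2 = Thread(target=count, args=(N,))
t1.start(); t2.start()
t1.join(); t2.join()
threaded = time.time() - start

print("sequential: %.2fs, two threads: %.2fs" % (sequential, threaded))
```

On a standard (GIL-enabled) CPython build, the threaded version is typically no faster than the sequential one, and on a multi-core machine it is frequently slower because of GIL contention.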
Even though your job function contains some I/O, judging from its 3-level nested loops (two in job and one in match) it is much closer to a CPU-bound computation.
Changing the code to multiprocessing will help you utilize multiple cores and can improve performance. However, how much you gain depends on the amount of computation: whether the benefit of computing in parallel clearly outweighs the overhead that multiprocessing introduces, such as inter-process communication.
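A hedged sketch of that multiprocessing change (Python 3; match_chunk and the tiny inline data are illustrative stand-ins, not the original job() and input files): each chunk of short strings is sent to a separate worker process, so every worker has its own interpreter and its own GIL.

```python
from multiprocessing import Pool, cpu_count

def match_chunk(args):
    # illustrative stand-in for the original job(): match each short
    # string in `chunk` against every long string in `genes`
    chunk, genes = args
    results = []
    for pept in chunk:
        for name, seq in genes.items():
            if pept.upper() in seq.upper():  # simplified exact match
                results.append((pept, name))
                break
    return results

if __name__ == "__main__":
    # toy stand-ins for the FASTA database and the peptide list
    genes = {">g1": "MKTAYIAKQR", ">g2": "GAVLIPFWM"}
    peptides = ["TAYI", "LIPF", "XXXX"]
    n = cpu_count()
    # round-robin split into one chunk per core
    chunks = [peptides[i::n] for i in range(n)]
    with Pool(processes=n) as pool:
        # each chunk is pickled and shipped to a worker process;
        # note genes is re-sent with every chunk (IPC overhead to weigh)
        hits = pool.map(match_chunk, [(c, genes) for c in chunks])
    flat = [hit for part in hits for hit in part]
    print(sorted(flat))  # -> [('LIPF', '>g2'), ('TAYI', '>g1')]
```

Note that genes travels to the workers with every chunk; for a large FASTA dictionary that serialization cost is exactly the kind of overhead mentioned above, so passing it once via a Pool initializer (or relying on fork's copy-on-write on Unix) is worth considering.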