请看下面的代码。当genMotifs的参数设置为n_seq = 5000、n_pos = 10时,并行版本getPairedSeqNames3和getPairedSeqNames1比串行版本getPairedSeqNames2慢得多;但当n_seq = 50、n_pos = 2000时,并行版本的性能反而更好。不幸的是,我实际要处理的数据更接近n_seq = 5000、n_pos = 10的情况。谁能告诉我为什么会这样?在n_seq = 5000、n_pos = 10时,有没有办法让并行版本表现得更好?
以下是代码:
#! /usr/bin/env python
import pp, sys, random, time
def getMotif_SeqName(Motifs):
    """Map each motif uid to the set of sequence names it occurs in.

    Motifs: dict of uid -> dict of seqname -> positions.
    Returns a new dict of uid -> set of seqnames; the position values are
    not inspected.
    """
    # Iterating a dict yields its keys, so set(seqs) collects the seqnames.
    return dict((uid, set(seqs)) for uid, seqs in Motifs.items())
def getPairedList(uids):
    """Return all pairs (id1, id2) where id1 precedes id2 and id1 != id2.

    uids: any iterable of ids (a list, a dict key view, a generator, ...).
    Returns a list of 2-tuples in input order; pairs whose two members
    compare equal are left out.
    """
    # Materialise once so that dict views and iterators can be sliced.
    uids = list(uids)
    return [
        (id1, id2)
        for i, id1 in enumerate(uids)
        # Start after position i: an id is never paired with itself.
        for id2 in uids[i + 1:]
        if id1 != id2
    ]
def is_overlap(pos_pair):
    """Tell whether the two positions in pos_pair overlap.

    pos_pair: a 2-item sequence (posA, posB); each position is a sequence
    of coordinates of which only the min and max are used.
    Returns False when one position ends strictly before the other begins,
    True otherwise (touching end points count as overlapping).
    """
    posA, posB = pos_pair
    return not (max(posA) < min(posB) or min(posA) > max(posB))
def caclDist(pos_pair):
    """Return the signed gap between the two positions in pos_pair.

    pos_pair: a 2-item sequence (posA, posB); only the min and max of each
    position are used.
    Returns min(posB) - max(posA) when that value is the larger of the two
    candidate gaps, otherwise max(posB) - min(posA). For positions that do
    not overlap this is positive when posB lies after posA and negative
    when posB lies before posA.
    """
    posA, posB = pos_pair
    d1 = min(posB) - max(posA)
    d2 = min(posA) - max(posB)
    return d1 if d1 > d2 else -d2
def getDist(posA, posB, low, high):
    """Collect the distances between non-overlapping position pairs.

    posA, posB: iterables of positions; every combination (a, b) with a
    from posA and b from posB is examined.
    low, high: inclusive bounds on the absolute distance.
    Returns a dict mapping (a, b) -> signed distance (see caclDist) for
    each pair that does not overlap and whose absolute distance lies in
    [low, high].
    """
    CoDist = {}
    # One pass over the cross product; no intermediate lists are built.
    for a in posA:
        for b in posB:
            pair = (a, b)
            if is_overlap(pair):
                continue
            d = caclDist(pair)
            if low <= abs(d) <= high:
                CoDist[pair] = d
    return CoDist
def getDist2(uidA, uidB, seqname, posA, posB, low, high):
    """Compute pair distances and return them tagged with their origin.

    uidA, uidB, seqname: identifiers that are passed through unchanged so
    the caller can tell which job a result belongs to.
    posA, posB: iterables of positions; every combination is examined.
    low, high: inclusive bounds on the absolute distance.
    Returns (uidA, uidB, seqname, CoDist) where CoDist maps (a, b) ->
    signed distance for each non-overlapping pair whose absolute distance
    lies in [low, high].
    """
    # The distance logic is kept inline rather than delegated to getDist:
    # getPairedSeqNames3 ships only is_overlap and caclDist as dependent
    # functions when it submits this function to the job server.
    CoDist = {}
    for a in posA:
        for b in posB:
            pair = (a, b)
            if is_overlap(pair):
                continue
            d = caclDist(pair)
            if low <= abs(d) <= high:
                CoDist[pair] = d
    return (uidA, uidB, seqname, CoDist)
def ppCacl(job_server, inputs, equation, funs, packages, Progress=True):
    """Run equation once per entry of inputs through job_server.

    job_server: object with a submit(func, args, depfuncs, modules) method
        that returns a callable job handle (a pp.Server in this script).
    inputs: iterable of argument tuples, one per job.
    equation: the function to execute.
    funs: tuple of functions that equation calls.
    packages: tuple of module names that equation needs.
    Progress: unused; kept so existing callers keep working.
    Returns the list of results in the same order as inputs.
    """
    # Submit everything first, then collect, so no job waits on another.
    jobs = [job_server.submit(equation, pars, funs, packages) for pars in inputs]
    return [job() for job in jobs]
def ssCacl(inputs, equation):
    """Serial counterpart of ppCacl: call equation on each argument tuple.

    inputs: iterable of argument tuples.
    equation: callable invoked as equation(*pars) for every entry.
    Returns the list of results in input order.
    """
    return [equation(*pars) for pars in inputs]
def getPairedSeqNames1(Motifs):
    """Find co-occurring motif pairs, one batch of parallel jobs per pair.

    Motifs: dict of uid -> dict of seqname -> list of positions.
    For every pair of motifs, the sequences containing both are looked up
    and one getDist job per shared sequence is sent through ppCacl using
    the module-level job_server, with the distance window 10..250.
    Returns a dict of (uidA, uidB) -> dict of seqname -> dict of
    (posA, posB) -> signed distance; empty results are left out.

    Prints the number of pairs to stdout and a progress line to stderr.
    """
    SeqNames = getMotif_SeqName(Motifs)
    MotifPairs = set(getPairedList(list(Motifs)))
    num_MotifPairs = len(MotifPairs)
    print("%s pairs to go" % num_MotifPairs)
    # Number of pairs per percentage point; only used for the progress line.
    step = num_MotifPairs // 100 + 1
    PairedMotifs = {}
    for i, (uidA, uidB) in enumerate(MotifPairs):
        intersect = list(SeqNames[uidA] & SeqNames[uidB])
        if not intersect:
            continue
        PosA = Motifs[uidA]
        PosB = Motifs[uidB]
        sys.stderr.write("Progress:%d%%\t%s\t%s\r" % (i // step, uidA, uidB))
        positions = [(PosA[seqname], PosB[seqname], 10, 250) for seqname in intersect]
        # NOTE(review): this submits one job per shared sequence and waits
        # for the whole batch before moving to the next motif pair.
        results = ppCacl(job_server, positions, getDist, (is_overlap, caclDist), (), False)
        distances = dict((seqname, d) for seqname, d in zip(intersect, results) if d)
        if distances:
            PairedMotifs[(uidA, uidB)] = distances
    return PairedMotifs
def getPairedSeqNames2(Motifs):
    """Find co-occurring motif pairs serially (no job server involved).

    Motifs: dict of uid -> dict of seqname -> list of positions.
    For every pair of motifs, the sequences containing both are looked up
    and getDist is evaluated in-process via ssCacl for each shared
    sequence, with the distance window 10..250.
    Returns a dict of (uidA, uidB) -> dict of seqname -> dict of
    (posA, posB) -> signed distance; empty results are left out.

    Prints the number of pairs to stdout and a progress line to stderr.
    """
    SeqNames = getMotif_SeqName(Motifs)
    MotifPairs = set(getPairedList(list(Motifs)))
    num_MotifPairs = len(MotifPairs)
    print("%s pairs to go" % num_MotifPairs)
    # Number of pairs per percentage point; only used for the progress line.
    step = num_MotifPairs // 100 + 1
    PairedMotifs = {}
    for i, (uidA, uidB) in enumerate(MotifPairs):
        intersect = list(SeqNames[uidA] & SeqNames[uidB])
        if not intersect:
            continue
        PosA = Motifs[uidA]
        PosB = Motifs[uidB]
        sys.stderr.write("Progress:%d%%\t%s\t%s\r" % (i // step, uidA, uidB))
        positions = [(PosA[seqname], PosB[seqname], 10, 250) for seqname in intersect]
        results = ssCacl(positions, getDist)
        distances = dict((seqname, d) for seqname, d in zip(intersect, results) if d)
        if distances:
            PairedMotifs[(uidA, uidB)] = distances
    return PairedMotifs
def getPairedSeqNames3(Motifs):
    """Find co-occurring motif pairs with a single batch of parallel jobs.

    Motifs: dict of uid -> dict of seqname -> list of positions.
    All (motif pair, shared sequence) combinations are gathered first and
    then sent through ppCacl in one call as getDist2 jobs on the
    module-level job_server, with the distance window 10..250.
    Returns a dict of (uidA, uidB) -> dict of seqname -> dict of
    (posA, posB) -> signed distance; empty results are left out.

    Prints the number of pairs to stdout and a progress line to stderr.
    """
    SeqNames = getMotif_SeqName(Motifs)
    MotifPairs = set(getPairedList(list(Motifs)))
    num_MotifPairs = len(MotifPairs)
    print("%s pairs to go" % num_MotifPairs)
    # Number of pairs per percentage point; only used for the progress line.
    step = num_MotifPairs // 100 + 1
    positions = []
    for i, (uidA, uidB) in enumerate(MotifPairs):
        intersect = list(SeqNames[uidA] & SeqNames[uidB])
        if not intersect:
            continue
        PosA = Motifs[uidA]
        PosB = Motifs[uidB]
        sys.stderr.write("Progress:%d%%\t%s\t%s\r" % (i // step, uidA, uidB))
        positions.extend(
            (uidA, uidB, seqname, PosA[seqname], PosB[seqname], 10, 250)
            for seqname in intersect
        )
    results = ppCacl(job_server, positions, getDist2, (is_overlap, caclDist), (), False)
    PairedMotifs = {}
    for (uidA, uidB, seqname, CoDist) in results:
        if CoDist:
            # Each result carries its own identifiers, so regroup by pair.
            PairedMotifs.setdefault((uidA, uidB), {})[seqname] = CoDist
    return PairedMotifs
def genMotifs(n_seq=5000, n_pos=10, n_motifs=50):
    """Generate random motif data for benchmarking.

    n_seq: upper bound on the number of sequences per motif; the actual
        count is drawn uniformly from 0..n_seq.
    n_pos: upper bound on the number of positions per sequence; the actual
        count is drawn uniformly from 0..n_pos.
    n_motifs: number of motif uids to generate (default 50).
    Returns a dict of uid -> dict of seqname -> list of positions, where
    uids and seqnames are distinct integers sampled from 1..59999.
    """
    digits = range(1, 60000)
    Motifs = {}
    for uid in random.sample(digits, n_motifs):
        seqnames = random.sample(digits, random.randint(0, n_seq))
        Motifs[uid] = dict(
            (seqname, genPos(random.randint(0, n_pos))) for seqname in seqnames
        )
    return Motifs
def genPos(n):
    """Return a list of n random positions.

    Each position is a 2-tuple of integers drawn independently from
    0..3000 (inclusive); the two values are not ordered.
    """
    return [(random.randint(0, 3000), random.randint(0, 3000)) for _ in range(n)]
# Module-level job server; getPairedSeqNames1 and getPairedSeqNames3 read
# this global when they dispatch jobs.
job_server = pp.Server()

# Time the three implementations on both data shapes: many sequences with
# few positions each, then few sequences with many positions each.
for n_seq, n_pos in ((5000, 10), (50, 2000)):
    Motifs = genMotifs(n_seq, n_pos)
    for run in (getPairedSeqNames1, getPairedSeqNames2, getPairedSeqNames3):
        timestamp = time.time()
        run(Motifs)
        print(time.time() - timestamp)
我的电脑上的结果:
1225 pairs to go
57.377081871 16666 20431
1225 pairs to go
15.1005380154 16666 20431
1225 pairs to go
59.9019329548 16666 20431
1225 pairs to go
43.1178700924 11721 46015
1225 pairs to go
77.7199709415 11721 46015
1225 pairs to go
10.1687381268 11721 46015
getPairedSeqNames3的cProfile n_seq = 5000 n_pos = 10
getPairedSeqNames3的cProfile n_seq = 10 n_pos = 5000
getPairedSeqNames3的cProfile n_seq = 20 n_pos = 2500
答案 0 :(得分:0)
我修改了你的代码,改用更地道的Python写法:
#! /usr/bin/env python
import pp
import sys
import random
import time
from collections import defaultdict
# Module-level Parallel Python server; getPairedSeqNames1 and
# getPairedSeqNames3 read this global when they dispatch jobs via ppCacl.
job_server = pp.Server()
def getMotif_SeqName(Motifs):
    """Build a uid -> set-of-seqnames index from the motif table.

    Motifs: dict of uid -> dict of seqname -> positions.
    Returns a new dict whose values are sets of the inner dicts' keys.
    """
    # set(d) already iterates the keys; no .keys() call is needed.
    return {uid: set(d) for uid, d in Motifs.items()}
def getPairedList(uids):
    """List every pair (id1, id2) with id1 earlier than id2 and id1 != id2.

    uids: any iterable of ids; it is copied into a list first so dict key
    views and iterators are accepted as well as lists.
    Returns a list of 2-tuples in input order.
    """
    uids = list(uids)
    return [
        (id1, id2)
        for i, id1 in enumerate(uids)
        # Slice from i + 1 so an element is never paired with itself.
        for id2 in uids[i + 1:]
        if id1 != id2
    ]
def is_overlap(pos_pair):
    """Return True when the two positions in pos_pair overlap.

    pos_pair: a 2-item sequence (posA, posB); only the min and max of each
    position are used.
    Two positions are disjoint when one ends strictly before the other
    starts; anything else, including shared end points, is an overlap.
    """
    posA, posB = pos_pair
    return not (max(posA) < min(posB) or min(posA) > max(posB))
def caclDist(pos_pair):
    """Compute the signed gap between the two positions in pos_pair.

    pos_pair: a 2-item sequence (posA, posB); only the min and max of each
    position are used.
    Returns d1 = min(posB) - max(posA) when d1 is greater than
    d2 = min(posA) - max(posB), otherwise -d2. For disjoint positions the
    result is positive if posB follows posA and negative if it precedes it.
    """
    posA, posB = pos_pair
    d1 = min(posB) - max(posA)
    d2 = min(posA) - max(posB)
    return d1 if d1 > d2 else -d2
def getDist(posA, posB, low, high):
    """Return distances for non-overlapping position pairs within a window.

    posA, posB: iterables of positions; each (a, b) combination is tested.
    low, high: inclusive bounds applied to the absolute distance.
    Returns a dict of (a, b) -> signed distance as computed by caclDist,
    restricted to pairs for which is_overlap is False and
    low <= abs(distance) <= high.
    """
    result = {}
    # Filter and measure in a single pass instead of building the full
    # cross product, the filtered list and the distance list separately.
    for a in posA:
        for b in posB:
            pair = (a, b)
            if is_overlap(pair):
                continue
            d = caclDist(pair)
            if low <= abs(d) <= high:
                result[pair] = d
    return result
def getDist2(uidA, uidB, seqname, posA, posB, low, high):
    """Run getDist and tag the result with the identifiers it belongs to.

    uidA, uidB, seqname: passed through unchanged.
    posA, posB, low, high: forwarded to getDist.
    Returns the tuple (uidA, uidB, seqname, distances).

    NOTE(review): because this calls getDist, any caller that submits it
    to the pp job server has to list getDist among the dependent functions
    - confirm against the call sites.
    """
    return (uidA, uidB, seqname, getDist(posA, posB, low, high))
def ppCacl(job_server, inputs, equation, funs, packages, Progress=True):
    """Run equation once per entry of inputs through job_server.

    job_server: object with a submit(func, args, depfuncs, modules) method
        that returns a callable job handle (a pp.Server in this script).
    inputs: iterable of argument tuples, one per job.
    equation: the function to execute.
    funs: tuple of functions that equation calls.
    packages: tuple of module names that equation needs.
    Progress: unused; kept so existing callers keep working.
    Returns the list of results in the same order as inputs.
    """
    # This must be a list, not a generator expression: a lazy generator
    # would submit a job only when the loop below asks for it, so each job
    # would be waited on before the next one is even submitted.
    jobs = [job_server.submit(equation, pars, funs, packages) for pars in inputs]
    return [job() for job in jobs]
def ssCacl(inputs, equation):
    """Evaluate equation in-process for every argument tuple in inputs.

    inputs: iterable of argument tuples (any length accepted by equation).
    equation: callable invoked as equation(*pars).
    Returns the list of results in input order.
    """
    return [equation(*pars) for pars in inputs]
def getPairedSeqNames1(Motifs, SeqNames, MotifPairs):
    """Find co-occurring motif pairs, one batch of parallel jobs per pair.

    Motifs: dict of uid -> dict of seqname -> list of positions.
    SeqNames: dict of uid -> set of seqnames (see getMotif_SeqName).
    MotifPairs: sized iterable of (uidA, uidB) pairs to examine.
    For each pair, one getDist job per shared sequence is sent through
    ppCacl on the module-level job_server, with the window 10..250.
    Returns a dict of (uidA, uidB) -> dict of seqname -> dict of
    (posA, posB) -> signed distance; empty results are left out.

    Prints the number of pairs to stdout and a progress line to stderr.
    """
    num_MotifPairs = len(MotifPairs)
    print("%s pairs to go" % num_MotifPairs)
    # Number of pairs per percentage point; only used for the progress line.
    step = num_MotifPairs // 100 + 1
    PairedMotifs = {}
    for i, (uidA, uidB) in enumerate(MotifPairs):
        intersect = list(SeqNames[uidA] & SeqNames[uidB])
        if not intersect:
            continue
        PosA = Motifs[uidA]
        PosB = Motifs[uidB]
        sys.stderr.write("Progress:%d%%\t%s\t%s\r" % (i // step, uidA, uidB))
        positions = [(PosA[seqname], PosB[seqname], 10, 250) for seqname in intersect]
        results = ppCacl(job_server, positions, getDist, (is_overlap, caclDist), (), False)
        distances = {seqname: d for seqname, d in zip(intersect, results) if d}
        if distances:
            PairedMotifs[(uidA, uidB)] = distances
    return PairedMotifs
def getPairedSeqNames2(Motifs, SeqNames, MotifPairs):
    """Find co-occurring motif pairs serially (no job server involved).

    Motifs: dict of uid -> dict of seqname -> list of positions.
    SeqNames: dict of uid -> set of seqnames (see getMotif_SeqName).
    MotifPairs: sized iterable of (uidA, uidB) pairs to examine.
    For each pair, getDist is evaluated in-process via ssCacl for every
    shared sequence, with the window 10..250.
    Returns a dict of (uidA, uidB) -> dict of seqname -> dict of
    (posA, posB) -> signed distance; empty results are left out.

    Prints the number of pairs to stdout and a progress line to stderr.
    """
    num_MotifPairs = len(MotifPairs)
    print("%s pairs to go" % num_MotifPairs)
    # Number of pairs per percentage point; only used for the progress line.
    step = num_MotifPairs // 100 + 1
    PairedMotifs = {}
    for i, (uidA, uidB) in enumerate(MotifPairs):
        intersect = list(SeqNames[uidA] & SeqNames[uidB])
        if not intersect:
            continue
        PosA = Motifs[uidA]
        PosB = Motifs[uidB]
        sys.stderr.write("Progress:%d%%\t%s\t%s\r" % (i // step, uidA, uidB))
        positions = ((PosA[seqname], PosB[seqname], 10, 250) for seqname in intersect)
        results = ssCacl(positions, getDist)
        distances = {seqname: d for seqname, d in zip(intersect, results) if d}
        if distances:
            PairedMotifs[(uidA, uidB)] = distances
    return PairedMotifs
def getPairedSeqNames3(Motifs, SeqNames, MotifPairs):
    """Find co-occurring motif pairs with a single batch of parallel jobs.

    Motifs: dict of uid -> dict of seqname -> list of positions.
    SeqNames: dict of uid -> set of seqnames (see getMotif_SeqName).
    MotifPairs: sized iterable of (uidA, uidB) pairs to examine.
    All (pair, shared sequence) combinations are gathered first and then
    sent through ppCacl in one call as getDist2 jobs on the module-level
    job_server, with the window 10..250.
    Returns a plain dict of (uidA, uidB) -> dict of seqname -> dict of
    (posA, posB) -> signed distance; empty results are left out.

    Prints the number of pairs to stdout and a progress line to stderr.
    """
    num_MotifPairs = len(MotifPairs)
    print("%s pairs to go" % num_MotifPairs)
    # Number of pairs per percentage point; only used for the progress line.
    step = num_MotifPairs // 100 + 1
    positions = []
    for i, (uidA, uidB) in enumerate(MotifPairs):
        intersect = list(SeqNames[uidA] & SeqNames[uidB])
        if not intersect:
            continue
        PosA = Motifs[uidA]
        PosB = Motifs[uidB]
        sys.stderr.write("Progress:%d%%\t%s\t%s\r" % (i // step, uidA, uidB))
        positions.extend(
            (uidA, uidB, seqname, PosA[seqname], PosB[seqname], 10, 250)
            for seqname in intersect
        )
    # getDist2 delegates to getDist, so getDist has to be shipped to the
    # workers together with the helpers that getDist itself calls.
    results = ppCacl(
        job_server, positions, getDist2, (getDist, is_overlap, caclDist), (), False
    )
    PairedMotifs = {}
    for (uidA, uidB, seqname, CoDist) in results:
        if CoDist:
            # A plain dict is returned (as in the sibling functions), so a
            # lookup of a missing pair by the caller never inserts a key.
            PairedMotifs.setdefault((uidA, uidB), {})[seqname] = CoDist
    return PairedMotifs
def genMotifs(n_seq, n_pos, n_motifs=50):
    """Generate random motif data for benchmarking.

    n_seq: upper bound on the number of sequences per motif; the actual
        count is drawn uniformly from 0..n_seq.
    n_pos: upper bound on the number of positions per sequence; the actual
        count is drawn uniformly from 0..n_pos.
    n_motifs: number of motif uids to generate (default 50).
    Returns a dict of uid -> dict of seqname -> list of positions, where
    uids and seqnames are distinct integers sampled from 1..59999.
    """
    digits = range(1, 60000)
    uids = random.sample(digits, n_motifs)
    return {
        uid: {
            seqname: genPos(random.randint(0, n_pos))
            for seqname in random.sample(digits, random.randint(0, n_seq))
        }
        for uid in uids
    }
def genPos(n):
    """Return a list of n random positions.

    Each position is a 2-tuple of integers drawn independently from
    0..3000 (inclusive); the two values are not ordered.
    """
    return [(random.randint(0, 3000), random.randint(0, 3000)) for _ in range(n)]
def driver(Motifs):
    """Time the three getPairedSeqNames variants on the same motif data.

    Motifs: dict of uid -> dict of seqname -> list of positions.
    The seqname index and the set of motif pairs are built once and shared
    by all three runs; the elapsed wall-clock seconds of each run are
    printed to stdout. Returns None.
    """
    SeqNames = getMotif_SeqName(Motifs)
    MotifPairs = set(getPairedList(list(Motifs)))
    for fn in (getPairedSeqNames1, getPairedSeqNames2, getPairedSeqNames3):
        timestamp = time.time()
        fn(Motifs, SeqNames, MotifPairs)
        print(time.time() - timestamp)
if __name__ == '__main__':
    # Benchmark both data shapes: many sequences with few positions each,
    # then few sequences with many positions each.
    for x, y in ((5000, 10), (50, 2000)):
        print('=' * 30)
        driver(genMotifs(x, y))
我不能保证这样改会更快。如果您希望进一步优化代码,可以先用cProfile做性能分析,或者考虑使用numpy或cython。