我已经在使用Python的multiprocessing
功能上研究了一个小时以上,尝试使用multiprocessing.Process
和multiprocessing.Manager
来并行化一个相当复杂的图遍历函数:
import networkx as nx
import csv
import time
from operator import itemgetter
import os
import multiprocessing as mp
cutoff = 1
exclusionlist = ["cpd:C00024"]
DG = nx.read_gml("KeggComplete.gml", relabel=True)
for exclusion in exclusionlist:
DG.remove_node(exclusion)
# checks if 'memorizedPaths exists, and if not, creates it
fn = os.path.join(os.path.dirname(__file__),
'memorizedPaths' + str(cutoff+1))
if not os.path.exists(fn):
os.makedirs(fn)
manager = mp.Manager()
memorizedPaths = manager.dict()
filepaths = manager.dict()
degreelist = sorted(DG.degree_iter(),
key=itemgetter(1),
reverse=True)
def _all_simple_paths_graph(item, DG, cutoff, memorizedPaths, filepaths):
source = item[0]
uniqueTreePaths = []
if cutoff < 1:
return
visited = [source]
stack = [iter(DG[source])]
while stack:
children = stack[-1]
child = next(children, None)
if child is None:
stack.pop()
visited.pop()
elif child in memorizedPaths:
for path in memorizedPaths[child]:
newPath = (tuple(visited) + tuple(path))
if (len(newPath) <= cutoff) and
(len(set(visited) & set(path)) == 0):
uniqueTreePaths.append(newPath)
continue
elif len(visited) < cutoff:
if child not in visited:
visited.append(child)
stack.append(iter(DG[child]))
if visited not in uniqueTreePaths:
uniqueTreePaths.append(tuple(visited))
else: # len(visited) == cutoff:
if (visited not in uniqueTreePaths) and
(child not in visited):
uniqueTreePaths.append(tuple(visited + [child]))
stack.pop()
visited.pop()
# writes the absolute path of the node path file into the hash table
filepaths[source] = str(fn) + "/" + str(source) + "path.txt"
with open (filepaths[source], "wb") as csvfile2:
writer = csv.writer(csvfile2, delimiter=" ", quotechar="|")
for path in uniqueTreePaths:
writer.writerow(path)
memorizedPaths[source] = uniqueTreePaths
############################################################################
if __name__ == '__main__':
start = time.clock()
for item in degreelist:
test = mp.Process(target=_all_simple_paths_graph,
args=(DG, cutoff, item, memorizedPaths, filepaths))
test.start()
test.join()
end = time.clock()
print (end-start)
目前,尽管是通过运气和魔法,它(某种程度上)起作用了。我的问题是,我只使用了24个核心中的12个。
有人能解释一下这可能是什么原因吗?也许我的代码不是最佳的多进程解决方案,或者这是我的架构Intel Xeon CPU E5-2640 @ 2.50GHz x18 running on Ubuntu 13.04 x64的一个特性?
编辑:
我设法获得:
p = mp.Pool()
for item in degreelist:
p.apply_async(_all_simple_paths_graph,
args=(DG, cutoff, item, memorizedPaths, filepaths))
p.close()
p.join()
工作起来非常缓慢!因此我认为我正在使用错误的函数。希望这可以澄清我要完成的任务!
编辑2:.map
尝试:
partialfunc = partial(_all_simple_paths_graph,
DG=DG,
cutoff=cutoff,
memorizedPaths=memorizedPaths,
filepaths=filepaths)
p = mp.Pool()
for item in processList:
processVar = p.map(partialfunc, xrange(len(processList)))
p.close()
p.join()
工作正常,比单核慢。需要优化时间!
test.join()
)后循环才开始另一个进程,这看起来很奇怪。尝试在加入任何一个进程之前至少启动24个进程。 - Tim PetersR
中的mclapply
? - Darkstarone