我想同时对多个网站使用 requests 和 BeautifulSoup,但由于某些原因一直无法让它正常工作。
这是一个完整的例子:
import multiprocessing as mp
import requests
from bs4 import BeautifulSoup
from random import randint
# Define an output queue
class Spider(object):
    """Fetch titles of random Stack Overflow questions in worker processes."""

    def __init__(self):
        super(Spider, self).__init__()

    @staticmethod
    def rand_string(length, output):
        """Fetch one random question page and put its title on ``output``.

        Args:
            length: Not used by the body; ``run`` passes the worker index.
            output: Queue that receives the title, or ``"not found"``
                when the page has no question link.
        """
        random_post = randint(1000000, 9999999)
        response = requests.get(
            'https://stackoverflow.com/questions/' + str(random_post))
        soup = BeautifulSoup(response.content, 'lxml')
        try:
            # .string is the link text as a bs4 NavigableString.
            title = soup.find('a', {'class': 'question-hyperlink'}).string
        except AttributeError:
            # find() returned None: the page has no question link.
            title = "not found"
        output.put(title)

    def run(self):
        """Start ten workers, collect one result from each, and print them."""
        output = mp.Queue()
        processes = [
            mp.Process(target=Spider.rand_string, args=(x, output))
            for x in range(10)
        ]
        for p in processes:
            p.start()
        # Read the results before joining: a worker that has put data on a
        # queue may not exit until that data has been consumed.
        results = [output.get() for _ in processes]
        # Wait for the workers to exit.
        for p in processes:
            p.join()
        print(results)
# Run processes
if __name__ == '__main__':
    # The guard keeps child processes from re-running this when they
    # import the module.
    spider = Spider()
    spider.run()
答案 0 :(得分:1)
我添加了一些调试打印语句来跟踪您的代码执行过程,并得出了几点结论:Windows 上没有 fork() 确实非常麻烦。您的主要错误位于 rand_string() 中的下面这一行:
title=soup.find('a',{'class':'question-hyperlink'}).string
它返回的是 <class 'bs4.element.NavigableString'>,而不是 <class 'str'>。把它传给 mp.Queue.put() 时,队列会尝试对其进行 pickle 序列化,以便通过内部管道发送,但序列化因递归错误而失败,导致队列卡住。我不确定 bs4 元素能否通过 pickle 管道发送(也许可以把循环引用转换成弱引用 weakref?),但只发送简单的 Python 对象总是容易得多。我还把队列的创建移到了主上下文中(在 spider.run() 之外),不过这并非必须,只要它仅由主线程执行即可。下面是我最后一次迭代时使用的调试代码,您可以据此了解我的测试方法:
from multiprocessing import Process, Queue, current_process
import requests
from bs4 import BeautifulSoup
from random import randint
import sys
#sys.setrecursionlimit(1000)
class Spider(object):
    """Fetch titles of random Stack Overflow questions in worker processes."""

    @staticmethod
    def rand_string(length, output):
        """Fetch one random question page and report its title.

        Puts ``[title, process_name]`` on ``output`` and prints a trace
        line at each step.

        Args:
            length: Not used by the body; ``run`` passes the worker index.
            output: Queue shared with the parent process.
        """
        print("{} entry point".format(current_process().name))
        random_post = randint(1000000, 9999999)
        response = requests.get(
            'https://stackoverflow.com/questions/' + str(random_post))
        print("{} got request response".format(current_process().name))
        soup = BeautifulSoup(response.content, 'lxml')
        try:
            title = soup.find('a', {'class': 'question-hyperlink'}).string
        except AttributeError:
            # find() returned None: the page has no question link.
            title = "not found"
        print("{} got title: '{}' of type: {}".format(
            current_process().name, title, type(title)))
        # Items put on the queue are pickled, so send a plain str instead
        # of the bs4 NavigableString that .string returns.
        title = str(title)
        output.put([title, current_process().name])
        # This process will put nothing more on the queue.
        output.close()
        print("{} exit point".format(current_process().name))

    def run(self, outq):
        """Run five workers and return the results they put on ``outq``.

        Args:
            outq: Queue handed to every worker.

        Returns:
            The ``[title, process_name]`` items in arrival order, at most
            one per worker. A worker that died before reporting
            contributes nothing.
        """
        from queue import Empty

        processes = []
        for x in range(5):
            processes.append(Process(
                target=self.rand_string,
                name="process_{}".format(x),
                args=(x, outq),
            ))
            print("creating process_{}".format(x))
        for p in processes:
            p.start()
            print("{} started".format(p.name))

        # Collect the results before joining: a worker that has put data on
        # a queue may not exit until that data has been consumed.
        out = []
        while len(out) < len(processes):
            try:
                out.append(outq.get(timeout=1))
            except Empty:
                # Give up only when every worker has exited and nothing is
                # left in the pipe, so a crashed worker cannot hang us.
                workers_done = not any(p.is_alive() for p in processes)
                if workers_done and outq.empty():
                    break

        # Wait for the workers to exit.
        for p in processes:
            p.join()
            print("successuflly joined {}".format(p.name))
        print("joined all workers")
        # Report the results after the joins so the trace reads
        # start -> join -> results.
        for result in out:
            print("got {}".format(result))
        return out
# Run processes
if __name__ == '__main__':
    # The queue is created here, in the main process, and handed to run().
    outq = Queue()
    spider = Spider()
    out = spider.run(outq)
    print("done")
运行上述代码的输出如下:
creating process_0 creating process_1 creating process_2 creating process_3 creating process_4 process_0 started process_1 started process_2 started process_3 started process_4 started process_2 entry point process_2 got request response process_2 got title: 'not found' of type: <class 'str'> process_2 exit point process_0 entry point process_0 got request response process_0 got title: 'Starting Activity when video is finished playing' of type: <class 'bs4.element.NavigableString'> process_0 exit point successuflly joined process_0 process_3 entry point process_3 got request response process_3 got title: 'Just don't understand the point of these typedefs' of type: <class 'bs4.element.NavigableString'> process_3 exit point process_1 entry point process_1 got request response process_1 got title: 'Import button + File browse field in admin product grid in magento' of type: <class 'bs4.element.NavigableString'> process_1 exit point process_4 entry point process_4 got request response process_4 got title: 'How can I do a query with subselect' of type: <class 'bs4.element.NavigableString'> process_4 exit point successuflly joined process_1 successuflly joined process_2 successuflly joined process_3 successuflly joined process_4 joined all workers got ['not found', 'process_2'] got ['Starting Activity when video is finished playing', 'process_0'] got ["Just don't understand the point of these typedefs", 'process_3'] got ['Import button + File browse field in admin product grid in magento', 'process_1'] got ['How can I do a query with subselect', 'process_4'] done