大家好我是python的新手。我想要做的是将我的旧代码移动到多处理中,但是我面临一些错误,我希望任何人都可以帮助我。我的代码用于检查文本形式中给出的几千个链接以检查某些标记。一旦找到它就会输出给我。由于我有几千个链接要检查的原因,速度是一个问题,因此我需要转向多处理。
更新:我有HTTP 503错误的返回错误。我发送了太多的请求,还是我想念痛风?
多处理代码:
from mechanize import Browser
from bs4 import BeautifulSoup
import sys
import socket
from multiprocessing.dummy import Pool # This is a thread-based Pool
from multiprocessing import cpu_count
br = Browser()
br.set_handle_robots(False)
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
no_stock = []
def main(lines):
done = False
tries = 1
while tries and not done:
try:
r = br.open(lines, timeout=15)
r = r.read()
soup = BeautifulSoup(r,'html.parser')
done = True # exit the loop
except socket.timeout:
print('Failed socket retrying')
tries -= 1 # to exit when tries == 0
except Exception as e:
print '%s: %s' % (e.__class__.__name__, e)
print sys.exc_info()[0]
tries -= 1 # to exit when tries == 0
if not done:
print('Failed for {}\n'.format(lines))
table = soup.find_all('div', {'class' : "empty_result"})
results = soup.find_all('strong', style = 'color: red;')
if table or results:
no_stock.append(lines)
if __name__ == "__main__":
r = br.open('http://www.randomweb.com/') #avoid redirection
fileName = "url.txt"
pool = Pool(processes=2)
with open(fileName, "r+") as f:
lines = pool.map(main, f)
with open('no_stock.txt','w') as f :
f.write('No. of out of stock items : '+str(len(no_stock))+'\n'+'\n')
for i in no_stock:
f.write(i + '\n')
回溯:
Traceback (most recent call last):
File "test2.py", line 43, in <module>
lines = pool.map(main, f)
File "/usr/lib/python2.7/multiprocessing/pool.py", line 251, in map
return self.map_async(func, iterable, chunksize).get()
File "/usr/lib/python2.7/multiprocessing/pool.py", line 567, in get
raise self._value
UnboundLocalError: local variable 'soup' referenced before assignment
我的txt文件是这样的: -
http://www.randomweb.com/item.htm?uuid=44733096229
http://www.randomweb.com/item.htm?uuid=4473309622789
http://www.randomweb.com/item.htm?uuid=447330962291
....etc
答案 0 :(得分:-1)
from mechanize import Browser
from bs4 import BeautifulSoup
import sys
import socket
from multiprocessing.dummy import Pool # This is a thread-based Pool
from multiprocessing import cpu_count
br = Browser()
no_stock = []
def main(line):
done = False
tries = 3
while tries and not done:
try:
r = br.open(line, timeout=15)
r = r.read()
soup = BeautifulSoup(r,'html.parser')
done = True # exit the loop
except socket.timeout:
print('Failed socket retrying')
tries -= 1 # to exit when tries == 0
except:
print('Random fail retrying')
print sys.exc_info()[0]
tries -= 1 # to exit when tries == 0
if not done:
print('Failed for {}\n'.format(i))
table = soup.find_all('div', {'class' : "empty_result"})
results = soup.find_all('strong', style = 'color: red;')
if table or results:
no_stock.append(i)
if __name__ == "__main__":
fileName = "url.txt"
pool = Pool(cpu_count() * 2) # Creates a Pool with cpu_count * 2 threads.
with open(fileName, "rb") as f:
lines = pool.map(main, f)
with open('no_stock.txt','w') as f :
f.write('No. of out of stock items : '+str(len(no_stock))+'\n'+'\n')
for i in no_stock:
f.write(i + '\n')
pool.map有两个参数,第一个是函数(在你的代码中,是main),另一个是可迭代的,每个iterable项都是函数的一个参数(在你的代码中,每一行都是文件)