This is a web scraper for finding broken links. It uses a queue to hold the links it discovers and a set so that it does not revisit links it has already seen. It works fine single-threaded, but not when I try a thread pool. Can you help me figure out what is going wrong?
It is supposed to add a new tuple (link, link_parent) to the queue unless that link is already in the set, and it adds every link it processes to the set.
import requests
from lxml import html
from bs4 import BeautifulSoup
import queue
import concurrent.futures
import time
def iter_q(q):
    # Yield queued items until the queue is empty at the moment of the check.
    while not q.empty():
        yield q.get()
def do_stuff(curr_website_tuple, already_checked, q):
    curr_website, curr_website_father = curr_website_tuple
    # Mark this URL as visited before fetching it.
    already_checked.add(curr_website)
    try:
        r = requests.get(curr_website, timeout=10)
        if r.status_code == 200:
            soup = BeautifulSoup(r.content, "html.parser")
            # Queue every in-scope link we have not seen yet, paired with its parent page.
            for link in soup.find_all('a', href=True):
                if (link['href'].startswith("http") and
                        "yahoo." in link['href'] and
                        ".blogs.yahoo." not in link['href'] and
                        "doubleclick." not in link['href'] and
                        "adw.yahoo.com" not in link['href'] and
                        "google." not in link['href'] and
                        link['href'] not in already_checked):
                    q.put((link['href'], curr_website))
            return (curr_website + ' ' + curr_website_father + ' ' + str(r.status_code) +
                    ' ' + "|Number checked:" + str(len(already_checked)) +
                    ' ' + "|Queue size:" + str(q.qsize()))
        else:
            return "Request_Error: " + ',' + curr_website + ',' + curr_website_father + ',' + str(r.status_code) + '\n'
    except Exception as e:
        return "Error: " + ',' + curr_website + ',' + curr_website_father + ',' + str(e) + '\n'
def with_threads():
    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
        q = queue.LifoQueue()
        already_checked = set()
        q.put(("http://www.yahoo.com", ''))
        q.put(("http://news.yahoo.com", ''))
        # Submit one job per tuple that iter_q can pull off the queue.
        futures_dict = {executor.submit(do_stuff, qe, already_checked, q): qe
                        for qe in iter_q(q)}
        for future in concurrent.futures.as_completed(futures_dict):
            print(future.result())

with_threads()
Answer 0 (score: 1)

I think the problem may be that you declared already_checked inside the with executor block. Try declaring it outside and see how it goes.
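A minimal sketch of that suggestion, with the queue and already_checked created before the with executor block is entered; everything else from the question is assumed unchanged:

def with_threads():
    # Shared state created outside the executor block, per the suggestion above.
    q = queue.LifoQueue()
    already_checked = set()
    q.put(("http://www.yahoo.com", ''))
    q.put(("http://news.yahoo.com", ''))
    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
        futures_dict = {executor.submit(do_stuff, qe, already_checked, q): qe
                        for qe in iter_q(q)}
        for future in concurrent.futures.as_completed(futures_dict):
            print(future.result())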