I'm trying to implement a multithreaded crawler that starts from a seed URL, finds the links on that page, prints each one, and then follows each of those links to look for more links.
Here is my code:
import urllib.request, re, threading, csv
from queue import Queue
from bs4 import BeautifulSoup
from sys import exit

class a3_6:
    __url_q = Queue(100)
    __html_q = Queue()
    __data_q = Queue()
    __visited_urls = []

    def __init__(self, start_url, max_threads):
        self.__url_q.put(start_url)
        self.max_threads = max_threads

    def gethtml(self, url):
        try:
            req = urllib.request.Request(url)
            html = urllib.request.urlopen(req).read()
            self.__html_q.put(html)
        except urllib.error.URLError as e:
            print(e.reason)
        except:
            print("invalid: " + url)
        self.__visited_urls.append(url)

    def mine_thread(self):
        while True:
            if not self.__html_q.empty():
                soup = BeautifulSoup(self.__html_q.get(), "html.parser")
                for a in soup.find_all('a', href=True):
                    if a not in self.__visited_urls:
                        link = 'https://en.wikipedia.org' + a.get('href')
                        self.__url_q.put(link)
                        self.__data_q.put(link)
            else:
                break

    def store(self):
        while True:
            if not self.__data_q.empty():
                print(self.__data_q.get())

    def download_thread(self):
        while True:
            if not self.__url_q.empty():
                self.gethtml(self.__url_q.get())
            else:
                break

    def run(self):
        self.download_thread()
        self.mine_thread()
        self.store()

    def op(self):
        for x in range(self.max_threads):
            t = threading.Thread(target=self.run)
            t.daemon = True
            t.start()
        self.store()

if __name__ == '__main__':
    a = a3_6('https://en.wikipedia.org/wiki/Main_Page', 5)
    a.op()
EDIT: I edited the code and now I get the correct results, but the program never terminates.
Answer 0 (score: 0)
I arrived at a solution, with help from James Harrison. I don't know why he deleted his original answer, but here it is:
import urllib.request, threading
from queue import Queue
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from sys import exit
from a3_3 import store_to_db

class a3_5:
    __url_q = Queue(100)
    __html_q = Queue()
    __data_q = Queue()
    __visited_urls = []

    def gethtml(self, url):
        try:
            req = urllib.request.Request(url)
            html = urllib.request.urlopen(req).read()
            self.__html_q.put(html)
            pars = urlparse(url)
        except urllib.error.URLError as e:
            print(str(e.reason) + ': ' + url)
        except:
            print("invalid: " + url)

    def mine_thread(self):
        while True:
            if not self.__html_q.empty():
                soup = BeautifulSoup(self.__html_q.get(), "html.parser")
                for a in soup.find_all('a', href=True):
                    link = a.get('href')
                    """if not link.startswith('www'):
                        link = self.__prfx + link"""
                    if link not in self.__visited_urls:
                        self.__url_q.put(link)
                        self.__data_q.put(link)
            else:
                break

    def store(self):
        while True:
            if not self.__data_q.empty():
                cont = self.__data_q.get()
                print(cont)
            else:
                break

    def download_thread(self):
        while True:
            if not self.__url_q.empty():
                self.gethtml(self.__url_q.get())
                self.__url_q.task_done()

    def op(self, *urls):
        for x in range(25):
            d = threading.Thread(target=self.download_thread)
            d.setDaemon(True)
            d.start()
        for url in urls:
            self.__url_q.put(url)
        self.__url_q.join()
        self.mine_thread()
        self.store()

if __name__ == '__main__':
    urls = ['https://en.wikipedia.org/wiki/Bajirao']#,'https://en.wikipedia.org/wiki/Malharrao_Holkar','https://en.wikipedia.org/wiki/Ranoji_Scindia'
    a = a3_5()
    a.op(*urls)
Basically, I had to set up another queue and create worker threads to service it. Also, the mine_thread and store methods have to run only after download_thread has finished, because otherwise the values never get stored.
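For reference, here is a minimal, stripped-down sketch of the worker-queue pattern the fixed code relies on (the names worker and process are illustrative, not from the code above): daemon threads block on Queue.get(), call task_done() after handling each item, and the main thread waits on join() until every queued item has been processed.

import threading
from queue import Queue

# Minimal sketch of the worker-queue pattern, under assumed names
# (worker, process); not the original a3_5 implementation.
q = Queue()

def process(item):
    print(item)  # stand-in for the real per-item work (e.g. downloading a page)

def worker():
    while True:
        item = q.get()   # blocks until an item is available
        process(item)
        q.task_done()    # mark this item as handled

# Daemon workers die with the main thread, so no explicit shutdown is needed.
for _ in range(4):
    threading.Thread(target=worker, daemon=True).start()

for item in ['a', 'b', 'c']:
    q.put(item)

q.join()  # returns only once task_done() has been called for every queued item

In the fixed code, op() plays the role of the main thread here: it starts 25 daemon workers, enqueues the seed URLs, and the join() on __url_q guarantees all downloads are complete before mine_thread() and store() run.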