I've recently been experimenting with web content scraping in Python. I managed to give the scraper a seed, from which it starts crawling each website's title, its body text, and every link pointing to another page.
Right now it makes sure that every link it inserts into the database is unique. This makes it very slow, but it's a must-have feature, I think - there is no point in keeping hundreds of duplicates.
I'm wondering whether crawling pages in ~100 seconds is really supposed to be such a slow process, and if it is, how I could possibly make it faster. I'm very interested in the theory behind it.
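To make the cost concrete: every new link currently triggers a find() followed by an insert(), i.e. two round trips to MongoDB per link. One alternative I have not tried yet (sketch only, using the same db handle as in the code below and a hypothetical insert_link helper) would be a unique index on the address field, so the database itself rejects duplicates and only the insert remains:

from pymongo.errors import DuplicateKeyError

# Create once; repeated calls are a no-op.
db.links.create_index("address", unique=True)

def insert_link(url):
    try:
        db.links.insert_one({"address": url, "time": 1, "in_use": 0})
        return True   # previously unseen link
    except DuplicateKeyError:
        return False  # already in the collection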
I'm also including my code, in case anyone is interested in digging into it.
import requests as req
from pymongo import MongoClient
from bs4 import BeautifulSoup
import re
from time import time
from urllib.parse import urlsplit
client = MongoClient("ds055980.mongolab.com", 55980)
db = client.crawler
db.authenticate("swenn", "password")
global duplicates, new_links
duplicates, new_links = 0, 0
time_list = []
def get_data(soup):
    # Strip <script> and <style> elements so only visible text is kept.
    for script in soup(["script", "style"]):
        script.extract()

    if soup.title == None:
        return False
    else:
        title = soup.title.string

    content = soup.getText(separator=u' ')
    if len(content) < 15:
        return False

    # Collapse newlines and runs of whitespace into single spaces.
    content = content.replace("\n", "")
    title = title.replace("\n", "")
    rge = re.compile(r'\s+')
    content = rge.sub(" ", content)

    return content, title
def insert_data_into_db(soup, url):
    data = get_data(soup)
    if data == False:
        # Page has no title or almost no text: drop the link and blacklist it.
        db.links.remove({"_id": url[0]})
        db.blacklist.insert({"address": url[1]})
        return False

    db.data.insert({"address": url[1], "title": data[1], "content": data[0], "time": round(time()), "in_use": 0})
def insert_urls_into_db(soup, current_url):
    global duplicates, new_links
    new_links = 0
    regex = re.compile(r'#.+')
    link_list = list(set(soup.find_all('a', href=re.compile('.+'))))

    for link in link_list:
        url = link.get('href')
        if "{" in url or "}" in url or "javascript:" in url or "mailto:" in url or url == "#" or url == "":
            continue

        if "://" in url:
            pass
        elif url[0:2] == "//":
            # Protocol-relative URL.
            url = "http:" + url
        else:
            parsed_current = urlsplit(current_url[1])
            if url[0] == "/":
                url = parsed_current.scheme + "://" + parsed_current.netloc + url
            elif url[0] == "?":
                url = parsed_current.scheme + "://" + parsed_current.netloc + parsed_current.path + url
            else:
                # Relative path: keep everything up to the last "/" of the current URL.
                url_sub = current_url[1][::-1]
                url = url_sub[url_sub.index("/")::][::-1] + url

        if "#" in url:
            url = regex.sub("", url)

        # Insert only if the address is not already in the collection.
        if db.links.find({"address": url}).count() == 0:
            db.links.insert({"address": url, "time": 1, "in_use": 0})
            new_links += 1
        else:
            duplicates += 1

    db.links.update({"_id": current_url[0]}, {"$set": {"in_use": 0, "time": round(time())}})
def save_state_and_exit(urls):
    print("Saving document state...")
    for url in urls:
        db.links.update({"_id": url[0]}, {"$set": {"in_use": 0, "time": 1}})
        db.data.remove({"address": url[1]})

    print("Exiting...")
    exit()
def main():
    while True:
        urls = []
        try:
            # Claim a batch of up to 10 links that haven't been visited in 30 days.
            documents = db.links.find({"time": {"$lt": round(time()) - 2592000}, "in_use": 0}).limit(10)
            if documents == None:
                print("Query did not match any documents. Exiting...")
                break

            for document in documents:
                db.links.update({"_id": document["_id"]}, {"$set": {"in_use": 1}})
                urls.append((document["_id"], document["address"]))

            t = round(time())
            for url in urls:
                print("current URL:", url[1])
                try:
                    html = req.get(url[1], timeout=5)
                    if html.encoding != 'utf-8':
                        html.encoding = 'utf-8'
                    html = html.text
                except (req.exceptions.Timeout, req.exceptions.ConnectionError):
                    print("URL", url, "doesn't respond. Deleting...")
                    db.links.remove({"_id": url[0]})
                    if db.blacklist.find({"address": url[1]}).count() == 0:
                        db.blacklist.insert({"address": url[1]})
                    continue

                soup = BeautifulSoup(html)
                if insert_data_into_db(soup, url) == False:
                    continue
                insert_urls_into_db(soup, url)

                print("vottis aega:", round(time()) - t, "sekundit", "\t", "uusi linke:", new_links, "\t", "duplikaate:", duplicates, "\n\n")
        except (KeyboardInterrupt, SystemExit):
            save_state_and_exit(urls)


if __name__ == "__main__":
    main()
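One more thing I have been wondering about (again just a sketch, not part of the code above): the fetch loop handles the batch strictly one URL at a time, so most of each iteration is spent waiting on the network. A thread pool from the standard library could overlap those waits; process_url here is a hypothetical wrapper around the same fetch/parse steps, with the database writes kept in the main thread:

from concurrent.futures import ThreadPoolExecutor

def process_url(url):
    try:
        response = req.get(url[1], timeout=5)
        response.encoding = 'utf-8'
        return url, BeautifulSoup(response.text)
    except (req.exceptions.Timeout, req.exceptions.ConnectionError):
        return url, None

with ThreadPoolExecutor(max_workers=10) as pool:
    for url, soup in pool.map(process_url, urls):
        if soup is None:
            continue
        if insert_data_into_db(soup, url) == False:
            continue
        insert_urls_into_db(soup, url)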
Answer 0 (score: 0)
Are you using "DOWNLOAD_DELAY" in the crawler's settings? Lowering the download delay makes the spider faster, but make sure the website allows it, otherwise your spider may get blocked by the site fairly quickly.
DOWNLOAD_DELAY = 0  # as low as possible
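(The crawler in the question uses requests directly, so this setting only applies once it is ported to a Scrapy spider.) Alongside the delay, the concurrency settings in settings.py matter just as much; a sketch using Scrapy's option names, with values chosen only for illustration:

# settings.py (in addition to DOWNLOAD_DELAY above)
CONCURRENT_REQUESTS = 32             # Scrapy's default is 16
CONCURRENT_REQUESTS_PER_DOMAIN = 8   # per-domain cap
ROBOTSTXT_OBEY = True                # stay polite to avoid getting blocked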