Scraping the web efficiently with Python

Time: 2014-12-17 17:04:54

Tags: python performance web-scraping

I've recently been experimenting with scraping web content in Python. I managed to seed the crawler, and from those seeds it scrapes each site's title, its body text, and every link pointing to another page.

It now makes sure every link is unique before inserting it into the database. This makes it very slow, but I consider it a must-have feature: there is no point in storing hundreds of duplicates.
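
One way the duplicate check could be pushed into MongoDB itself is a unique index, so a duplicate insert simply fails instead of requiring a separate lookup first. This is only a minimal sketch (it assumes the same pymongo `links` collection as the code below, and the `insert_link` helper is purely illustrative), not what my crawler currently does:

from pymongo import MongoClient
from pymongo.errors import DuplicateKeyError

client = MongoClient("ds055980.mongolab.com", 55980)
db = client.crawler
db.authenticate("swenn", "password")

# One-time setup: the index makes MongoDB reject duplicate addresses on insert.
db.links.create_index("address", unique=True)

def insert_link(url):
    # Returns True for a new link, False for one that is already stored.
    try:
        db.links.insert({"address": url, "time": 1, "in_use": 0})
        return True
    except DuplicateKeyError:
        return False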

I'd like to know whether scraping the pages should really be such a slow process (~100 seconds), and if so, how I could make it faster. I'm very interested in the theory behind it.

I'm also including my code below, in case anyone is interested in digging into it.

import requests as req
from pymongo import MongoClient
from bs4 import BeautifulSoup
import re
from time import time
from urllib.parse import urlsplit

client = MongoClient("ds055980.mongolab.com", 55980)
db = client.crawler
db.authenticate("swenn", "password")

# Counters shared across functions.
duplicates, new_links = 0, 0
time_list = []

def get_data(soup):
    for script in soup(["script", "style"]):
        script.extract()

    if soup.title is None:
        return False
    else:
        title = soup.title.string

    content = soup.getText(separator=u' ')
    if len(content) < 15:
        return False

    content = content.replace("\n", "")
    title = title.replace("\n", "")
    rge = re.compile(r'\s+')
    content = rge.sub(" ", content)

    return content, title


def insert_data_into_db(soup, url):
    data = get_data(soup)
    if data is False:
        db.links.remove({"_id": url[0]})
        db.blacklist.insert({"address": url[1]})
        return False

    db.data.insert({"address": url[1], "title": data[1], "content": data[0], "time": round(time()), "in_use": 0})


def insert_urls_into_db(soup, current_url):
    global duplicates, new_links
    new_links = 0
    regex = re.compile(r'#.+')
    link_list = list(set(soup.find_all('a', href = re.compile('.+'))))

    for link in link_list:
        url = link.get('href')

        if "{" in url or "}" in url or "javascript:" in url or "mailto:" in url or url == "#" or url == "":
            continue

        if "://" in url:
            pass
        elif "//" == url[0::2]:
            url = "http:" + url
        else:
            parsed_current = urlsplit(current_url[1])
            if "/" in url[0]:
                url = parsed_current.scheme+"://"+parsed_current.netloc+url
            elif "?" in url[0]:
                url = parsed_current.scheme+"://"+parsed_current.netloc+parsed_current.path+url
            else:
                url_sub = current_url[1][::-1]
                url = url_sub[url_sub.index("/")::][::-1] + url

        if "#" in url:
            url = regex.sub("", url)

        if db.links.find({"address": url}).count() == 0:  # one round trip per link for the duplicate check
            db.links.insert({"address": url, "time": 1, "in_use": 0})
            new_links += 1
        else:
            duplicates += 1

    db.links.update({"_id": current_url[0]}, {"$set": {"in_use": 0, "time": round(time())}})

def save_state_and_exit(urls):
    print("Saving document state...")
    for url in urls:
        db.links.update({"_id": url[0]}, {"$set": {"in_use": 0, "time": 1}})
        db.data.remove({"address": url[1]})
    print("Exiting...")
    exit()

def main():
    while True:
        urls = []
        try:
            documents = db.links.find({"time": {"$lt": round(time()) - 2592000}, "in_use": 0}).limit(10)

            if documents.count() == 0:  # find() returns a cursor, so test whether it matched anything
                print("Query did not match any documents. Exiting...")
                break

            for document in documents:
                db.links.update({"_id": document["_id"]}, {"$set": {"in_use": 1}})
                urls.append((document["_id"], document["address"]))

            t = round(time())

            for url in urls:
                print("current URL:", url[1])
                try:
                    html = req.get(url[1], timeout=5)
                    if html.encoding != 'utf-8':
                        html.encoding = 'utf-8'
                    html = html.text
                except (req.exceptions.Timeout, req.exceptions.ConnectionError):
                    print("URL",url,"doesn\'t respond. Deleting...")
                    db.links.remove({"_id": url[0]})
                    if db.blacklist.find({"address": url[1]}).count() == 0:
                        db.blacklist.insert({"address": url[1]})
                    continue

                soup = BeautifulSoup(html, "html.parser")
                if insert_data_into_db(soup, url) == False:
                    continue

                insert_urls_into_db(soup, url)

            print("vottis aega:", round(time()) - t,"sekundit","\t","uusi linke:","\t","duplikaate:", duplicates,"\n\n")
        except (KeyboardInterrupt, SystemExit):
            save_state_and_exit(urls)

if __name__ == "__main__":
    main()
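
As a note on the link handling above: the manual scheme/netloc stitching in insert_urls_into_db covers roughly what urllib.parse.urljoin does out of the box for absolute, protocol-relative, and relative hrefs. A minimal sketch, with an illustrative helper name:

from urllib.parse import urljoin, urldefrag

def absolutize(current_address, href):
    # Resolve href against the page it was found on, then drop any #fragment.
    absolute = urljoin(current_address, href)
    return urldefrag(absolute)[0]

# Example: absolutize("http://example.com/a/b.html", "../c.html") -> "http://example.com/c.html"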

1 Answer:

Answer 0 (score: 0)

Are you using "DOWNLOAD_DELAY" in the crawler's settings? Lowering the download delay makes the spider faster, but make sure the site allows it; otherwise you may get blocked by the site fairly quickly.

    DOWNLOAD_DELAY = 0  # as low as possible
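
For reference, DOWNLOAD_DELAY is a Scrapy setting and lives in a project's settings.py; the question's code uses plain requests, so it would only apply after porting the crawler to Scrapy. A sketch of the relevant settings:

    # settings.py of a Scrapy project (sketch; CONCURRENT_REQUESTS shown at its Scrapy default)
    DOWNLOAD_DELAY = 0        # no pause between requests to the same site
    CONCURRENT_REQUESTS = 16  # maximum number of simultaneous requests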