Multithreading a web scraper in Python with the threading library

Time: 2020-05-08 06:48:44

Tags: python python-3.x multithreading web-scraping python-multithreading

I'm trying to speed up my web scraper by making it multithreaded with the threading library, but the runtime is no faster than with a single thread. I've set the scraper up so that you can specify the number of pages to scrape and/or the number of hops away from the original seeds to follow.
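
For reference, the script is run from the command line with a seeds file, the number of pages to scrape, the number of hops, and an output directory (the script, file, and directory names below are just placeholders):

python crawler.py seeds.txt 10 3 output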

import sys
import queue
import requests
import time
import os
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser
import matplotlib.pyplot as plt
import threading
import statistics
import concurrent.futures

runtimes = []


def crawler(frontier, req_pages, req_hops, output_directory):
    f = open("report.txt", "w")
    f.write("")
    f.close()

    #multithreaded#
    with concurrent.futures.ThreadPoolExecutor(max_workers = 10) as executor:
        executor.submit(crawler_thread, frontier, req_pages, req_hops, output_directory)

    #single threaded#
    # crawler_thread(frontier, req_pages, req_hops, output_directory)



def crawler_thread(frontier, req_pages, req_hops, output_directory):

    while not frontier.empty() and req_pages > 0  and req_hops > 0:

        start_time0 = time.perf_counter()

        try:

            url = frontier.get()
            #print("Trying to Crawl: ", url)
            if url == '*': # denotes end of current hop has been hit
                req_hops = req_hops - 1
                frontier.put('*') # mark end of next hop
            elif crawl_permitted(url):
                parsed_url = urlparse(url)
                filename = parsed_url.netloc[4:] if parsed_url.netloc.find('www') == 0 else parsed_url.netloc
                filename = filename + parsed_url.path.replace('/', '-')

                if not already_saved(filename, output_directory):
                    response = requests.get(url, verify = False)
                    time.sleep(3)  #sleep for a few seconds for politeness. To do : implement politeness windows based on the host's crawl_delay specified robots.txt
                    open(output_directory + '/' + filename, 'w', encoding='utf-8').write(response.text)     
                    req_pages = req_pages -1

                    soup = BeautifulSoup(response.text, 'html.parser')  
                    for link in soup.find_all('a'):

                        cleaned_link = clean_link(hyperlink = link.get('href'), website = url) 
                        if cleaned_link: 
                            frontier.put(cleaned_link)


        except:
            f = open("report.txt", "a")
            f.write("crawler failed on url: " + url) + "\n"
            f.close()

        end_time0 = time.perf_counter()
        runtimes.append(end_time0-start_time0)
        # refresh pages, robots.txt, politeness window - to do

#create url queue (frontier) parse argv to pass into crawler 
frontier = seeds_to_frontier(seeds_filename = sys.argv[1])
req_pages = int(sys.argv[2])
req_hops = int(sys.argv[3])
output_directory = sys.argv[4]

start_time1 = time.perf_counter()

#crawl
crawler(frontier, req_pages, req_hops, output_directory)

end_time1 = time.perf_counter()

f = open("report.txt", "a")
f.write("Total Scrape Time: " +  str(end_time1-start_time1) + "\n")
f.write("Average Scrape Time for Each Url: " + str(statistics.mean(runtimes)) + "\n")  
f.close()

plt.plot(runtimes)
plt.ylabel("execution time (s)")
plt.xlabel("url queue iteration")
plt.title("scrapetime for each url")
plt.savefig("execution_time_of_scraped_urls")

In case you want to look at them, these are the other helper functions used in the code above (though I don't think they are the cause of my problem):

def seeds_to_frontier(seeds_filename):
    seeds_file = open(seeds_filename, 'r') 

    frontier = queue.Queue()
    for page in seeds_file:
        frontier.put(page.strip('\n'))
    frontier.put('*') # marks end of hop 0

    seeds_file.close()
    return frontier

def crawl_permitted(url):
    rp = RobotFileParser()

    parsed_url = urlparse(url)
    filename = parsed_url.netloc.rsplit('.', 2)[-2] + '.' + parsed_url.netloc.rsplit('.', 2)[-1]
    robots_url = parsed_url.scheme + '://' + filename + '/robots.txt'

    if not already_saved(filename, 'robots'):
        response = requests.get(robots_url)
        open('robots/' + filename, 'w', encoding='utf-8').write(response.text)

    rp.set_url(robots_url)
    rp.read()   
    return rp.can_fetch('*', url)


def url_delay(url):
    rp = RobotFileParser()

    #set up url for robotfileparser
    parsed_url = urlparse(url)
    filename = parsed_url.netloc.rsplit('.', 2)[-2] + '.' + parsed_url.netloc.rsplit('.', 2)[-1]
    robots_url = parsed_url.scheme + '://' + filename + '/robots.txt'

    #parse the robots.txt and extract crawl_delay
    rp.set_url(robots_url)
    rp.read()
    crawl_delay = rp.crawl_delay("*") if rp.crawl_delay("*") else 0
    return crawl_delay


def already_saved(file, directory):
    for root, dirs, files in os.walk(directory):
        if file in files:
            return 1
    return 0

def clean_link(hyperlink, website): 
    parsed_website = urlparse(website)
    parsed_hyperlink = urlparse(hyperlink)
    cleaned_link = hyperlink

    if not parsed_hyperlink.scheme and not parsed_hyperlink.netloc and not parsed_hyperlink.path and not parsed_hyperlink.params \
        and not parsed_hyperlink.query and not parsed_hyperlink.fragment:
        return ''

    if parsed_hyperlink.scheme and parsed_hyperlink.scheme != 'http':
        return '' 

    if parsed_hyperlink.path == '/': # duplicate - self references
        return ''

    if parsed_hyperlink.fragment: # duplicate - bookmarks
        return ''

    # avoid pdfs, images,  # to do: The below solution is not the best to avoid all media
    if len(parsed_hyperlink.path) > 4 and (parsed_hyperlink.path[-4:] in [".pdf", ".jpg", ".png", ".svg", ".jpeg"]): 
        return ''

    # take care of invalid characters - to do

    if not (".edu" in parsed_hyperlink.netloc):# only crawl edu pages
        return ''

    if not parsed_hyperlink.netloc: 
        if parsed_hyperlink.path[0] == '/':
            cleaned_link = parsed_website.scheme + '://' + parsed_website.netloc + hyperlink
        else:
            # bug - hyperlink 'smi.ucr.edu' parsed as path and not netloc - ???
            if parsed_hyperlink.path[0:4] == 'tel:': # remove incorrectly formatted telephone links
                print('incorrect tel link: ' + hyperlink)
                return ''
            # double check
            cleaned_link = website + hyperlink

    return cleaned_link
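
As a side note on the bug flagged in the comment inside clean_link: urlparse puts a bare host with no scheme and no leading slash into .path rather than .netloc, which is why a link like 'smi.ucr.edu' gets treated as a relative path. A quick check of that behaviour (separate from the crawler code):

from urllib.parse import urlparse

# a scheme-less, slash-less string is parsed entirely as a path
print(urlparse('smi.ucr.edu'))
# ParseResult(scheme='', netloc='', path='smi.ucr.edu', params='', query='', fragment='')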

The single-threaded code works fine and scrapes a page in about 0.7 seconds on average. The multithreaded version scrapes at the same speed??? The way I originally set it up (shown above, first cell) seems to run one big loop in a single thread. I tried editing it (the code in the cell below) to work around this.

def crawler(frontier, req_pages, req_hops, output_directory):
    f = open("report.txt", "w")
    f.write("")
    f.close()

    #multithreaded#
    # with concurrent.futures.ThreadPoolExecutor(max_workers = 10) as executor:
    #   executor.submit(crawler_thread, frontier, req_pages, req_hops, output_directory)

    #single threaded#
    # crawler_thread(frontier, req_pages, req_hops, output_directory)


    #testing stuff

    while not frontier.empty() and req_pages > 0  and req_hops > 0:
        with concurrent.futures.ThreadPoolExecutor(max_workers = 10) as executor:
            executor.submit(crawler_thread, frontier, req_pages, req_hops, output_directory)


def crawler_thread(frontier, req_pages, req_hops, output_directory):

    lock = threading.Lock()

    lock.acquire()
    if req_pages > 0  and req_hops > 0:

        lock.release()

        start_time0 = time.perf_counter()

        try:

            url = frontier.get()
            print("Trying to Crawl: ", url)
            if url == '*': # denotes end of current hop has been hit
                lock.acquire()
                req_hops = req_hops - 1
                lock.release()
                frontier.put('*') # mark end of next hop
            elif crawl_permitted(url):
                parsed_url = urlparse(url)
                filename = parsed_url.netloc[4:] if parsed_url.netloc.find('www') == 0 else parsed_url.netloc
                filename = filename + parsed_url.path.replace('/', '-')

                if not already_saved(filename, output_directory):
                    response = requests.get(url, verify = False)
                    #time.sleep(10)  #sleep for a few seconds for politeness. To do : implement politeness windows based on the host's crawl_delay specified robots.txt
                    open(output_directory + '/' + filename, 'w', encoding='utf-8').write(response.text) 
                    lock.acquire()
                    req_pages = req_pages -1
                    lock.release()

                    soup = BeautifulSoup(response.text, 'html.parser')  
                    for link in soup.find_all('a'):

                        cleaned_link = clean_link(hyperlink = link.get('href'), website = url) 
                        if cleaned_link: 
                            frontier.put(cleaned_link)


        except:
            f = open("report.txt", "a")
            f.write("crawler failed on url: " + url + "\n")
            f.close()

        end_time0 = time.perf_counter()
        runtimes.append(end_time0-start_time0)
        # refresh pages, robots.txt, politeness window - to do

#create url queue (frontier) parse argv to pass into crawler 
frontier = seeds_to_frontier(seeds_filename = sys.argv[1])
req_pages = int(sys.argv[2])
req_hops = int(sys.argv[3])
output_directory = sys.argv[4]

start_time1 = time.perf_counter()

#crawl
crawler(frontier, req_pages, req_hops, output_directory)

end_time1 = time.perf_counter()

f = open("report.txt", "a")
f.write("Total Scrape Time: " +  str(end_time1-start_time1) + "\n")
f.write("Average Scrape Time for Each Url: " + str(statistics.mean(runtimes)) + "\n")  
f.close()

print("Total Scrape Time: " +  str(end_time1-start_time1) + "\n")

plt.plot(runtimes)
plt.ylabel("execution time (s)")
plt.xlabel("url queue iteration")
plt.title("scrapetime for each url")
plt.savefig("execution_time_of_scraped_urls")

1) I'm not sure whether this actually sped anything up. 2) When I tell it to stop after scraping 10 pages, the scraper keeps going until I force quit. The first version I had (cell 1) did not do this.
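
One thing I wondered about with version 2: req_pages and req_hops are plain ints that get rebound inside crawler_thread, so those decrements never reach the while loop in crawler that checks req_pages > 0. A minimal check of that behaviour (separate from my crawler code):

def worker(counter):
    # rebinding the parameter only changes the local name,
    # not the caller's variable
    counter = counter - 1

pages = 10
worker(pages)
print(pages)  # still 10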

What is wrong with my first and second implementations? Judging by the scraping speed, the first one doesn't seem to be multithreaded at all. The second one (I'm not sure about its speed) doesn't stop at the req_pages argument. Where did I go wrong in trying to multithread this? Is my memory getting corrupted?
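
For reference, this is roughly the direction I think a multi-worker version needs to go: a single executor.submit call only schedules one task, so the whole loop of version 1 runs inside one worker thread, and the pool seems to need several submitted tasks before anything runs concurrently. The following is only an untested sketch with placeholder URLs, not my working code, and it still leaves the shared req_pages/req_hops counters and per-host politeness unresolved:

import queue
import concurrent.futures

frontier = queue.Queue()
for u in ["https://www.example.edu/a", "https://www.example.edu/b"]:  # placeholder urls
    frontier.put(u)

def crawl_worker(frontier):
    # each submitted task repeatedly pulls urls off the shared queue;
    # in the real crawler the fetch/save/parse would go here
    while True:
        try:
            url = frontier.get_nowait()
        except queue.Empty:
            return
        print("would crawl:", url)

# submitting the worker once gives the pool exactly one task, so everything
# runs in a single thread; submitting it N times gives N concurrent workers
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    futures = [executor.submit(crawl_worker, frontier) for _ in range(10)]
    concurrent.futures.wait(futures)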

0 Answers:

No answers yet.