如何让我的爬虫更快地解决维基百科游戏?

时间:2019-04-16 17:55:53

标签: python python-3.x beautifulsoup python-requests

我正在制作一个 Wikipedia 爬虫,但它的速度非常慢。我怎样才能让它更快?

我正在使用 requests 模块和 beautifulsoup4 来解析 HTML 页面。我已经尝试实现多线程,但是它仍然很慢。

import requests
from bs4 import BeautifulSoup as bs
from queue import Queue

# Scheme and host that every site-relative "/wiki/..." path is appended to.
baseURL = "https://en.wikipedia.org"

# Site-relative paths of the page the crawl starts from and the page it
# is trying to reach.
startURL = "/wiki/French_battleship_Courbet_(1911)"
endURL = "/wiki/Royal_Navy"

# Crawl state shared by the functions below: a FIFO of paths still to be
# fetched, and the paths that have already been fetched.
tovisit = Queue()
visited = []

def main():
    """Validate the configured start and end paths, then start the crawl.

    Prints an error message and quits when either ``startURL`` or
    ``endURL`` is rejected by ``checkValid``; otherwise hands ``startURL``
    to ``initCrawler``.
    """
    endpoints_ok = checkValid(startURL) and checkValid(endURL)
    if not endpoints_ok:
        print("Invalid URLs entered.")
        quit()

    initCrawler(startURL)

def initCrawler(startURL):
    """Crawl breadth-first from ``startURL`` until the frontier is empty.

    Seeds the shared ``tovisit`` queue with ``startURL``, then repeatedly
    takes the oldest queued path, crawls it with ``linkCrawl`` and queues
    every link that call returns. The loop ends when ``tovisit`` is empty;
    ``linkCrawl`` may also terminate the whole program when it finds the
    target page.

    Args:
        startURL: Site-relative path of the first page to crawl.
    """
    tovisit.put(startURL)

    while not tovisit.empty():
        url = tovisit.get()

        # Record the page *before* crawling it. Once it has been taken off
        # the queue it is in neither ``tovisit`` nor ``visited``, so a link
        # on the page that points back to the page itself would otherwise
        # pass linkCrawl's duplicate checks and be crawled a second time.
        visited.append(url)

        for link in linkCrawl(url):
            tovisit.put(link)

def linkCrawl(url):
    """Fetch one page and return the not-yet-known article links on it.

    Downloads ``baseURL + url``, reads the ``href`` of every anchor that has
    one, and keeps those that pass ``checkValid`` and are neither in
    ``visited`` nor already waiting in ``tovisit``. If a kept candidate
    equals ``endURL``, prints "yay" and terminates the program via
    ``exit()``.

    Args:
        url: Site-relative path of the page to crawl.

    Returns:
        A list of site-relative link paths in document order, each
        appearing at most once.
    """
    print("crawling "+ url + "\n")

    # NOTE(review): every call issues an independent request; sharing one
    # requests.Session across calls would likely reuse the connection and
    # cut latency. Not done here because it needs module-level state.
    r = requests.get(baseURL+url)
    soup = bs(r.content, "html.parser")

    # Build the "already known" set once per page. Copying the whole queue
    # into a list and scanning the visited list for every single anchor
    # made each page cost O(links * frontier size).
    seen = set(visited)
    seen.update(tovisit.queue)

    refinedlinks = []

    for anchor in soup.find_all('a', href=True):
        href = anchor["href"]
        # Ensure what we have is a string (this also rejects None and keeps
        # unhashable values away from the set lookup below).
        if not isinstance(href, str):
            continue
        if href in seen:
            continue
        if not checkValid(href):
            continue
        if href == endURL:
            print("yay")
            exit()
        # Remember the link so a second occurrence on this same page is
        # not returned, queued, and crawled twice.
        seen.add(href)
        refinedlinks.append(href)

    return refinedlinks

def checkValid(url):
    """Return True when ``url`` is a path the crawler should follow.

    A path is accepted when it starts with ``/wiki/``, does not start with
    one of the excluded namespace prefixes (Special, Wikipedia, Portal,
    File), and does not end with ``(disambiguation)``.

    Args:
        url: Site-relative link path as found in an ``href`` attribute.

    Returns:
        True if the path is accepted, False otherwise.
    """
    excluded_prefixes = (
        "/wiki/Special:",
        "/wiki/Wikipedia:",
        "/wiki/Portal:",
        "/wiki/File:",
    )
    is_wiki_path = url.startswith("/wiki/")
    return (
        is_wiki_path
        and not url.startswith(excluded_prefixes)
        and not url.endswith("(disambiguation)")
    )

# Start the crawl only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()

我希望这个爬虫运行得更快,但实际上它运行得很慢。据我查到的资料,仅靠多线程是不够的。

0 个答案:

没有答案
相关问题