How can I make this Python script run faster?

Asked: 2019-03-20 19:01:24

Tags: python

I am trying to process a MediaWiki abstract dump by opening every URL in the dump file and parsing the page with BS4. I have roughly 600,000 URLs, and by my estimate this will take about 200 hours.

import re
import pickle
import urllib.request
import urllib.error

import bs4 as bs
import nltk
from pyarabic import araby

import vocab  # project-local module providing build_dictionary / save_dictionary

sentenceTokens = []


with open('arwiki-latest-abstract.txt', newline='', encoding='utf-8') as textFile:  # open text file
    for line in textFile:
        if '<url>' in line:
            line = re.sub('<[^>]+>', '', line)  # remove <> and anything within
            line = re.sub('\n', '', line)
            print(line)
            requestURL = urllib.request.Request(line, headers={'User-Agent': 'Mozilla/5.0'})  # read webpage
            try:
                scrapeURL = urllib.request.urlopen(requestURL)  # scrape webpage
            except urllib.error.HTTPError as err:
                continue
            article = scrapeURL.read()
            parsedArticle = bs.BeautifulSoup(article, 'lxml')  # parse webpage
            paragraphs = parsedArticle.find_all('p')  # split article into paragraphs
            textFromURL = ""
            for paragraph in paragraphs:
                textFromURL += paragraph.text  # concat paragraphs
            textFromURL = re.sub(r'[\[].*?[\]]', ' ', textFromURL)  # remove [] and anything within
            textFromURL = re.sub(r'\s+', ' ', textFromURL)  # fix spaces
            textFromURL = araby.strip_tashkeel(textFromURL)  # strip Arabic diacritics
            sentenceTokens += nltk.sent_tokenize(textFromURL)

worddict, wordcount = vocab.build_dictionary(sentenceTokens)
vocab.save_dictionary(worddict, wordcount, 'D:\\Unsupervisedsummarization')

with open('listtext.txt', 'wb') as strlist:  # pickle needs binary mode; no encoding argument
    pickle.dump(sentenceTokens, strlist)

If anyone could help parallelize this code, or offer some tips on how to make it run faster, I would appreciate it. Thanks!

1 answer:

Answer 0 (score: 0)

As mentioned in the comments, you need to parallelize the URL requests and the BS4 parsing, which are the slowest parts of the process, as in the example below:

Code

#!/usr/bin/python3
# -*- coding: utf-8 -*-

from aiohttp import ClientSession, client_exceptions
from asyncio import Semaphore, ensure_future, gather, run
from json import dumps, loads

limit = 10
http_ok = [200]


async def scrape(url_list):

    tasks = list()

    sem = Semaphore(limit)

    async with ClientSession() as session:
        for url in url_list:
            task = ensure_future(scrape_bounded(url, sem, session))
            tasks.append(task)

        result = await gather(*tasks)

    return result


async def scrape_bounded(url, sem, session):
    async with sem:
        return await scrape_one(url, session)


async def scrape_one(url, session):

    try:
        async with session.get(url) as response:
            content = await response.read()
    except client_exceptions.ClientConnectorError:
        print('Scraping %s failed due to the connection problem' % url)
        return False

    if response.status not in http_ok:
        print('Scraping %s failed due to the return code %s' % (url, response.status))
        return False

    content = loads(content.decode('UTF-8'))

    return content


if __name__ == '__main__':
    urls = ['http://demin.co:8080/echo1/', 'http://demin.co:8080/echo1/']
    res = run(scrape(urls))

    print(dumps(res, indent=4))

Output

[
    {
        "reply": "pong",
        "url": "GET /",
        "id": "echo1"
    },
    {
        "reply": "pong",
        "url": "GET /",
        "id": "echo1"
    }
]
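
The example above only downloads JSON from a demo endpoint; the `Semaphore(limit)` caps the number of requests in flight at 10 so the target site is not hammered. To adapt the same pattern to the original script, the CPU-bound BeautifulSoup work can be handed off to a process pool so it does not block the event loop. Below is a rough sketch under those assumptions; the `extract_sentences` helper and the example URLs are placeholders, not part of the original code:

#!/usr/bin/python3
# -*- coding: utf-8 -*-

# Sketch: async downloads with aiohttp, BS4 parsing in a process pool.
# extract_sentences() and the demo URLs are placeholders, not from the original post.

from asyncio import Semaphore, gather, get_running_loop, run
from concurrent.futures import ProcessPoolExecutor

import bs4 as bs
from aiohttp import ClientSession

LIMIT = 10  # maximum number of concurrent requests


def extract_sentences(html):
    """CPU-bound part: parse the page and return its paragraph text."""
    parsed = bs.BeautifulSoup(html, 'lxml')
    return ' '.join(p.text for p in parsed.find_all('p'))


async def fetch_and_parse(url, sem, session, pool):
    async with sem:
        try:
            async with session.get(url, headers={'User-Agent': 'Mozilla/5.0'}) as resp:
                if resp.status != 200:
                    return ''
                html = await resp.read()
        except Exception as err:
            print('Failed to fetch %s: %s' % (url, err))
            return ''
    loop = get_running_loop()
    # Run the BS4 parsing in another process so it does not block the event loop.
    return await loop.run_in_executor(pool, extract_sentences, html)


async def scrape_all(urls):
    sem = Semaphore(LIMIT)
    with ProcessPoolExecutor() as pool:
        async with ClientSession() as session:
            tasks = [fetch_and_parse(u, sem, session, pool) for u in urls]
            return await gather(*tasks)


if __name__ == '__main__':
    demo_urls = ['https://ar.wikipedia.org/wiki/Example'] * 2  # placeholder URLs
    texts = run(scrape_all(demo_urls))
    print(texts)

With 600,000 URLs you would still want to process them in batches and write intermediate results to disk, rather than holding every sentence in memory until the very end.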