How do I parse a website in Python using the asyncio module?

Asked: 2018-08-03 11:55:14

Tags: parsing beautifulsoup python-asyncio

I want to parse a website, so I wrote two versions of the parser (one with asyncio and one without). But the asyncio version takes as long as, or even longer than, the version without it. Some strings in the code are in Russian, but that shouldn't matter. I only started using asyncio today. Please help.

This is the version with asyncio:

import asyncio
from bs4 import BeautifulSoup
from urllib.request import *
import pprint
import time

url = "https://besmart.kz"

def get_html(url):
    # download the page synchronously with urllib
    req = Request(url)
    html = urlopen(req).read()
    return html

async def get_stock_data(i):
    html = get_html(url + i['href'])
    soup = BeautifulSoup(html, 'html.parser')
    stock_data = {}

    try:
        stock_data["Old price"] = soup.find('span', class_='line-through red').find('span', class_='text-muted greyColor').text.strip().replace('\u2009','')
    except:
        stock_data["Old price"] = "Отсутствует"
    try:
        stock_data["Price"] = soup.find('div', id='calc-price', class_='price').text.strip().replace('\u2009','')
    except:
        stock_data["Price"] = "Ошибка"
    try:
        stock_data["Title"] = soup.find('div', class_='title').find('h1', itemprop='name').text.strip().replace('\u2009','')
    except:
        stock_data["Title"] = "Ошибка"
    pp = pprint.PrettyPrinter(indent=2)
    pp.pprint(stock_data)

if __name__ == "__main__":
    opener = build_opener()
    opener.addheaders = [('User-Agent', 'Mozilla/5.0')]
    install_opener(opener)

    stock_list = []

    for i in range(1,4):
        html = get_html(url + "/?page=" + str(i))
        soup = BeautifulSoup(html, 'html.parser')
        stock_list.extend(soup.find_all('a', class_='deal__discount-kz'))

    ioloop = asyncio.get_event_loop()
    try:
        start = time.time()
        coroutines = [ioloop.create_task(get_stock_data(i)) for i in stock_list]
        ioloop.run_until_complete(asyncio.wait(coroutines))
    finally:
        ioloop.close()
        print(f"Время выполнения: {time.time() - start}")

And this is the version without asyncio:

import asyncio
from bs4 import BeautifulSoup
from urllib.request import *
import pprint
import time

url = "https://besmart.kz"

def get_html(url):
    req = Request(url)
    html = urlopen(req).read()
    return html

if __name__ == "__main__":
    opener = build_opener()
    opener.addheaders = [('User-Agent', 'Mozilla/5.0')]
    install_opener(opener)

    stock_list = []

    for i in range(1,4):
        html = get_html(url + "/?page=" + str(i))
        soup = BeautifulSoup(html, 'html.parser')
        stock_list.extend(soup.find_all('a', class_='deal__discount-kz'))
    start = time.time()
    for i in stock_list:
        html = get_html(url + i['href'])
        soup = BeautifulSoup(html, 'html.parser')
        stock_data = {}

        try:
            stock_data["Old price"] = soup.find('span', class_='line-through red').find('span', class_='text-muted greyColor').text.strip()
        except:
            stock_data["Old price"] = "Отсутствует"
        try:
            stock_data["Price"] = soup.find('div', id='calc-price', class_='price').text.strip()
        except:
            stock_data["Price"] = "Ошибка"
        try:
            stock_data["Title"] = soup.find('div', class_='title').find('h1', itemprop='name').text.strip()
        except:
            stock_data["Title"] = "Ошибка"

        pp = pprint.PrettyPrinter(indent=2)
        pp.pprint(stock_data)

    print(f"Время выполнения: {time.time() - start}")

1 Answer:

Answer 0 (score: 2)

You can use the aiohttp module to simplify things. For example:
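Below is a minimal sketch of what that could look like (my reconstruction, not necessarily the answerer's exact code): the blocking urllib calls from the question are replaced with aiohttp's asynchronous client, and the coroutines are driven with asyncio.gather. The selectors, page range, and output strings are copied from the question; aiohttp has to be installed separately (pip install aiohttp).

import asyncio
import pprint
import time

import aiohttp
from bs4 import BeautifulSoup

URL = "https://besmart.kz"
HEADERS = {"User-Agent": "Mozilla/5.0"}


async def fetch(session, url):
    # non-blocking HTTP GET; the event loop can switch to other coroutines
    # while this request is waiting on the network
    async with session.get(url) as response:
        return await response.text()


async def get_stock_data(session, link):
    html = await fetch(session, URL + link["href"])
    soup = BeautifulSoup(html, "html.parser")
    stock_data = {}

    try:
        stock_data["Old price"] = soup.find("span", class_="line-through red") \
            .find("span", class_="text-muted greyColor").text.strip()
    except AttributeError:
        stock_data["Old price"] = "Отсутствует"
    try:
        stock_data["Price"] = soup.find("div", id="calc-price", class_="price").text.strip()
    except AttributeError:
        stock_data["Price"] = "Ошибка"
    try:
        stock_data["Title"] = soup.find("div", class_="title") \
            .find("h1", itemprop="name").text.strip()
    except AttributeError:
        stock_data["Title"] = "Ошибка"

    pprint.PrettyPrinter(indent=2).pprint(stock_data)


async def main():
    async with aiohttp.ClientSession(headers=HEADERS) as session:
        # fetch the three listing pages concurrently and collect the deal links
        pages = await asyncio.gather(
            *(fetch(session, URL + "/?page=" + str(i)) for i in range(1, 4))
        )
        stock_list = []
        for html in pages:
            soup = BeautifulSoup(html, "html.parser")
            stock_list.extend(soup.find_all("a", class_="deal__discount-kz"))

        # fetch and parse every deal page concurrently
        await asyncio.gather(*(get_stock_data(session, link) for link in stock_list))


if __name__ == "__main__":
    start = time.time()
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
    print(f"Время выполнения: {time.time() - start}")

The reason the original asyncio version is no faster is that urlopen() is a blocking call: while one coroutine waits on it, the event loop cannot switch to another, so the requests still run one after another. An asynchronous HTTP client such as aiohttp is what lets the downloads actually overlap.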