您好,我制作了一个网络爬虫,用于从 api.mercadolibre 和网页中获取一些数据,因为 API 无法提供真实的数据。异步部分我使用了 aiohttp 和 asyncio 模块,解析部分使用了 BeautifulSoup。脚本目前大约需要运行 6 个小时,但我需要它更快,因为我要抓取大约 32 万个网址,以后还会增加到 100 万个。如能给出建议,我将不胜感激。以下是代码,这是一个 Django 应用。
import random
import asyncio
from aiohttp import ClientSession
from products.models import Product, ProductVariation
from time import time
from bs4 import SoupStrainer, BeautifulSoup
from django.utils import timezone
from datetime import timedelta
import aiohttp
def _last_int(tokens):
    """Return the last token that parses as an integer, or 0 if none does.

    Every '.' is removed before parsing (so "1.234" becomes 1234) and every
    ',' is turned into '.', which makes comma-decimal tokens such as "1,5"
    fail ``int()`` and be skipped.
    """
    value = 0
    for token in tokens:
        try:
            value = int(token.replace('.', '').replace(',', '.'))
        except ValueError:
            continue
    return value


async def fetch(product, session):
    """Download one product page and queue today's ``ProductVariation`` for it.

    Reads the module-level ``content_product`` (parse filter), ``today_date``
    and ``date_before``, and appends to the module-level ``list_variations``.

    Args:
        product: object exposing ``url_product``; also used as the ``product``
            filter value and field of ``ProductVariation``.
        session: aiohttp client session used for the GET request.

    Returns:
        The raw response body (bytes), or ``None`` when the server
        disconnected before a response was read.
    """
    # Only the network round-trip can raise ServerDisconnectedError, so keep
    # the try body minimal and release the connection before parsing.
    try:
        # NOTE(review): timeout=None disables the request timeout, so a
        # stalled server can hold this task indefinitely - confirm intended.
        async with session.get(product.url_product, timeout=None) as response:
            text = await response.read()
    except aiohttp.client_exceptions.ServerDisconnectedError:
        print("Server disconnected:", product.url_product)
        return None

    # errors='replace' keeps one badly encoded page from aborting the run.
    page_content = BeautifulSoup(
        text.decode('utf-8', errors='replace'),
        "html.parser",
        parse_only=content_product,
    )
    sold_node = page_content.find('div', attrs={"class": "item-conditions"})
    quantity_node = page_content.find(
        'span', attrs={"class": "dropdown-quantity-available"}
    )
    # NOTE(review): the 'span.price-tag-fraction' price used to be parsed here
    # into a local that was never stored; re-add it once ProductVariation
    # has a field for it.

    sold_quantity = 0
    if sold_node is not None:
        sold_quantity = _last_int(sold_node.text.split())

    # Defaults to 0 even when the element exists but holds no parseable
    # number; previously that case left the variable unbound.
    quantity = 0
    if quantity_node is not None:
        cleaned = quantity_node.text.replace('(', '').replace(')', '')
        quantity = _last_int(cleaned.split())

    # NOTE(review): these ORM calls are synchronous and run on the event-loop
    # thread, blocking every other request while they execute; recent Django
    # versions may refuse them inside a running loop - verify.
    if ProductVariation.objects.filter(product=product, date_search=today_date).exists():
        print("Variation already exists")
        return text

    # NOTE(review): today_date comes from timezone.now(); this equality filter
    # presumably relies on date_search being a date, not a datetime - confirm.
    previous = ProductVariation.objects.filter(
        product=product, date_search=(today_date - date_before)
    ).first()
    sold_day_before = previous.sold_quantity if previous is not None else sold_quantity

    # A negative difference is clamped to zero.
    diference_day_before = max(sold_quantity - sold_day_before, 0)
    list_variations.append(
        ProductVariation(
            product=product,
            sold_quantity=sold_quantity,
            available_quantity=quantity,
            diference_day_before=diference_day_before,
        )
    )
    print("Variation append")
    return text
async def bound_fetch(sem, product, session):
    """Run ``fetch(product, session)`` while holding one slot of ``sem``.

    The slot is released whether ``fetch`` returns or raises. The result of
    ``fetch`` is discarded; this coroutine returns ``None``.
    """
    await sem.acquire()
    try:
        await fetch(product, session)
    finally:
        sem.release()
async def run(r):
    """Fetch the first ``r`` products of the module-level ``p`` concurrently.

    One task per product is scheduled through ``bound_fetch`` and all tasks
    share a single ``ClientSession``. Returns once every task has finished;
    the first exception raised by a task propagates to the caller.

    Args:
        r: number of products to process, taken from the start of ``p``.
    """
    # create instance of Semaphore
    # NOTE(review): aiohttp's default connector is believed to cap concurrent
    # connections well below 10000, so this bound is probably never the
    # limiting factor - confirm and tune together with the connector limit.
    sem = asyncio.Semaphore(10000)

    # Create client session that will ensure we dont open new connection
    # per each request.
    async with ClientSession() as session:
        # Slice once instead of indexing p[i] in a loop: on an unevaluated
        # QuerySet every integer index issues its own LIMIT/OFFSET query.
        # max() keeps a negative r meaning "no work", as range(r) did.
        tasks = [
            # pass Semaphore and session to every GET request
            asyncio.ensure_future(bound_fetch(sem, product, session))
            for product in p[:max(r, 0)]
        ]
        await asyncio.gather(*tasks)
# --- Shared state read by fetch() -------------------------------------------
# ProductVariation instances built by fetch() are collected here.
list_variations = []
# Restricts BeautifulSoup parsing to the element with id 'short-desc'.
content_product = SoupStrainer(id='short-desc')
# Reference timestamp for this run and the one-day offset used to look up
# the previous day's variation.
date_before = timedelta(days=1)
today_date = timezone.now()
number = 10000

# --- Products to scrape -----------------------------------------------------
p = Product.objects.filter(sub_category__category__channel__name='Mercadolibre')
count = p.count()

# --- Drive the scrape to completion on the event loop ------------------------
loop = asyncio.get_event_loop()
future = asyncio.ensure_future(run(count))
loop.run_until_complete(future)