以下代码可以从 OLX 抓取单个页面的数据;我需要帮助,才能将其扩展为从多个页面提取数据。
import requests
import re
from django.conf.urls import url, include
# Scrape the car listings shown on one OLX search-results page and save them
# to cars_olx.txt, one listing tuple per line.

url_base = "https://www.olx.in"
# NOTE: this rebinds the module-level name `url` (also imported from
# django.conf.urls at the top of the file); the name is kept so that anything
# relying on it as the page URL keeps working.
url = url_base + "/hyderabad_g4058526/cars_c5"

# Values of the data-aut-id attribute that mark each field of a listing.
info_labels = ("itemDetails", "itemPrice", "itemTitle", "item-location")

# NOTE(review): the HTML-tag portions of both patterns were missing from the
# pasted source (leaving `(?s)(.?)` with no `{}` placeholder and `(?s)?...`,
# which does not even compile). They are reconstructed here on the assumption
# that every field is a <span data-aut-id="..."> and every listing is an
# <li data-aut-id="itemBox"> -- confirm against the live page markup.
info_pattern = r'(?s)<span[^>]*?data-aut-id="{}"[^>]*?>(.*?)</span>'
link_pattern = r'(?s)<li[^>]*?data-aut-id="itemBox".*?href="([^"]+?)"'


def parse_cars(html, base=url_base):
    """Extract listing records from the HTML of a search-results page.

    Args:
        html: Page source as text.
        base: Prefix prepended to every relative listing link.

    Returns:
        A list of tuples ``(details, price, title, location, link)``; the
        first four follow the order of ``info_labels``. Fields are paired
        positionally with ``zip``, so the result is as long as the shortest
        column and is empty when nothing matches.
    """
    columns = [
        re.findall(info_pattern.format(re.escape(label)), html)
        for label in info_labels
    ]
    links = [base + path for path in re.findall(link_pattern, html)]
    return list(zip(*columns, links))


def main():
    """Download the page at ``url``, print the listings and write them to disk.

    Raises:
        requests.RequestException: on network failure, timeout or a non-2xx
            HTTP status.
    """
    # A timeout keeps the script from hanging forever on a stalled server.
    response = requests.get(url, timeout=30)
    # Fail loudly instead of parsing an error page into an empty result.
    response.raise_for_status()

    cars = parse_cars(response.text)
    print(cars)
    with open('cars_olx.txt', 'w', encoding='utf-8') as f:
        f.writelines("%s\n" % str(item) for item in cars)


if __name__ == "__main__":
    main()