我正在为手头的项目抓取这个页面(page),想获取所有分页上每辆车的详细信息(价格、里程、变速箱类型和车龄)。下面是我目前的代码,运行时遇到了问题:
希望有人能帮我排查并给出建议,谢谢。
from bs4 import BeautifulSoup
import requests
import urllib.parse
import csv
# The Toyota Camry search-results page is the crawl entry point.
url = "https://www.olx.com.ng/vehicles/cars/toyota/?search%5Bfilter_enum_model%5D%5B0%5D=toyota%2Fcamry"
r = requests.get(url)
data = r.text
soup = BeautifulSoup(data, "html.parser")
carLinks = set()
pageLinks = set()
data_set = []
# The page count is read from the `page` query parameter of the first anchor.
# NOTE(review): the first <a> on the page may not be the "last page" link --
# confirm the selector against the live markup.
parsed = urllib.parse.urlparse(soup.select('a')[0].get('href'))
# parse_qs maps each key to a list of values; take the last one so a single
# occurrence of `page` works (index 1 only exists when the key is repeated).
nbPage = int(urllib.parse.parse_qs(parsed.query)['page'][-1])
print("There are " + str(nbPage) + " web pages to process")
# Keep the unpaginated URL so each page URL is built from the same base.
base_url = url
# Visit every results page, including the last one.
for i in range(1, nbPage + 1):
    print("Processing web page: " + str(i))
    if i > 1:
        # Page 1 is already loaded above; later pages are fetched on demand.
        url = base_url + "&page=" + str(i)
        r = requests.get(url)
        data = r.text
        soup = BeautifulSoup(data, "html.parser")
    # Each car offer link on the current page is saved into carLinks.
    for link in soup.select('#listContainer > div > section > div > tr > a'):
        href = link.get('href')
        if not href:
            continue
        # urljoin resolves protocol-relative ("//host/...") and relative hrefs
        # and leaves absolute URLs untouched.
        carLinks.add(urllib.parse.urljoin(url, href))
# Model years are converted to an age relative to this reference year.
REFERENCE_YEAR = 2017


def _digits_to_int(text, default=0):
    """Return the ASCII digits of *text* as an int, or *default* if it has none.

    Tolerates thousands separators, units and currency signs ("92,000",
    "65000 KM") as well as non-numeric values such as "Low".
    """
    digits = "".join(ch for ch in text if "0" <= ch <= "9")
    return int(digits) if digits else default


# For each collected car link, load the detail page and extract four features.
for carLink in carLinks:
    print("Processing car page: " + carLink)
    # Load the car page.
    r = requests.get(carLink)
    data = r.text
    soup = BeautifulSoup(data, "html.parser")
    # Defaults are kept when the page does not list the attribute.
    km = 0
    transmission = ""
    age = 0
    price = 0
    # NOTE(review): assumes the attributes sit in table.details as alternating
    # label (th) / value (.value) cells -- confirm against the live markup.
    cells = iter(soup.select("table.details tr th, table.details .value"))
    # Zipping one iterator with itself consumes the matches two at a time.
    for label_cell, value_cell in zip(cells, cells):
        label = label_cell.get_text(strip=True)
        value = value_cell.get_text(strip=True)
        if label == 'Mileage':
            km = _digits_to_int(value)
        elif label == 'Transmission':
            transmission = value
        elif label == 'Year':
            year = _digits_to_int(value, default=None)
            if year is not None:
                age = REFERENCE_YEAR - year
    # NOTE(review): assumes the price is the text of div.pricelabel -- confirm.
    price_node = soup.select_one("div.pricelabel")
    if price_node is not None:
        price = _digits_to_int(price_node.get_text())
    # Each car is a list of four features appended to data_set.
    data_set.append([km, transmission, age, price])
# Save data_set to car_features.csv: one header row, then one row per car.
# newline='' lets the csv module control line endings (otherwise blank rows
# appear on Windows); the with-block closes the file even if a write fails.
with open('car_features.csv', 'w', newline='', encoding='utf-8') as fl:
    writer = csv.writer(fl)
    writer.writerow(['km', 'transmission', 'age', 'price'])
    writer.writerows(data_set)
答案 0 :(得分:0)
该网站本身有严重缺陷:如果不断点击“下一页”,浏览器最终会陷入无限加载。以我的测试为例,直接在地址栏粘贴第 501 页的链接会被送回第 500 页;而在第 500 页点击“下一页”时,虽然能看到第 501 页,却会进入永不结束的加载循环。因此,我们利用“请求下一页却被重定向回上一页”这一点来终止循环。
另外,我使用的是 lxml.html 配合 cssselect;如果你愿意,也可以继续使用 bs4,逻辑是相同的,但我强烈建议使用 lxml。依赖项的说明见此处(here),此外还需要执行 pip install cssselect:
import requests
from lxml import html
from typing import Iterator
# Search-results URL passed to start_request as the crawl entry point; the
# URL-encoded query string filters on search[filter_enum_model][0]=toyota/camry.
url = "https://www.olx.com.ng/vehicles/cars/toyota/?search%5Bfilter_enum_model%5D%5B0%5D=toyota%2Fcamry"
def parse_data(node_: html.Element, price: str) -> dict:
    """Build a dict of car attributes from one car's details page.

    ``node_`` is the parsed details page and ``price`` is the price text
    taken from the listing page. The result always has a ``"price"`` key,
    plus one key per label/value pair found in the details table.
    """
    # Start from the listing-page price, trimming "₦" and spaces at both ends.
    details = {"price": price.strip("₦ ")}
    # The attributes are read from the first table.details element.
    table = node_.cssselect("table.details")[0]
    # Zipping one iterator with itself consumes the matches two at a time;
    # each pair is presumably a label (th) followed by its value (.value).
    cells = iter(table.cssselect("tr th, .value"))
    for label_cell, value_cell in zip(cells, cells):
        # Lower-case the label and join its words: "Type of car" -> type_of_car.
        key = "_".join(label_cell.text.lower().split())
        # Concatenate every text node under the value cell, then trim.
        value = "".join(value_cell.xpath(".//text()")).strip()
        details[key] = value
    return details
def get_link_and_price(s: requests.Session, node_: html.Element) -> Iterator[dict]:
    """Yield the parsed details dict for every offer cell found in ``node_``.

    For each ``table.offers td.offer`` cell, the offer link and price text are
    read, the linked page is fetched through session ``s``, and the result of
    ``parse_data`` for that page is yielded.
    """
    for offer in node_.cssselect("table.offers td.offer"):
        link = offer.cssselect("a.link")[0].get("href")
        price = offer.cssselect(".price strong")[0].text
        details_page = html.fromstring(s.get(link).content)
        yield parse_data(details_page, price)
def start_request(url: str) -> Iterator[dict]:
    """Crawl the listing pages starting at ``url`` and yield one dict per car.

    Every page is fetched with a single shared ``requests.Session``. After a
    page's offers have been yielded, the last ``a.pageNextPrev`` link is
    followed. Crawling stops when the page has no such link, the link has no
    ``href``, or following it lands back on the URL of the page just
    processed (e.g. ``?page=501`` ending up at ``?page=500``).
    """
    with requests.Session() as s:
        response = s.get(url)
        while True:
            node = html.fromstring(response.content)
            # Yield from the per-page iterator, i.e. one dict of details per car.
            yield from get_link_and_price(s, node)
            pager_links = node.cssselect("a.pageNextPrev")
            if not pager_links:
                # No pagination links at all (e.g. a single page of results).
                return
            next_page = pager_links[-1].get("href")
            if not next_page:
                return
            following = s.get(next_page)
            # The site keeps offering a "next" link past the last page; asking
            # for it ends up back at the current URL, which is the stop signal.
            if following.url == response.url:
                return
            response = following
# Print each car's details dict as soon as the crawler yields it.
for record in start_request(url):
    print(record)
输出片段:
{'price': '3,300,000', 'offer_from': 'Individual', 'year': '2008', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': '65000'}
{'price': '3,000,000', 'offer_from': 'Individual', 'year': '2007', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': '86500'}
{'price': '4,200,000', 'offer_from': 'Individual', 'year': '2013', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': '39011'}
{'price': '4,500,000', 'offer_from': 'Business', 'year': '2010', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': '93000'}
{'price': '890,000', 'offer_from': 'Business', 'year': '2001', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': '110000'}
{'price': '1,050,000', 'offer_from': 'Individual', 'year': '2005', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': 'Low'}
{'price': '1,500,000', 'offer_from': 'Individual', 'year': '2008', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': 'Low'}
{'price': '1,150,000', 'offer_from': 'Individual', 'year': '2002', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': '167'}
{'price': '2,200,000', 'offer_from': 'Individual', 'year': '2010', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': '24689'}
{'price': '1,050,000', 'offer_from': 'Individual', 'year': '2004', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': '92,000'}