Scraping multiple web pages with BeautifulSoup when the first page's link differs from the others

Asked: 2017-07-05 16:35:04

Tags: python web-scraping beautifulsoup web-crawler urllib

I am trying to scrape this page for a project I am working on. I want to get the details of every car (price, mileage, transmission and age) across all of the pages. The problems I am running into with the code below are:

  1. The link of the first page is different from the links of the other pages (it carries no page number, i.e. no &page=1 parameter); one way to handle this is sketched right after this list.
  2. After clicking into each ad to get the details, the car price is not inside the table.
  3. I would appreciate it if someone could look into this and offer suggestions. Thanks.
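
One way to handle problem 1 would be to special-case the first page: request the bare listing URL for page 1 and only append the page parameter from page 2 onwards. A minimal sketch (build_page_url and page_urls are hypothetical helpers; discovering the total page count is a separate step):

        BASE_URL = ("https://www.olx.com.ng/vehicles/cars/toyota/"
                    "?search%5Bfilter_enum_model%5D%5B0%5D=toyota%2Fcamry")

        def build_page_url(page):
            # page 1 is the bare listing URL; later pages carry &page=N
            if page == 1:
                return BASE_URL
            return BASE_URL + "&page=" + str(page)

        def page_urls(last_page):
            # yield the URL of every listing page, the first page included
            for page in range(1, last_page + 1):
                yield build_page_url(page)

My full attempt is below: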

        from bs4 import BeautifulSoup
        import requests
        import urllib.parse
        import csv
    
        # the Toyota Camry model page is used
        url = "https://www.olx.com.ng/vehicles/cars/toyota/?search%5Bfilter_enum_model%5D%5B0%5D=toyota%2Fcamry"
        r = requests.get(url)
        data = r.text
    
        soup = BeautifulSoup(data, "html.parser")
        carLinks = set()
        pageLinks = set()
        data_set = []
    
        parsed = urllib.parse.urlparse(soup.select('a')[0].get('href'))
        nbPage = urllib.parse.parse_qs(parsed.query)['page'][1]
        print("There are " + str(nbPage) + " web pages to process")
    
        # for each web page that contains a grid of car offers
        for i in range(1, int(nbPage), 1):

            print("Processing web page: " + str(i))

            # each car offer link is saved into the carLinks set
            for link in soup.select('#listContainer > div > section > div > tr > a'):
                carLinks.add(link.get('href').replace("//", "http://"))

            # the next page url is set
            url = "https://www.olx.com.ng/vehicles/cars/toyota/?search%5Bfilter_enum_model%5D%5B0%5D=toyota%2Fcamry&page=" + str(i)
            r = requests.get(url)
            data = r.text
            soup = BeautifulSoup(data, "html.parser")

        # for each car link
        for carLink in carLinks:

            print("Processing car page: " + carLink)

            # we load the car page
            r = requests.get(carLink)
            data = r.text
            soup = BeautifulSoup(data, "html.parser")
            km = 0
            transmission = ""
            age = 0
            price = 0

            # for each attribute of the car
            for info in soup.select("table.item tr div.pricelabel"):

                # we keep the ones that we need
                if info.select('.item')[0].text == u'Mileage':
                    km = int(info.select('.value')[0].text.replace(" ", "").replace("KM", ""))
                if info.select('.item')[0].text == u'Transmission':
                    transmission = info.select('.value')[0].text
                if info.select('.item')[0].text == u'Year':
                    age = 2017 - int(info.select('.value')[0].text)
                if info.select('.pricelabel')[0].text == u'Price':
                    price = int(info.select('.pricelabel')[0].text.replace(" ", "").replace(u"₦", ""))

            # each car is an array of four features added to the data_set
            data_set.append([km, transmission, age, price])

        # the data_set is saved into the CSV file
        fl = open('car_features.csv', 'w')
        writer = csv.writer(fl)
        writer.writerow(['km', 'transmission', 'age', 'price'])
        for values in data_set:
            writer.writerow(values)

        fl.close()
    

1 Answer:

Answer 0: (score: 0)

The site is badly broken: if you keep going to the next page, you eventually end up stuck in a loading loop in the browser. For me, page 501 takes you back to 500 if you paste its URL directly, and if I follow the next link from 500, where I can see 501, I get a loading loop that never ends. We use being redirected back to the previous page to terminate our loop.
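
That break condition is easy to check with requests alone: redirects are followed automatically, so response.url holds the final URL after any redirect. A minimal sketch of just that check (reached_last_page is a hypothetical helper, separate from the full code below):

import requests

def reached_last_page(s: requests.Session, current_url: str, next_url: str) -> bool:
    # requests follows redirects, so if fetching the "next" page lands us
    # back on the page we came from, the pagination has wrapped around
    response = s.get(next_url)
    return response.url == current_url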

I also used lxml.html together with cssselect; you can use bs4 if you prefer, the logic is the same, but I strongly recommend lxml. The deps are outlined here, and you will also need to pip install cssselect.
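
Both libraries expose essentially the same CSS-selector interface, so porting between them is mechanical. A minimal sketch of the equivalence (the HTML snippet and the listing id are made up for illustration):

from bs4 import BeautifulSoup
from lxml import html

page = "<div id='listing'><a class='link' href='/car1'>Car 1</a></div>"

# lxml + cssselect
node = html.fromstring(page)
links = [a.get("href") for a in node.cssselect("#listing a.link")]

# the same query with bs4
soup = BeautifulSoup(page, "html.parser")
links_bs4 = [a.get("href") for a in soup.select("#listing a.link")]

assert links == links_bs4 == ["/car1"]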

import requests
from lxml import html
from typing import Iterator

url = "https://www.olx.com.ng/vehicles/cars/toyota/?search%5Bfilter_enum_model%5D%5B0%5D=toyota%2Fcamry"


def parse_data(node_: html.Element, price: str) -> dict:
    """Parses the details section per individual car details page and returns a dict"""

    # Price we pulled from main page.
    details = {"price": price.strip("₦ ")}

    # Details are in a table.
    details_table = node_.cssselect("table.details")[0]

    # The th has the description, the anchor has the value.
    # we lower case and join the description, i.e "Type of car" -> type_of_car.
    data = iter(details_table.cssselect("tr th, .value"))
    details.update(("_".join(th.text.lower().split()), "".join(td.xpath(".//text()")).strip())
                   for th, td in (zip(data, data)))
    return details


def get_link_and_price(s: requests.Session, node_: html.Element) -> Iterator[dict]:
    """Gets the link and the associated price from each tr."""

    for child in node_.cssselect("table.offers td.offer"):
        link, price = child.cssselect("a.link")[0].get("href"), child.cssselect(".price strong")[0].text

        yield (parse_data(html.fromstring(s.get(link).content), price))


def start_request(url: str):
    with requests.Session() as s:
        get_ = s.get(url)
        node = html.fromstring(get_.content)

        # yield from subsequent iterators, i.e a dict of details.
        yield from get_link_and_price(s, node)

        # The site is broken, you click the next page button,
        #  and eventually you get stuck in a loading loop.
        # At some stage the next should disappear,
        # or only go back but it is wrongly implemented.
        # This will stop when we try a next page,
        #  and end up back at the current url i.e ?page=501 -> ?page=500.
        current_url = get_.url
        next_page = node.cssselect("a.pageNextPrev")[-1].get("href")
        get_next = s.get(next_page)
        node = html.fromstring(get_next.content)

        # Keep going through pages till our break condition is met.
        while current_url != get_next.url:
            node = html.fromstring(get_next.content)
            yield from get_link_and_price(s, node)
            current_url = get_next.url
            next_page = node.cssselect("a.pageNextPrev")[-1].get("href")
            get_next = s.get(next_page)

for dict_ in start_request(url):
    print(dict_)
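
If you still want the CSV file from your original code, the dicts this generator yields can be written out with csv.DictWriter. A minimal sketch (the field names are taken from the output below; extrasaction="ignore" drops any key not in that list):

import csv

fields = ["price", "offer_from", "year", "transmission",
          "model", "type_of_car", "mileage"]

with open("car_features.csv", "w", newline="") as fl:
    writer = csv.DictWriter(fl, fieldnames=fields, extrasaction="ignore")
    writer.writeheader()
    for dict_ in start_request(url):
        writer.writerow(dict_)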

A snippet of the output:

{'price': '3,300,000', 'offer_from': 'Individual', 'year': '2008', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': '65000'}
{'price': '3,000,000', 'offer_from': 'Individual', 'year': '2007', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': '86500'}
{'price': '4,200,000', 'offer_from': 'Individual', 'year': '2013', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': '39011'}
{'price': '4,500,000', 'offer_from': 'Business', 'year': '2010', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': '93000'}
{'price': '890,000', 'offer_from': 'Business', 'year': '2001', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': '110000'}
{'price': '1,050,000', 'offer_from': 'Individual', 'year': '2005', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': 'Low'}
{'price': '1,500,000', 'offer_from': 'Individual', 'year': '2008', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': 'Low'}
{'price': '1,150,000', 'offer_from': 'Individual', 'year': '2002', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': '167'}
{'price': '2,200,000', 'offer_from': 'Individual', 'year': '2010', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': '24689'}
{'price': '1,050,000', 'offer_from': 'Individual', 'year': '2004', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': '92,000'}