Web scraping with Python using BeautifulSoup to collect information about properties

Date: 2019-03-18 03:42:12

Tags: python html web-scraping beautifulsoup web-crawler

This is my first time doing web scraping. What I want to do is get all the information I can about each property (location, price, etc.). So what I have put together so far from some googling is this:

import requests
from bs4 import BeautifulSoup
from pprint import pprint as pp

rootURL = 'https://www.jawitz.co.za/'
response = requests.get(rootURL)                                                   
html = response.content
soup = BeautifulSoup(html,'lxml')

dropdown_list = soup.select(".primary .child-pages a")
cityLinks = [rootURL + dropdown_list_value['href'] for dropdown_list_value in dropdown_list]

# params for our request
params = {"province": "Western Cape", 
          "suburb": "Van Riebeeck Park", 
          "region": "Worcester", 
          "id": 929,
          "property_type": "Apartment"}

for city in cityLinks:  # Looping each city from the Apartment drop down list
    with requests.Session() as s:
        r = s.get(city)
        soup = BeautifulSoup(r.content, 'lxml')  # specify the parser explicitly
        # getting some information should go here

Now, after looking through the page source, I stumbled on some hidden inputs, so I am not sure how the information on this site is actually retrieved. Any suggestions on how to go about this? I am confused about how to access the information that is available in the source.
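For reference, continuing from the soup above, something like this is how I have been listing those hidden inputs (just a rough sketch; that they sit inside a form on the page is my assumption):

# Rough sketch: collect the hidden <input> fields on the page, since their
# name/value pairs would normally be sent along with a search request.
# (Assumption: the hidden inputs live inside a <form> element.)
hidden_inputs = soup.select('form input[type=hidden]')
hidden_data = {inp.get('name'): inp.get('value', '') for inp in hidden_inputs if inp.get('name')}
pp(hidden_data)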

Thanks in advance!

1 Answer:

Answer 0 (score: 0)

You can combine Selenium with bs4:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import math
from bs4 import BeautifulSoup as bs
import pandas as pd

def getData(properties):
    result = []  # one row of fields per property card
    for aProperty in properties:
        soup = bs(str(aProperty), 'lxml')  # re-parse the card so selectors are scoped to it
        price = soup.select_one('.property-price-heading').text if soup.select_one('.property-price-heading') is not None else ''       
        header = soup.select_one('.property-marketing-heading').text if soup.select_one('.property-marketing-heading') is not None else ''        
        beds = soup.select_one('.icon-beds span').text if soup.select_one('.icon-beds span') is not None else ''        
        baths = soup.select_one('.icon-baths span').text if soup.select_one('.icon-baths span') is not None else ''    
        garages = soup.select_one('.icon-garages span').text if soup.select_one('.icon-garages span') is not None else ''
        land = soup.select_one('.property-list-land-size .value').text if soup.select_one('.property-list-land-size .value') is not None else ''
        description = soup.select_one('.results-content').text.strip() if soup.select_one('.results-content') is not None else ''
        ref = soup.select_one('.property-list-webref').text.strip() if soup.select_one('.property-list-webref') is not None else ''
        items = [price, header, beds, baths, garages, land, description, ref ]
        result.append(items)
    return result

varUrl = 'https://www.jawitz.co.za/results/residential/for-sale/cape-town/all/?p={}&advanced_search=1&s=-price'
url = 'https://www.jawitz.co.za/results/residential/for-sale/cape-town/all/?advanced_search=1'

driver = webdriver.Chrome()
driver.get(url)
data =  WebDriverWait(driver,10).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".property-list-details")))
numResults = int(driver.find_element_by_id('id_property_count').text.split(' ')[0])

soup = bs(driver.page_source, 'lxml')
properties = soup.select('.property-list-details')

resultsPerPage = 10
numPages = math.ceil(numResults/resultsPerPage)

results = []

results.append(getData(properties))

if numPages > 1:
    for page in range(2, numPages + 1):
        url = varUrl.format(page)
        driver.get(url)
        data =  WebDriverWait(driver,10).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".property-list-details")))
        soup = bs(driver.page_source, 'lxml')
        properties = soup.select('.property-list-details')
        results.append(getData(properties))
        if page > 3:  #delete after testing
            break  #delete after testing

finalList = [item for sublist in results for item in sublist]
print(finalList)
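As a follow-up, pandas is imported above but never used; presumably the idea was to tabulate the rows. A minimal sketch, assuming column names that match the order built in getData and a hypothetical output file name:

# Minimal sketch: put the flattened rows into a DataFrame and save them.
columns = ['price', 'header', 'beds', 'baths', 'garages', 'land', 'description', 'ref']
df = pd.DataFrame(finalList, columns=columns)  # one row per property card
df.to_csv('jawitz_results.csv', index=False)   # hypothetical output file name
driver.quit()  # close the browser once scraping is done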