This is my first attempt at web scraping. What I want to do is collect all the available information about each property (location, price, etc.). Here is what I have pieced together so far from some Googling:
import requests
from bs4 import BeautifulSoup
from pprint import pprint as pp
rootURL = 'https://www.jawitz.co.za/'
response = requests.get(rootURL)
html = response.content
soup = BeautifulSoup(html,'lxml')
# collect the city links from the site's drop-down navigation
dropdown_list = soup.select(".primary .child-pages a")
cityLinks = [rootURL + dropdown_list_value['href'] for dropdown_list_value in dropdown_list]
# params for our request
params = {"province": "Western Cape",
"suburb": "Van Riebeeck Park",
"region": "Worcester",
"id": 929,
"property_type": "Apartment"}
for city in cityLinks: # Looping each city from the Apartment drop down list
with requests.Session() as s:
r= s.get(city)
soup = BeautifulSoup(r.content)
# getting some information should go here
Now, after looking at the page source, I stumbled upon some hidden input fields, so I'm not sure how information is supposed to be retrieved from this site. Any suggestions on how to proceed? I'm confused about how to access the information that is available in the source.
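For what it's worth, this is roughly how the hidden inputs can be listed with BeautifulSoup (a minimal sketch; it assumes soup holds the parsed page from the loop above and simply prints whatever name/value pairs the page happens to contain):

# Minimal sketch: list the hidden <input> fields on the current page.
# The names and values printed are whatever the page actually contains.
hidden_inputs = soup.select('input[type=hidden]')
for hidden in hidden_inputs:
    print(hidden.get('name'), hidden.get('value'))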
Thanks in advance!
Answer 0 (score: 0)
You can combine Selenium with bs4:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import math
from bs4 import BeautifulSoup as bs
import pandas as pd

def getData(properties):
    # extract the fields of interest from each property card on a results page
    result = []
    for aProperty in properties:
        soup = bs(str(aProperty), 'lxml')
        price = soup.select_one('.property-price-heading').text if soup.select_one('.property-price-heading') is not None else ''
        header = soup.select_one('.property-marketing-heading').text if soup.select_one('.property-marketing-heading') is not None else ''
        beds = soup.select_one('.icon-beds span').text if soup.select_one('.icon-beds span') is not None else ''
        baths = soup.select_one('.icon-baths span').text if soup.select_one('.icon-baths span') is not None else ''
        garages = soup.select_one('.icon-garages span').text if soup.select_one('.icon-garages span') is not None else ''
        land = soup.select_one('.property-list-land-size .value').text if soup.select_one('.property-list-land-size .value') is not None else ''
        description = soup.select_one('.results-content').text.strip() if soup.select_one('.results-content') is not None else ''
        ref = soup.select_one('.property-list-webref').text.strip() if soup.select_one('.property-list-webref') is not None else ''
        items = [price, header, beds, baths, garages, land, description, ref]
        result.append(items)
    return result
varUrl = 'https://www.jawitz.co.za/results/residential/for-sale/cape-town/all/?p={}&advanced_search=1&s=-price'
url = 'https://www.jawitz.co.za/results/residential/for-sale/cape-town/all/?advanced_search=1'
driver = webdriver.Chrome()
driver.get(url)
# wait until at least one property card has been rendered before parsing
data = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".property-list-details")))
numResults = int(driver.find_element(By.ID, 'id_property_count').text.split(' ')[0])
soup = bs(driver.page_source, 'lxml')
properties = soup.select('.property-list-details')
resultsPerPage = 10
numPages = math.ceil(numResults / resultsPerPage)
results = []
results.append(getData(properties))

if numPages > 1:
    # the remaining pages are reachable via the ?p={} query parameter
    for page in range(2, numPages + 1):
        url = varUrl.format(page)
        driver.get(url)
        data = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".property-list-details")))
        soup = bs(driver.page_source, 'lxml')
        properties = soup.select('.property-list-details')
        results.append(getData(properties))
        if page > 3:  # delete after testing
            break     # delete after testing

finalList = [item for sublist in results for item in sublist]
print(finalList)
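Since pandas is already imported above but never used, a natural follow-up is to load the flattened list into a DataFrame for inspection or export. A minimal sketch (the column labels and the output filename are my own illustrative choices, not anything defined by the site or the code above):

# column labels follow the order of the items list built in getData();
# both the labels and the CSV filename are illustrative choices
columns = ['price', 'header', 'beds', 'baths', 'garages', 'land', 'description', 'ref']
df = pd.DataFrame(finalList, columns=columns)
df.to_csv('jawitz_cape_town.csv', index=False)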