我目前在做一个 Web 抓取项目，但遇到了困难：在翻页浏览时，该网站的 URL 不会改变。
我的目标是抓取这两页中的所有建筑物信息。
目前我唯一能抓到数据的方法，是用浏览器的检查（Inspect）工具，把包裹所有广告的外层元素（wrapper）复制下来。
这是我的代码:
from bs4 import BeautifulSoup
import requests
import csv
import string
import glob
# Fetch the listing page and parse it into a soup.
source = requests.get(
    "https://www.centris.ca/fr/triplex~a-vendre~montreal-mercier-hochelaga-maisonneuve?uc=1&view=Thumbnail")
soup = BeautifulSoup(source.content, 'html.parser')

# BUG FIX: the counter was incremented below without ever being
# initialised, which raises NameError on the first matching ad.
cnt = 0

# Loop through all the ads on the page.
for ad in soup.find_all('div', {"data-id": "templateThumbnailItem"}):
    # BUG FIX: the original tested soup.find(...), which matches the first
    # price anywhere in the document and is therefore always truthy once
    # any ad has a price. Test THIS ad instead so ads without a price
    # are skipped (their price span would otherwise raise AttributeError).
    if ad.find('div', {"class": "price"}):
        # Address: first child of the <span class="address"> element.
        address = ad.find('span', {"class": "address"})
        address = address.findChild().text.strip()
        # District: second child of the same address element.
        district = ad.find('span', {"class": "address"})
        district = district.findChildren()[1].text.strip()
        # Building type, truncated to the first 7 chars (e.g. "Triplex").
        typeBuilding = ad.find('span', {"class": "category"}).text.strip()
        typeBuilding = typeBuilding[0:7].strip()
        # Price: drop the currency sign and the non-breaking-space
        # thousands separator, then convert to int.
        price = ad.find('span', {"itemprop": "price"}).text
        price = price.replace('$', '').replace(u'\xa0', '')
        price = int(price)
        cnt = cnt + 1
        print(f'Adresse: {address}, Quartier: {district}, Type: {typeBuilding}, Prix: {price}$')
感谢您的帮助!
答案 0（得分：0）：
import requests
from bs4 import BeautifulSoup
import csv
def main(url, pages=2, per_page=20):
    """Scrape Centris listing thumbnails via the paging POST endpoint.

    The site's URL never changes when paging through results; instead the
    page fetches fragments from a POST endpoint with a 'startPosition'
    offset. We replay that request directly and parse the returned HTML.

    url      -- the GetInscriptions POST endpoint.
    pages    -- number of result pages to fetch (default 2, as originally
                hard-coded via range(0, 40, 20)).
    per_page -- listings per page / offset step (default 20).

    Writes an Address/Quartier/Type/Price CSV to data.csv.
    """
    with requests.Session() as req:
        # Priming GET: establishes the session cookies the POST endpoint
        # relies on. Its response body is intentionally unused (the
        # original bound it to `r` and immediately overwrote it).
        req.get(
            "https://www.centris.ca/fr/triplex~a-vendre~montreal-mercier-hochelaga-maisonneuve?uc=1&view=Thumbnail")
        with open("data.csv", 'w', newline="", encoding="UTF-8") as f:
            writer = csv.writer(f)
            writer.writerow(["Address", "Quartier", "Type", "Price"])
            for num in range(0, pages * per_page, per_page):
                data = {'startPosition': num}
                # The endpoint answers JSON; d.Result.html holds an HTML
                # fragment with the listing thumbnails for this offset.
                r = req.post(url, json=data).json()
                html = r["d"]["Result"]["html"]
                soup = BeautifulSoup(html, 'html.parser')
                # Prices as group-separated integers, e.g. "1,234,567".
                prices = [format(int(price.get("content")), ',d')
                          for price in soup.findAll("span", itemprop="price")]
                block = soup.findAll("div", class_="location-container")
                ty = [ty.div.get_text(strip=True) for ty in block]
                add = [add.select_one("span.address div").text
                       for add in block]
                quartier = [quar.select_one("span.address div:nth-child(2)").text
                            for quar in block]
                writer.writerows(zip(add, quartier, ty, prices))


if __name__ == "__main__":
    # Guarded entry point so importing this module does not trigger
    # network access.
    main("https://www.centris.ca/Mvc/Property/GetInscriptions")
输出：View Online（在线查看）