用美丽的汤和Python抓取到CSV

时间:2020-01-04 00:21:30

标签: python pandas selenium beautifulsoup export-to-csv

尝试使用Beautiful Soup和Selenium从房地产网站上的列表中抓取地面尺寸(平方英尺)和地块尺寸(公顷)。

地板尺寸在控制台中可以正常打印

image

但是在写入csv文件时,不会提取地板尺寸栏下的“ sq ft”信息

image

似乎BS4在ID元素中规定了“ sq ft”之后,似乎返回了该值,而是在写入csv时将所有其他“ sq ft”文本传递给了其他所有url。正如您在(image)上看到的那样,尽管其中两个链接也都有公顷,但其中两个清单具有此功能:

http://property.shw.co.uk/propertyInfo/11080/145151-London-Road-Croydon--CR0-2RG http://property.shw.co.uk/propertyInfo/16162/Public-HouseRestaurant-Site-Westvale-Park-Horley-Surrey--RH6-0HJ

有人可以解释为什么平方英尺在控制台上打印但未写入到CSV吗?任何帮助将不胜感激。

相关HTML,其中CP2_CPContent_conDetails1_divDetails是楼层大小和批量的相关定位器:

<div id="CP2_CPContent_conDetails1_divDetails">
                0.3 Acres <br>(0.12 Hectares)
                <div class="clear"></div>

                <div id="CP2_CPContent_conDetails1_divDes" class="divInfo">
                      Potential building size of 6,458 sq ft (600 sq m)<br>
                </div>

代码如下:

driver = webdriver.Chrome()
shw_search_url = "http://property.shw.co.uk/searchproperties/Level2-0/Level1-0-181-236-167-165/Units/Development-or-House-and-Flat-or-Investment-or-Land-or-Office-or-Other/UnitIds-0/For-Sale"
driver.get(shw_search_url)


#identify and extract listing links from each page
def get_house_links(url, driver, pages=3):
    house_links = []
    driver.get(url)
    for i in range(pages):
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        listings = soup.find_all("a", class_="L")
        page_data = [row['href'] for row in listings]
        house_links.append(page_data)
        time.sleep(np.random.lognormal(0, 1))
        next_button = soup.select('img[src*="propNext"]')
        if next_button:
            next_button = next_button[0].find_parent('a')
            next_button_link = 'http://property.shw.co.uk' + next_button['href']
            driver.get(next_button_link)
    return house_links

#get html data from url and return as object
def get_html_data(url, driver):
    driver.get(url)
    time.sleep(np.random.lognormal(0,1))
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    return soup

def get_lot_size(soup):
    try:
        for element in soup.find_all('div', {'id':'CP2_CPContent_conDetails1_divDetails'}):
            lot_size = element.find_next(text=re.compile('Hectares'))
        lot_size = lot_size.replace("(", "").replace(")", "")
        print(lot_size)
        return lot_size
    except:
        return 'NA'

def get_floor_size(soup):
    try:
        for element in soup.find('div', {'id': 'CP2_CPContent_conDetails1_divDetails'}):
            floor_size = element.find_next(text=re.compile('sq ft'))
        print(floor_size)
        return floor_size
    except:
        return 'NA'

def flatten_list(house_links):
    house_links_flat = []
    for sublist in house_links:
        for item in sublist:
            house_links_flat.append(item)
    return house_links_flat

def get_house_data(driver, house_links_flat):
    house_data = []
    for link in house_links_flat:
        soup = get_html_data(link, driver)
        floor_size = get_floor_size(soup)
        lot_size = get_lot_size(soup)
        house_data.append([floor_size, lot_size])

    return house_data

house_links_3pages = get_house_links(shw_search_url,driver,pages=3)
house_links_flat = flatten_list(house_links_3pages)
house_data_3pages = get_house_data(driver,house_links_flat)


#open and write results to csv
file_name = "SHW %s_%s.csv" % (str(time.strftime("%Y-%m-%d")),
                           str(time.strftime("%H:%M%S")))
columns = ["Floor_Size", "Lot_Size"]
pd.DataFrame(house_data_3pages, columns = columns).to_csv(
    file_name, index = False, encoding = "UTF-8"
)

1 个答案:

答案 0 :(得分:1)

使用您的代码获取Hectares没问题。

我对sq ft有疑问-它甚至没有显示它。全部是因为您在

中使用find()而不是find_all()
 for element in soup.find()

但是find()不会返回包含元素的列表,而是单个元素,然后for不会从列表中获取此元素,但是它可能会获取其子元素,并且搜索错误的sq ft地方。


from selenium import webdriver
import numpy as np
import time
import re
from bs4 import BeautifulSoup
import pandas as pd

driver = webdriver.Chrome()
shw_search_url = "http://property.shw.co.uk/searchproperties/Level2-0/Level1-0-181-236-167-165/Units/Development-or-House-and-Flat-or-Investment-or-Land-or-Office-or-Other/UnitIds-0/For-Sale"
driver.get(shw_search_url)


#identify and extract listing links from each page
def get_house_links(url, driver, pages=3):
    house_links = []
    driver.get(url)
    for i in range(pages):
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        listings = soup.find_all("a", class_="L")
        page_data = [row['href'] for row in listings]
        house_links.append(page_data)
        time.sleep(np.random.lognormal(0, 1))
        next_button = soup.select('img[src*="propNext"]')
        if next_button:
            next_button = next_button[0].find_parent('a')
            next_button_link = 'http://property.shw.co.uk' + next_button['href']
            driver.get(next_button_link)
    return house_links

#get html data from url and return as object
def get_html_data(url, driver):
    driver.get(url)
    time.sleep(np.random.lognormal(0,1))
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    return soup

def get_lot_size(soup):
    try:
        for element in soup.find_all('div', {'id':'CP2_CPContent_conDetails1_divDetails'}):
            lot_size = element.find_next(text=re.compile('Hectares'))
            if lot_size:
                lot_size = lot_size.replace("(", "").replace(")", "")
                lot_size = lot_size.strip()
            print('lot_size:', lot_size)
        return lot_size
    except Exception as ex:
        print("EX:", ex)
        return 'NA'

def get_floor_size(soup):
    try:
        for element in soup.find_all('div', {'id': 'CP2_CPContent_conDetails1_divDetails'}):
            floor_size = element.find_next(text=re.compile('sq ft'))
            if floor_size:
                floor_size = floor_size.strip()
            print('floor_size:', floor_size)
        return floor_size
    except Exception as ex:
        print("EX:", ex)
        return 'NA'

def flatten_list(house_links):
    house_links_flat = []
    for sublist in house_links:
        for item in sublist:
            house_links_flat.append(item)
    return house_links_flat

def get_house_data(driver, house_links_flat):
    house_data = []
    for link in house_links_flat:
        soup = get_html_data(link, driver)
        floor_size = get_floor_size(soup)
        lot_size = get_lot_size(soup)
        house_data.append([floor_size, lot_size])
        print('-------------------')

    return house_data

house_links_3pages = get_house_links(shw_search_url,driver,pages=3)
house_links_flat = flatten_list(house_links_3pages)
house_data_3pages = get_house_data(driver,house_links_flat)


#open and write results to csv
file_name = "SHW %s_%s.csv" % (str(time.strftime("%Y-%m-%d")),
                           str(time.strftime("%H:%M%S")))
columns = ["Floor_Size", "Lot_Size"]
pd.DataFrame(house_data_3pages, columns = columns).to_csv(
    file_name, index = False, encoding = "UTF-8"
)

CSV:

Floor_Size,Lot_Size
,0.21 Hectares
7342 sq ft,
1665 sq ft,
"The existing property extends to approximately 2,290 sq m (24,649 sq ft) GIA and sits within an L-shaped site extending to approximately 0.6 acres (0.25 hectares). Fronting London Road is a four storey commercial building, built as a garage with offices above which is currently occupied by a motor company at ground floor level, and by a church across the upper floors and basement. To the rear of the site fronting Montague Road are a number of single storey industrial buildings, currently occupied by a hand carwash. The remainder of the front forecourt and rear of the site is hard standing, predominantly used as car parking.",0.25 Hectares
4672 to 20302 sq ft,
,0.36 Hectares
,0.08 Hectares
,0.18 Hectares
2325 sq ft,
,0.02 Hectares
5288 sq ft,
0 sq ft,
,0.36 Hectares
,0.18 Hectares
"*  Potential building size of 6,458 sq ft (600 sq m)",0.12 Hectares
1258 to 5385 sq ft,
,0.13 Hectares
3600 sq ft,
,0.24 Hectares
6781 to 6871 sq ft,