Scraping data from links within links: Python Beautiful Soup

Date: 2020-09-22 16:48:32

Tags: python pandas beautifulsoup

I created a script that pulls data from the results pages for Michigan properties on Landwatch.com.
Here is my script:

import requests
from bs4 import BeautifulSoup


headers = {'User-Agent':
           'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'}
n_pages = 0
desc = []
for page in range(1, 15000):
    n_pages += 1
    sapo_url = 'https://www.landwatch.com/Michigan_land_for_sale/Land/Page-' + str(page)
    r = requests.get(sapo_url, headers=headers)
    page_html = BeautifulSoup(r.text, 'html.parser')
    house_containers = page_html.find_all('div', class_="propName")
    if house_containers:
        for container in house_containers:
            desc.append(container.getText(strip=True))
    else:
        break  # stop at the first page with no listings

print('you scraped {} pages containing {} Properties'.format(n_pages, len(desc)))



import pandas as pd
df = pd.DataFrame({'description': desc}) 
import pyodbc
sql_conn = pyodbc.connect(r'DRIVER={SQL Server};SERVER=LAPTOP-5LE89UIK\MSSQLSERVER01;DATABASE=Dave;')
cursor = sql_conn.cursor()
cursor.execute('CREATE TABLE Dave..Property1 (descrip varchar(max), Acreage float, Price money, City varchar(max), County varchar(max), priceperacre float)')
sql_conn.commit()

for row in df.itertuples():
    cursor.execute('''
             INSERT INTO Dave..Property1 (descrip)
             VALUES (?); 
            ''', 
            row.description
            )
sql_conn.commit()
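
As an aside, one execute() call per row gets slow once desc grows to thousands of entries. A batched alternative is a minimal sketch using pyodbc's executemany; fast_executemany is assumed to be supported by your ODBC driver:

# Sketch: batch insert instead of one execute() per row.
# fast_executemany speeds this up, assuming the ODBC driver supports it.
cursor.fast_executemany = True
cursor.executemany(
    'INSERT INTO Dave..Property1 (descrip) VALUES (?)',
    [(d,) for d in desc]
)
sql_conn.commit()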

sql = ("Update Dave..Property1 SET Acreage = cast(REPLACE(SUBSTRING(descrip, 1, charindex('Acre', descrip, 1)), 'A', '') as float), "
       "Price = (CASE WHEN charindex('$', descrip) = 0 THEN NULL ELSE SUBSTRING(descrip, charindex('$', descrip), LEN(descrip)) END),"
       "City = CASE WHEN charindex(',', descrip) = 0 THEN NULL ELSE left(substring(descrip, charindex(' ', descrip) + 9, charindex(',', descrip)), "
        "                                       charindex(',', substring(descrip, "
                                                "charindex(' ', descrip) + 9, charindex(',', descrip))))"
                                                "END, "
                                             
                                                
        "COUNTY = CASE WHEN charindex(',', descrip) = 0 OR Charindex('$', descrip) = 0 THEN NULL ELSE "
        "REPLACE(SUBSTRING(descrip, charindex(',', descrip) + 1, charindex(', MI$', descrip) - charindex(',', descrip)), ',', '') END" 
       
       
       )

cursor.execute(sql)
sql_conn.commit()
sql_clean = ("Update Dave..Property1 SET CITY = LTRIM(RTRIM(REPLACE(CITY, ',', ''))), "
              "COUNTY = SUBSTRING(COUNTY, 1, (CASE WHEN charindex (',', COUNTY) = 0 THEN LEN(COUNTY) ELSE  " 
             "(charindex(',', COUNTY)-1) END))")
cursor.execute(sql_clean)
sql_conn.commit()   

sql_addperacre = ("UPDATE Dave..Property1 SET pricePerAcre = price / CASE WHEN Acreage = 0 OR acreage IS NULL THEN 1 ELSE Acreage END")
cursor.execute(sql_addperacre) 
sql_conn.commit()

sql_removeCounty = ("Update Dave..Property1 SET COUNTY = REPLACE(County, 'County', '')")
cursor.execute(sql_removeCounty) 
sql_conn.commit()

sql_removeCountySpace = ("Update Dave..Property1 SET County = ltrim(rtrim(County))")
cursor.execute(sql_removeCountySpace) 
sql_conn.commit()

import numpy as np
from plotnine import *

upSQL = ("SELECT pricePerAcre, city, county from dave..property1 WHERE COUNTY IN ('Alger', 'Marquette', 'DIckinson', 'Gogebic', 'Iron', 'Houghton', 'Ontonagon', 'Menominee', 'Delta', 'Schoolcraft', 'Luce', 'Mackinac', 'Chippewa')")
UPSQL_Query = pd.read_sql_query(upSQL, sql_conn)
updf = pd.DataFrame(UPSQL_Query, columns=['pricePerAcre', 'city', 'county'])
ggplot(updf, aes(x= 'county', y= 'pricePerAcre', fill= 'county')) + geom_violin() + ylim(0, 10000)
# df.to_csv('test4.csv', encoding = 'utf-8')
# df = pd.DataFrame(house_containers, columns = ['desc'])
# df.to_csv ('test.csv')
# print(df)

I'd like to extend the script so it opens each property's link and pulls the geographic coordinates embedded in each page. I believe I need another loop that opens every URL in the list (each unique URL contains the PID), but I'm not sure how to start. Any advice would be appreciated.

1 Answer:

Answer 0 (score: 0)

To get a property's coordinates, you can parse the Google Maps URL inside its <iframe>. For example:

import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, parse_qs

# same User-Agent header as in the original script above
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'}


for page in range(1, 10):
    print('Page {}...'.format(page))
    print()

    url = 'https://www.landwatch.com/Michigan_land_for_sale/Land/Page-{}'.format(page)
    soup = BeautifulSoup(requests.get(url, headers=headers).content, 'html.parser')
    
    for a in soup.select('.propName a'):
        prop_url = 'https://www.landwatch.com' + a['href']
        prop_soup = BeautifulSoup(requests.get(prop_url, headers=headers).content, 'html.parser')

        google_maps_url = prop_soup.select_one('#iframe-map iframe')

        print(prop_soup.h1.text)
        if google_maps_url:
            print(parse_qs(urlparse(google_maps_url['src']).query).get('q'))
        print('-' * 80)

Prints:

...

--------------------------------------------------------------------------------
Marion, Osceola County, MI Land For Sale - 145 Acres
['44.0227,-85.1752']
--------------------------------------------------------------------------------
Nashville, Barry County, MI Land For Sale - 1.36 Acres
['42.5965,-85.09523']
--------------------------------------------------------------------------------

... and so on.
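
As a follow-up: if you want the coordinates as numbers rather than a 'lat,lon' string, you can collect them into a list while crawling and build a DataFrame at the end. A minimal sketch; the results accumulator and the column names here are illustrative, not part of the original code:

import pandas as pd

results = []  # illustrative accumulator; fill it inside the property loop

# inside the loop, after locating google_maps_url:
coords = parse_qs(urlparse(google_maps_url['src']).query).get('q')
if coords:
    # parse_qs returns e.g. {'q': ['44.0227,-85.1752']}
    lat, lon = (float(x) for x in coords[0].split(','))
    results.append({'title': prop_soup.h1.text, 'lat': lat, 'lon': lon})

# after the loop:
coords_df = pd.DataFrame(results)

From there, coords_df can be written to SQL Server the same way as the descriptions.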