Web scraping to get data from a website

Posted: 2017-06-12 05:53:11

Tags: python python-3.x web-scraping beautifulsoup

I am learning Python and trying to scrape a website that shows 10 property listings per page. I want to extract the details from every listing on each page. My code for the first 5 pages is below:

import requests 
from bs4 import BeautifulSoup

urls = []
for i in range(1,5):
    pages = "http://www.realcommercial.com.au/sold/property-offices-retail-showrooms+bulky+goods-land+development-hotel+leisure-medical+consulting-other-in-vic/list-{0}?includePropertiesWithin=includesurrounding&activeSort=list-date&autoSuggest=true".format(i)
    urls.append(pages)
    for info in urls:
         page = requests.get(info)
         soup = BeautifulSoup(page.content, 'html.parser')
         links = soup.find_all('a', attrs ={'class' :'details-panel'})
         hrefs = [link['href'] for link in links]
         Data = []
         for urls in hrefs:
             pages = requests.get(urls)
             soup_2 =BeautifulSoup(pages.content, 'html.parser')
             Address_1 = soup_2.find_all('p', attrs={'class' :'full-address'})
             Address = [Address.text.strip() for Address in Address_1]
             Date = soup_2.find_all('li', attrs ={'class' :'sold-date'})
             Sold_Date = [Sold_Date.text.strip() for Sold_Date in Date]
             Area_1 =soup_2.find_all('ul', attrs={'class' :'summaryList'})
             Area = [Area.text.strip() for Area in Area_1]
             Agency_1=soup_2.find_all('div', attrs={'class' :'agencyName ellipsis'})
             Agency_Name=[Agency_Name.text.strip() for Agency_Name in Agency_1]
             Agent_1=soup_2.find_all('div', attrs={'class' :'agentName ellipsis'})
             Agent_Name=[Agent_Name.text.strip() for Agent_Name in Agent_1]
             Data.append(Sold_Date+Address+Area+Agency_Name+Agent_Name)

The above code does not work for me. Please let me know the correct way to code this.

3 Answers:

Answer 0: (score: 2)

One problem in your code is that you declare the variable "urls" twice: first as the list of page URLs, then again as the loop variable in "for urls in hrefs", which overwrites that list. The page loop is also nested inside the URL-building loop, so pages are re-fetched on every pass. You need to update your code as below:

import requests
from bs4 import BeautifulSoup

urls = []
for i in range(1, 6):
    pages = "http://www.realcommercial.com.au/sold/property-offices-retail-showrooms+bulky+goods-land+development-hotel+leisure-medical+consulting-other-in-vic/list-{0}?includePropertiesWithin=includesurrounding&activeSort=list-date&autoSuggest=true".format(i)
    urls.append(pages)

Data = []
for info in urls:
    page = requests.get(info)
    soup = BeautifulSoup(page.content, 'html.parser')
    links = soup.find_all('a', attrs={'class': 'details-panel'})
    hrefs = [link['href'] for link in links]

    for href in hrefs:
        pages = requests.get(href)
        soup_2 = BeautifulSoup(pages.content, 'html.parser')
        Address_1 = soup_2.find_all('p', attrs={'class': 'full-address'})
        Address = [Address.text.strip() for Address in Address_1]
        Date = soup_2.find_all('li', attrs={'class': 'sold-date'})
        Sold_Date = [Sold_Date.text.strip() for Sold_Date in Date]
        Area_1 = soup_2.find_all('ul', attrs={'class': 'summaryList'})
        Area = [Area.text.strip() for Area in Area_1]
        Agency_1 = soup_2.find_all('div', attrs={'class': 'agencyName ellipsis'})
        Agency_Name = [Agency_Name.text.strip() for Agency_Name in Agency_1]
        Agent_1 = soup_2.find_all('div', attrs={'class': 'agentName ellipsis'})
        Agent_Name = [Agent_Name.text.strip() for Agent_Name in Agent_1]
        Data.append(Sold_Date + Address + Area + Agency_Name + Agent_Name)

print(Data)
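A side note on the final append: concatenating the five lists produces rows without field labels, and the columns shift if any selector matches more or fewer elements than expected. Below is a minimal sketch of exporting the result to CSV; it assumes each selector matched exactly one element per listing, so every row is [sold_date, address, area, agency, agent] (the column names are illustrative, not from the site):

import csv

# Continuation of the script above: write Data out as a CSV file.
# Assumes each selector matched exactly one element per listing, so
# every row is [sold_date, address, area, agency, agent]; extra or
# missing matches would shift the columns.
with open('sold_listings.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['sold_date', 'address', 'area', 'agency', 'agent'])
    writer.writerows(Data)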

Answer 1: (score: 1)

Use headers in your code, and use string concatenation instead of .format(i).

The code would look like this:

import requests
from bs4 import BeautifulSoup

urls = []
for i in range(1, 6):
    pages = 'http://www.realcommercial.com.au/sold/property-offices-retail-showrooms+bulky+goods-land+development-hotel+leisure-medical+consulting-other-in-vic/list-' + str(i) + '?includePropertiesWithin=includesurrounding&activeSort=list-date&autoSuggest=true'
    urls.append(pages)

Data = []
for info in urls:
    headers = {'User-agent': 'Mozilla/5.0'}
    page = requests.get(info, headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')
    links = soup.find_all('a', attrs={'class': 'details-panel'})
    hrefs = [link['href'] for link in links]

    for href in hrefs:
        pages = requests.get(href, headers=headers)
        soup_2 = BeautifulSoup(pages.content, 'html.parser')
        Address_1 = soup_2.find_all('p', attrs={'class': 'full-address'})
        Address = [Address.text.strip() for Address in Address_1]
        Date = soup_2.find_all('li', attrs={'class': 'sold-date'})
        Sold_Date = [Sold_Date.text.strip() for Sold_Date in Date]
        Area_1 = soup_2.find_all('ul', attrs={'class': 'summaryList'})
        Area = [Area.text.strip() for Area in Area_1]
        Agency_1 = soup_2.find_all('div', attrs={'class': 'agencyName ellipsis'})
        Agency_Name = [Agency_Name.text.strip() for Agency_Name in Agency_1]
        Agent_1 = soup_2.find_all('div', attrs={'class': 'agentName ellipsis'})
        Agent_Name = [Agent_Name.text.strip() for Agent_Name in Agent_1]
        Data.append(Sold_Date + Address + Area + Agency_Name + Agent_Name)

print(Data)
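Since the question is tagged python-3.x, an f-string (Python 3.6+) is a less error-prone alternative to manual concatenation, which needs an explicit str(i) conversion. A minimal sketch building the same URL list:

# Same URL list built with an f-string instead of concatenation;
# the path and query string are unchanged from the answers above.
base = ("http://www.realcommercial.com.au/sold/property-offices-retail-"
        "showrooms+bulky+goods-land+development-hotel+leisure-medical+"
        "consulting-other-in-vic")
urls = [
    f"{base}/list-{i}?includePropertiesWithin=includesurrounding"
    "&activeSort=list-date&autoSuggest=true"
    for i in range(1, 6)
]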

Answer 2: (score: 1)

You can tell BeautifulSoup to only give you links that contain an href, to make your code safer. Also, instead of modifying your URL to include the page number, you can extract the "next >" link at the bottom of each page. This will also stop automatically when the final page is reached:

import requests
from bs4 import BeautifulSoup

base_url = r"http://www.realcommercial.com.au"
url = base_url + "/sold/property-offices-retail-showrooms+bulky+goods-land+development-hotel+leisure-medical+consulting-other-in-vic/list-1?includePropertiesWithin=includesurrounding&activeSort=list-date&autoSuggest=true"
data = []

for _ in range(10):
    print(url)
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    hrefs = [link['href'] for link in soup.find_all('a', attrs={'class': 'details-panel'}, href=True)]

    for href in hrefs:
        pages = requests.get(href)
        soup_2 = BeautifulSoup(pages.content, 'html.parser')
        Address_1 = soup_2.find_all('p', attrs={'class': 'full-address'})
        Address = [Address.text.strip() for Address in Address_1]
        Date = soup_2.find_all('li', attrs={'class': 'sold-date'})
        Sold_Date = [Sold_Date.text.strip() for Sold_Date in Date]
        Area_1 = soup_2.find_all('ul', attrs={'class': 'summaryList'})
        Area = [Area.text.strip() for Area in Area_1]
        Agency_1 = soup_2.find_all('div', attrs={'class': 'agencyName ellipsis'})
        Agency_Name = [Agency_Name.text.strip() for Agency_Name in Agency_1]
        Agent_1 = soup_2.find_all('div', attrs={'class': 'agentName ellipsis'})
        Agent_Name = [Agent_Name.text.strip() for Agent_Name in Agent_1]

        data.append(Sold_Date + Address + Area + Agency_Name + Agent_Name)

    # Find next page (if any)
    next_button = soup.find('li', class_='rui-pagination-next')

    if next_button:
        url = base_url + next_button.parent['href']
    else:
        break

for entry in data:
    print(entry)
    print("---------")