我正在学习 Python，并试图抓取一个网站，每个页面上有 10 条房产（property）列表。我想从每个页面的每条列表中提取信息。前 5 页的代码如下：
import requests
from bs4 import BeautifulSoup
urls = []
for i in range(1,5):
pages = "http://www.realcommercial.com.au/sold/property-offices-retail-showrooms+bulky+goods-land+development-hotel+leisure-medical+consulting-other-in-vic/list-{0}?includePropertiesWithin=includesurrounding&activeSort=list-date&autoSuggest=true".format(i)
urls.append(pages)
for info in urls:
page = requests.get(info)
soup = BeautifulSoup(page.content, 'html.parser')
links = soup.find_all('a', attrs ={'class' :'details-panel'})
hrefs = [link['href'] for link in links]
Data = []
for urls in hrefs:
pages = requests.get(urls)
soup_2 =BeautifulSoup(pages.content, 'html.parser')
Address_1 = soup_2.find_all('p', attrs={'class' :'full-address'})
Address = [Address.text.strip() for Address in Address_1]
Date = soup_2.find_all('li', attrs ={'class' :'sold-date'})
Sold_Date = [Sold_Date.text.strip() for Sold_Date in Date]
Area_1 =soup_2.find_all('ul', attrs={'class' :'summaryList'})
Area = [Area.text.strip() for Area in Area_1]
Agency_1=soup_2.find_all('div', attrs={'class' :'agencyName ellipsis'})
Agency_Name=[Agency_Name.text.strip() for Agency_Name in Agency_1]
Agent_1=soup_2.find_all('div', attrs={'class' :'agentName ellipsis'})
Agent_Name=[Agent_Name.text.strip() for Agent_Name in Agent_1]
Data.append(Sold_Date+Address+Area+Agency_Name+Agent_Name)
以上代码对我不起作用。请让我知道正确的编码以达到目的。
答案 0（得分：2）
您的代码中有一个问题是您将变量“urls”声明了两次。您需要更新以下代码:
import requests
from bs4 import BeautifulSoup
urls = []
for i in range(1,6):
pages = "http://www.realcommercial.com.au/sold/property-offices-retail-showrooms+bulky+goods-land+development-hotel+leisure-medical+consulting-other-in-vic/list-{0}?includePropertiesWithin=includesurrounding&activeSort=list-date&autoSuggest=true".format(i)
urls.append(pages)
Data = []
for info in urls:
page = requests.get(info)
soup = BeautifulSoup(page.content, 'html.parser')
links = soup.find_all('a', attrs ={'class' :'details-panel'})
hrefs = [link['href'] for link in links]
for href in hrefs:
pages = requests.get(href)
soup_2 =BeautifulSoup(pages.content, 'html.parser')
Address_1 = soup_2.find_all('p', attrs={'class' :'full-address'})
Address = [Address.text.strip() for Address in Address_1]
Date = soup_2.find_all('li', attrs ={'class' :'sold-date'})
Sold_Date = [Sold_Date.text.strip() for Sold_Date in Date]
Area_1 =soup_2.find_all('ul', attrs={'class' :'summaryList'})
Area = [Area.text.strip() for Area in Area_1]
Agency_1=soup_2.find_all('div', attrs={'class' :'agencyName ellipsis'})
Agency_Name=[Agency_Name.text.strip() for Agency_Name in Agency_1]
Agent_1=soup_2.find_all('div', attrs={'class' :'agentName ellipsis'})
Agent_Name=[Agent_Name.text.strip() for Agent_Name in Agent_1]
Data.append(Sold_Date+Address+Area+Agency_Name+Agent_Name)
print Data
答案 1（得分：1）
在代码中使用请求头（headers），并用字符串连接代替 .format(i)（注意：整数 i 必须先用 str(i) 转换才能与字符串拼接）。
代码看起来像这样
import requests
from bs4 import BeautifulSoup
urls = []
for i in range(1,6):
pages = 'http://www.realcommercial.com.au/sold/property-offices-retail-showrooms+bulky+goods-land+development-hotel+leisure-medical+consulting-other-in-vic/list-'i+'?includePropertiesWithin=includesurrounding&activeSort=list-date&autoSuggest=true'
urls.append(pages)
Data = []
for info in urls:
headers = {'User-agent':'Mozilla/5.0'}
page = requests.get(info,headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')
links = soup.find_all('a', attrs ={'class' :'details-panel'})
hrefs = [link['href'] for link in links]
for href in hrefs:
pages = requests.get(href)
soup_2 =BeautifulSoup(pages.content, 'html.parser')
Address_1 = soup_2.find_all('p', attrs={'class' :'full-address'})
Address = [Address.text.strip() for Address in Address_1]
Date = soup_2.find_all('li', attrs ={'class' :'sold-date'})
Sold_Date = [Sold_Date.text.strip() for Sold_Date in Date]
Area_1 =soup_2.find_all('ul', attrs={'class' :'summaryList'})
Area = [Area.text.strip() for Area in Area_1]
Agency_1=soup_2.find_all('div', attrs={'class' :'agencyName ellipsis'})
Agency_Name=[Agency_Name.text.strip() for Agency_Name in Agency_1]
Agent_1=soup_2.find_all('div', attrs={'class' :'agentName ellipsis'})
Agent_Name=[Agent_Name.text.strip() for Agent_Name in Agent_1]
Data.append(Sold_Date+Address+Area+Agency_Name+Agent_Name)
print Data
答案 2（得分：1）
您可以告诉 BeautifulSoup 只返回确实带有 href 属性的链接，使代码更安全。此外，与其在 URL 中手动修改页码，不如从页面底部提取 "next >"（下一页）链接；这样在到达最后一页时还会自动停止：
# Answer 2: follow the "next" pagination link instead of hard-coding page
# numbers; stops automatically on the last results page (safety cap: 10 pages).
import requests
from bs4 import BeautifulSoup

base_url = r"http://www.realcommercial.com.au"
url = base_url + "/sold/property-offices-retail-showrooms+bulky+goods-land+development-hotel+leisure-medical+consulting-other-in-vic/list-1?includePropertiesWithin=includesurrounding&activeSort=list-date&autoSuggest=true"


def _stripped_texts(soup, tag, css_class):
    """Return the whitespace-stripped text of every <tag> with the given class."""
    return [node.text.strip() for node in soup.find_all(tag, attrs={'class': css_class})]


data = []
for _ in range(10):
    print(url)
    listing_page = requests.get(url)
    soup = BeautifulSoup(listing_page.content, 'html.parser')
    # href=True skips any anchor that carries the class but no actual link.
    hrefs = [anchor['href']
             for anchor in soup.find_all('a', attrs={'class': 'details-panel'}, href=True)]
    for href in hrefs:
        detail = BeautifulSoup(requests.get(href).content, 'html.parser')
        address = _stripped_texts(detail, 'p', 'full-address')
        sold_date = _stripped_texts(detail, 'li', 'sold-date')
        area = _stripped_texts(detail, 'ul', 'summaryList')
        agency_name = _stripped_texts(detail, 'div', 'agencyName ellipsis')
        agent_name = _stripped_texts(detail, 'div', 'agentName ellipsis')
        data.append(sold_date + address + area + agency_name + agent_name)
    # Advance to the next results page, if the pager offers one.
    next_button = soup.find('li', class_='rui-pagination-next')
    if next_button:
        url = base_url + next_button.parent['href']
    else:
        break

for entry in data:
    print(entry)
    print("---------")