I am trying to scrape web pages using the following code:
import requests
from bs4 import BeautifulSoup

page = requests.get("http://www.realcommercial.com.au/sold/property-offices-retail-showrooms+bulky+goods-land+development-hotel+leisure-medical+consulting-other-in-vic/list-1?includePropertiesWithin=includesurrounding&activeSort=list-date&autoSuggest=true")
soup = BeautifulSoup(page.content, 'html.parser')
links = soup.find_all('a', attrs={'class': 'details-panel'})
hrefs = [link['href'] for link in links]

for urls in hrefs:
    pages = requests.get(urls)
    soup_2 = BeautifulSoup(pages.content, 'html.parser')
    Date = soup_2.find_all('li', attrs={'class': 'sold-date'})
    Sold_Date = [Sold_Date.text.strip() for Sold_Date in Date]
    Address_1 = soup_2.find_all('p', attrs={'class': 'full-address'})
    Address = [Address.text.strip() for Address in Address_1]
The code above only returns the details for the first URL in hrefs:
['Mon 05-Jun-17'] ['261 Keilor Road, Essendon, Vic 3040']
I need the loop to go through every URL in hrefs and return the same details for each of them. Please suggest what I should add or edit in the code above. Any help would be highly appreciated.

Thanks
Answer 0 (score: 1)
You overwrite the Address and Sold_Date objects on every iteration:
# after each assignment the previous data is lost
Sold_Date = [Sold_Date.text.strip() for Sold_Date in Date]
Address = [Address.text.strip() for Address in Address_1]
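A minimal standalone sketch of the difference (the batch lists are hypothetical stand-ins for the scraped results, not part of the original code):

# Hypothetical illustration: reassignment vs. extending a list
results = []
for batch in (['a'], ['b']):
    results = batch          # overwrites: only the last batch survives
print(results)               # ['b']

results = []
for batch in (['a'], ['b']):
    results += batch         # extends: values accumulate across iterations
print(results)               # ['a', 'b']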
Try initializing empty lists outside the loop and extending them:
import requests
from bs4 import BeautifulSoup

page = requests.get("http://www.realcommercial.com.au/sold/property-offices-retail-showrooms+bulky+goods-land+development-hotel+leisure-medical+consulting-other-in-vic/list-1?includePropertiesWithin=includesurrounding&activeSort=list-date&autoSuggest=true")
soup = BeautifulSoup(page.content, 'html.parser')
links = soup.find_all('a', attrs={'class': 'details-panel'})
hrefs = [link['href'] for link in links]

addresses = []
sold_dates = []
for urls in hrefs:
    pages = requests.get(urls)
    soup_2 = BeautifulSoup(pages.content, 'html.parser')
    dates_tags = soup_2.find_all('li', attrs={'class': 'sold-date'})
    sold_dates += [date_tag.text.strip() for date_tag in dates_tags]
    addresses_tags = soup_2.find_all('p', attrs={'class': 'full-address'})
    addresses += [address_tag.text.strip() for address_tag in addresses_tags]
which gives us:
>>> sold_dates
[u'Tue 06-Jun-17',
u'Tue 06-Jun-17',
u'Tue 06-Jun-17',
u'Tue 06-Jun-17',
u'Tue 06-Jun-17',
u'Tue 06-Jun-17',
u'Tue 06-Jun-17',
u'Mon 05-Jun-17',
u'Mon 05-Jun-17',
u'Mon 05-Jun-17']
>>> addresses
[u'141 Napier Street, Essendon, Vic 3040',
u'5 Loupe Crescent, Leopold, Vic 3224',
u'80 Ryrie Street, Geelong, Vic 3220',
u'18 Boase Street, Brunswick, Vic 3056',
u'130-186 Buckley Street, West Footscray, Vic 3012',
u'223 Park Street, South Melbourne, Vic 3205',
u'48-50 The Centreway, Lara, Vic 3212',
u'14 Webster Street, Ballarat, Vic 3350',
u'323 Nepean Highway, Frankston, Vic 3199',
u'341 Buckley Street, Aberfeldie, Vic 3040']
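If you also want each date paired with its address (assuming the two lists stay parallel, which the output above suggests), a small follow-up sketch:

# Hypothetical follow-up: pair each sold date with its address
# (assumes sold_dates and addresses are parallel lists, as above)
for sold_date, address in zip(sold_dates, addresses):
    print(sold_date, '-', address)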
Answer 1 (score: 1)
The behaviour is correct. You need to store the information in an outer list and then return it.
import requests
from bs4 import BeautifulSoup

# Wrapped in a function so that the final `return Data` is valid Python;
# at module level, `return` raises a SyntaxError.
def get_data():
    page = requests.get("http://www.realcommercial.com.au/sold/property-offices-retail-showrooms+bulky+goods-land+development-hotel+leisure-medical+consulting-other-in-vic/list-1?includePropertiesWithin=includesurrounding&activeSort=list-date&autoSuggest=true")
    soup = BeautifulSoup(page.content, 'html.parser')
    links = soup.find_all('a', attrs={'class': 'details-panel'})
    hrefs = [link['href'] for link in links]

    Data = []
    for urls in hrefs:
        pages = requests.get(urls)
        soup_2 = BeautifulSoup(pages.content, 'html.parser')
        Date = soup_2.find_all('li', attrs={'class': 'sold-date'})
        Sold_Date = [Sold_Date.text.strip() for Sold_Date in Date]
        Address_1 = soup_2.find_all('p', attrs={'class': 'full-address'})
        Address = [Address.text.strip() for Address in Address_1]
        Data.append(Sold_Date + Address)
    return Data
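A short usage sketch; the timeout and raise_for_status() hardening is an assumption layered on top of the answer's code, using standard requests API:

# Usage: collect all rows and print them
data = get_data()
for row in data:
    print(row)  # e.g. ['Mon 05-Jun-17', '261 Keilor Road, Essendon, Vic 3040']

# Hypothetical hardening of the per-URL fetch (not in the original answer):
#     pages = requests.get(urls, timeout=10)  # avoid hanging on a slow server
#     pages.raise_for_status()                # surface 4xx/5xx errors early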