我试图将所有房屋的页面拉出租 - 我的一切工作都相当不错(我是初学者 - 不到一个月的编码)但我的最终csv输出是我期待的一半 - 我检查了Pages1的长度,它说我有84个链接,但我的最后一次迭代只给了我45个处理过的链接。任何帮助谢谢。
from urllib.request import urlopen
from selenium import webdriver
from bs4 import BeautifulSoup
import csv
import datetime
from datetime import timedelta
import time
import re
pages1 = set()
next_page = set()
csv_output = [ ]
def getLinks(url):
global pages
global next_pages
counter = 4
driver = webdriver.Firefox()
driver.get(url)
time.sleep(3)
pageSource = driver.page_source
bsObj = BeautifulSoup(pageSource)
# first list len check == 0 in shell
print(len(pages1))
while counter > 0:
base_url1 = url
url_array = str(counter)+'_p/'
final_url1 = base_url1+url_array
next_page.add(final_url1)
counter -= 1
print(next_page)
for page in next_page:
folling_page = page
print(folling_page+"test")
html = urlopen(folling_page)
bsObj = BeautifulSoup(html)
for addr_link in bsObj.findAll("a", href=re.compile("^/homedetails/*")):
if 'href' in addr_link.attrs:
if addr_link['href'] not in pages1:
newPage = addr_link.attrs['href']
print(newPage)
pages1.add(newPage)
# The follow print len shows that I have 84 links in pages1.
print(len(pages1))
#The follow for loop only send 43 links through getData(link)
for link in pages1:
getData(link)
def getData(link):
base_url = 'http://www.zillow.com'
final_url = base_url+link
html = urlopen(final_url)
bsObj = BeautifulSoup(html)
try:
# Address
address = bsObj.find("header", {"class":"zsg-content-header addr"}).find("h1")
s_address = address.get_text()
# beds
beds = bsObj.find("span", {"class":"addr_bbs"})
s_beds = beds.get_text()
# baths
baths = beds.find_next("span", {"class":"addr_bbs"})
s_baths = baths.get_text()
# sqft
sqft = baths.find_next("span", {"class":"addr_bbs"})
s_sqft = sqft.get_text()
# rent_amount
rent_amount = bsObj.find("div", {"class":"main-row home-summary-row"}).span
s_rent_amount = rent_amount.get_text()
# rent_zestiment
zestiment_holder = bsObj.find("span", {"class":"zsg-tooltip-launch zsg-tooltip-launch_keyword"})
rent_zestiment = zestiment_holder.find_next("span")
s_rent_zestiment = rent_zestiment.get_text()
# Date on Zillow
for time_posted in bsObj(text=re.compile("Posted")):
posted = time_posted.parent.get_text()
if 'hours' in posted:
date_on_zillow = datetime.date.today()
posted_date = date_on_zillow
else:
days_subtracted = int(re.search(r'\d+', posted).group())
posted_date = datetime.date.today()-datetime.timedelta(days=+days_subtracted)
# subdivision
subdivision = bsObj.find(id="hdp-neighborhood").h2
s_subdivision = subdivision.get_text()
# property_manager_name
property_manager_name = bsObj.find("span", {"class":"snl company-name"})
s_property_manager_name = property_manager_name.get_text()
# property_manager_phone
property_manager_phone = bsObj.find("span", {"class":"snl phone"})
s_property_manager_phone = property_manager_phone.get_text()
# disc_of_property
disc_of_property =(bsObj.find('div', {'class': "notranslate"}).text)
s_disc_of_property = disc_of_property.encode("utf-8")
# url_of_listing
main_url = 'http://www.zillow.com'
url2 = link
listing_url = main_url+url2
s_listing_url = listing_url
except AttributeError as e:
return None
csv_data = [s_address, s_beds, s_baths, s_sqft, s_rent_amount, s_rent_zestiment, posted_date, s_subdivision, s_property_manager_name, s_property_manager_phone, s_disc_of_property, s_listing_url]
csv_output.append(csv_data)
resultFile = open("output.csv",'w+')
wr = csv.writer(resultFile)
wr.writerows(csv_output)
resultFile.close()
header = ['Address', 'Beds', 'Baths', 'Sqft', 'Rent Amount', 'rent Zestiment', 'Posted Date', 'Subdivision', 'Property Manager Name', 'Property Manager Phone', 'Disc of Property', 'URL']
csv_output.append(header)
getLinks("http://www.zillow.com/homes/for_rent/Clay-County-MO/house,mobile_type/126_rid/6m_days/39.617854,-93.687974,38.946059,-95.129929_rect/9_zm/")