处理分页BeautifulSoup -Selenium

时间:2015-11-27 14:42:17

标签: python selenium beautifulsoup

我一直在努力奋斗一个多星期了。我正在尝试学习Python,并在同一时间构建一些对我有用的东西 - 这有助于我找到一个新的出租房屋。

我让所有代码按照我想要的方式工作 - 除了我无法获得所有550个属性,我只能获得第1页的前25个。我尝试了几种方法,但似乎没有任何工作。

如果我使用urlopen并获取主网址+页码“2_p /”的正则表达式,例如我得到一个urlerror未知网址“h”。

如果我使用webdriver- firefox尝试查找www.h.com。我真的可以用一些帮助。附上是我的代码 - 抱歉它有点混乱,可能有点长 - 我只是在学习,所以不要残忍。

from urllib.request import urlopen
from selenium import webdriver
from bs4 import BeautifulSoup
import csv
import datetime
from datetime import timedelta
import time
import re

pages1 =  set()
next_page = ()
csv_output = [ ]


def getLinks(url):
    global pages


    driver = webdriver.Firefox()
    driver.get(url)
    time.sleep(3)
    pageSource = driver.page_source
    bsObj = BeautifulSoup(pageSource)
    for addr_link in bsObj.findAll("a", href=re.compile("^/homedetails/*")):
        if 'href' in addr_link.attrs:
            if addr_link['href'] not in pages1:
                newPage = addr_link.attrs['href']
                pages1.add(newPage)
                #print(newPage)
print(len(pages1))
for link in pages1:
    property_url = link
    getData(property_url)



def getData(url):

    base_url = 'http://www.zillow.com'
    final_url = base_url+url

    html = urlopen(final_url)

    bsObj = BeautifulSoup(html)


try:
# Gets Property Address
    address = bsObj.find("header", {"class":"zsg-content-header addr"}).find("h1")
    s_address = address.get_text()
    print(address)
# Gets number of bedsrooms 
    beds = bsObj.find("span", {"class":"addr_bbs"})
    s_beds = beds.get_text()

# Gets number of bathsrooms 
    baths = beds.find_next("span", {"class":"addr_bbs"})
    s_baths = baths.get_text()

# Gets sqft
    sqft = baths.find_next("span", {"class":"addr_bbs"})
    s_sqft = sqft.get_text()

# Gets rent_amount 
    rent_amount = bsObj.find("div", {"class":"main-row home-summary-row"}).span
    s_rent_amount = rent_amount.get_text()

# Gets rent_zestiment

    zestiment_holder = bsObj.find("span", {"class":"zsg-tooltip-launch zsg-tooltip-launch_keyword"})
    rent_zestiment = zestiment_holder.find_next("span")
    s_rent_zestiment = rent_zestiment.get_text()


# Gets Date posted on Zillow
    for time_posted in bsObj(text=re.compile("Posted")):
        posted = time_posted.parent.get_text()
        if 'hours' in posted:
            date_on_zillow = datetime.date.today()
            posted_date = date_on_zillow

    else:
        days_subtracted = int(re.search(r'\d+', posted).group())
        posted_date = datetime.date.today()-datetime.timedelta(days=+days_subtracted)

# Gets Property subdivision 
    subdivision = bsObj.find(id="hdp-neighborhood").h2
    s_subdivision = subdivision.get_text()

# Gets property_manager_name 
    property_manager_name = bsObj.find("span", {"class":"snl company-name"})
    s_property_manager_name = property_manager_name.get_text()

# Gets property_manager_phone
    property_manager_phone = bsObj.find("span", {"class":"snl phone"})
    s_property_manager_phone = property_manager_phone.get_text()

# Gets disc_of_property 
    disc_of_property =(bsObj.find('div', {'class': "notranslate"}).text)
    s_disc_of_property = disc_of_property.encode("utf-8")

# Gets url_of_listing so I can see Photos if interested
    main_url = 'http://www.zillow.com'
    url2 = url
    listing_url = main_url+url2
    s_listing_url = listing_url

except AttributeError as e:
    return None

csv_data = [s_address, s_beds, s_baths, s_sqft, s_rent_amount, s_rent_zestiment, posted_date, s_subdivision, s_property_manager_name, s_property_manager_phone, s_disc_of_property, s_listing_url]


csv_output.append(csv_data)


resultFile = open("output.csv",'w+')
wr = csv.writer(resultFile)
wr.writerows(csv_output)
resultFile.close()

header = ['Address', 'Beds', 'Baths', 'Sqft', 'Rent Amount', 'rent Zestiment', 'Posted Date', 'Subdivision', 'Property Manager Name', 'Property Manager Phone', 'Disc of Property', 'URL']
csv_output.append(header)

getLinks("http://www.zillow.com/homes/for_rent/Jackson-County-MO/house,mobile_type/1804_rid/6m_days/39.371994,-93.635788,38.697836,-95.077744_rect/9_zm/")  

编辑: 被评论出来的'while'是我最后一次处理分页的尝试。

0 个答案:

没有答案