在此网站上的一些同事的帮助下,我已经对房地产数据进行了网络抓取。
它工作正常,但是在爬到第6/7页或其他页面后,弹出了一个典型的cookie警告cookie,似乎破坏了我在CSV文件中的输出。
有没有办法处理弹出窗口?
from selenium import webdriver
from bs4 import BeautifulSoup
import re
import time
import requests
import pandas as pd
#open('output.csv', 'w').close()
browser = webdriver.Chrome('C:/Users/907133/Pythonstuff/chromedriver')
browser.set_window_position(0,0)
def jaap_spider(max_pages):
page = 1
while page <= max_pages:
url = 'https://www.jaap.nl/koophuizen/noord+holland/groot-amsterdam/amsterdam/p{}'.format(page)
#browser.delete_all_cookies()
browser.get(url)
#session = requests.Session()
#res1 = session.post(url, post_data)
#res2 = session.get(url1)
time.sleep(15)
#input('Press Enter after bypassing Captcha')
soup = BeautifulSoup(browser.page_source, 'html.parser')
info = soup.find_all('div', {'class':'property-info'})
inside = soup.find_all('a', {'class': 'property-inner'},{'href'})
# Make empty lists with header lines
outputlist_l1 = [['street', 'address', 'price', 'pricetag']]
outputlist_l2 = [['soort', 'bouwjaar', 'woonoppervlakte', 'inhoud', 'perceel']]
for huis in info:
street = huis.find('h2')
street = ' '.join(street.get_text(separator='\r\n', strip=True).split()[:+3])
address = huis.find('div')
address = address.find('div').text.strip()
price = huis.find('div', {'class': 'price-info'})
price = price.find('div').text.strip()
price = re.findall(r'\d', price)
price = ''.join(price)
pricetag = huis.find('div', {'class': 'property-price'})
pricetag = pricetag.find('span').text.strip()
outputlist_l1.append([street, address, price, pricetag])
for items in inside:
#browser.delete_all_cookies()
href = items.get('href')
url1 = href.format(page)
browser.get(url1)
kenmerken = BeautifulSoup(browser.page_source, 'html.parser')
details = kenmerken.find_all ('div', {'class':'detail-tab-content kenmerken'})
try:
tr = details[0].find_all ('td', {'class': 'value'})
except IndexError:
size_space = 'Unknown'
for inhoud in tr:
soort = tr[0].get_text(separator='\n', strip=True)
bouwjaar = tr[1].get_text(separator='\n', strip=True)
woonoppervlakte = tr[2].get_text(separator='\n', strip=True)
inhoud = tr[3].get_text(separator='\n', strip=True)
perceel = tr[4].get_text(separator='\n', strip=True)
l2 = ('{},{},{},{},{}'.format(soort, bouwjaar, woonoppervlakte, inhoud, perceel))
outputlist_l2.append([soort, bouwjaar, woonoppervlakte, inhoud, perceel])
page += 1
# Merge outputlist_l1 with outputlist_l2
outputlist = [a + b for a, b in zip(outputlist_l1, outputlist_l2)]
# transform to Pandas dataframe and export as csv
#saveFile = open('output.csv', 'a')
df = pd.DataFrame(outputlist[1:], columns=outputlist[0])
df.to_csv('output.csv', index=False)
#saveFile.close()
jaap_spider(15)
网站中的cookie脚本:
(function(){function g(a){return {get:function(b){var c = JSON.parse(a.getItem(b)); return!c || Date.parse(c.expires )<=(新日期).getTime()?(a.removeItem(b),空):c.value},set:function(b,c,d){c = {value:c,expires:d。 toUTCString()}; a.setItem(b,JSON.stringify(c))},删除:function(b){a.removeItem(b)}}}函数d(a,b,c,d){this。 parseCommand = function(e,g){function h(){var a = JSON.stringify({messageId:k,value:l ||!1}); window.parent.postMessage(a,“ ”) } var m = q [a],n = e.action,p = e.key,k = e.messageId,f = e.siteId,f = d?p:p +“:” + f,l = e.value,r = e.expiresMinutes || 1440 (e.expiresDays || 365),s = function(){var a = new Date; a.setTime(a.getTime() + 6E4 * r);返回a}(); if(!function(){var a = {_ hjSet:c,_hjGet:b,_hjRemove:c} [n] || [];返回0 <= a.indexOf (“ ”)|| 0 <= a.indexOf(g)}())抛出错误(“键上不允许使用命令“ + n +”:“ + p); switch(n){case” _hjSet“ :m.set(f,l,s); break; case“ _hjGet”:l = m.get(f); h(); break; case“ _hjRemove”:m.remove(f)}}}}函数h (a){try {var b = JSON.parse(a.data); b.key && k [b.key] && k [b.key] .parseCommand(b,a.origin)} catch(c){return null} } var q; try {q = {cookie:{get:function(a){return(a = RegExp(“(?:^ |;)” + a +“ =([^;] )”)。 exec(document.cookie))?a [1]:void 0},set:function(a,b,c){document.cookie = a +“ =” + b +“; path = /; expires =” + c。 toUTCString()},删除:function(a){document.cookie = a +“ =; expires = Tue,1979年3月13日,世界标准时间; path = /;”}},localStorage:g(localStorage),sessionStorage :g(sessionStorage)}} catch(t){return} var k = {_ hjOptOut:new d(“ cookie”,[“ ”],[“ https://www.hotjar.com”,“ https://local.hotjar.com “,” http://local.hotjar.com“,” https://insights-staging.hotjar.com“, “ http://insights-staging.hotjar.com”!,!0),grant_consent:new d(“ cookie”,[“ ”],[“ ”],!! 1),screenshot_retake:new d(“ localStorage “,[” “],[” “] ,! 1),screenshot_active_retake:new d(” sessionStorage“,[” “],[” *“] ,! 1) }; window.addEventListener?window.addEventListener(“ message”,h,!1):window.attachEvent(“ onmessage”,h)})();
答案 0 :(得分:1)
要解决弹出问题,只需在加载页面后检查是否有可用的弹出窗口。如果是,则单击该按钮。希望获得帮助。
page = 1
while page <= max_pages:
url = 'https://www.jaap.nl/koophuizen/noord+holland/groot-amsterdam/amsterdam/p{}'.format(page)
browser.get(url)
time.sleep(10)
#Check here if there popup available
if len(browser.find_elements_by_xpath("//a[@class='CookiesOK']"))>0:
browser.find_element_by_xpath("//a[@class='CookiesOK']").click()
time.sleep(5)
#input('Press Enter after bypassing Captcha')
soup = BeautifulSoup(browser.page_source, 'html.parser')
info = soup.find_all('div', {'class':'property-info'})
inside = soup.find_all('a', {'class': 'property-inner'},{'href'})