Question

在此网站上的一些同事的帮助下，我已经对房地产数据进行了网络抓取。

它工作正常，但是在爬到第6/7页或其他页面后，弹出了一个典型的cookie警告cookie，似乎破坏了我在CSV文件中的输出。

有没有办法处理弹出窗口？

from selenium import webdriver
from bs4 import BeautifulSoup
import re
import time
import requests
import pandas as pd
#open('output.csv', 'w').close()

browser = webdriver.Chrome('C:/Users/907133/Pythonstuff/chromedriver')
browser.set_window_position(0,0)

def jaap_spider(max_pages):
    page = 1
    while page <= max_pages:
        url = 'https://www.jaap.nl/koophuizen/noord+holland/groot-amsterdam/amsterdam/p{}'.format(page)
        #browser.delete_all_cookies()
        browser.get(url)
        #session = requests.Session()
        #res1 = session.post(url, post_data)
        #res2 = session.get(url1)
        time.sleep(15)
        #input('Press Enter after bypassing Captcha')
        soup = BeautifulSoup(browser.page_source, 'html.parser')
        info = soup.find_all('div', {'class':'property-info'})
        inside = soup.find_all('a', {'class': 'property-inner'},{'href'})
    # Make empty lists with header lines
    outputlist_l1 = [['street', 'address', 'price', 'pricetag']]
    outputlist_l2 = [['soort', 'bouwjaar', 'woonoppervlakte', 'inhoud', 'perceel']]

    for huis in info:
        street = huis.find('h2')
        street = ' '.join(street.get_text(separator='\r\n', strip=True).split()[:+3])  
        address = huis.find('div')
        address = address.find('div').text.strip()
        price = huis.find('div', {'class': 'price-info'})
        price = price.find('div').text.strip()
        price = re.findall(r'\d', price)
        price = ''.join(price)
        pricetag = huis.find('div', {'class': 'property-price'})
        pricetag = pricetag.find('span').text.strip()
        outputlist_l1.append([street, address, price, pricetag])

    for items in inside:
        #browser.delete_all_cookies()
        href = items.get('href')
        url1 = href.format(page)
        browser.get(url1)
        kenmerken = BeautifulSoup(browser.page_source, 'html.parser')
        details = kenmerken.find_all ('div', {'class':'detail-tab-content kenmerken'})
        try:
            tr = details[0].find_all ('td', {'class': 'value'})
        except IndexError:
            size_space = 'Unknown'

        for inhoud in tr:
            soort = tr[0].get_text(separator='\n', strip=True)
            bouwjaar = tr[1].get_text(separator='\n', strip=True)
            woonoppervlakte = tr[2].get_text(separator='\n', strip=True)
            inhoud = tr[3].get_text(separator='\n', strip=True)
            perceel = tr[4].get_text(separator='\n', strip=True)
        l2 = ('{},{},{},{},{}'.format(soort, bouwjaar, woonoppervlakte, inhoud, perceel))
        outputlist_l2.append([soort, bouwjaar, woonoppervlakte, inhoud, perceel])

    page += 1    
# Merge outputlist_l1 with outputlist_l2
outputlist = [a + b for a, b in zip(outputlist_l1, outputlist_l2)]

# transform to Pandas dataframe and export as csv
#saveFile = open('output.csv', 'a')
df = pd.DataFrame(outputlist[1:], columns=outputlist[0])
df.to_csv('output.csv', index=False)
#saveFile.close()

jaap_spider(15)

网站中的cookie脚本：

（function（）{function g（a）{return {get：function（b）{var c = JSON.parse（a.getItem（b））; return！c || Date.parse（c.expires ）<=（新日期）.getTime（）？（a.removeItem（b），空）：c.value}，set：function（b，c，d）{c = {value：c，expires：d。 toUTCString（）}; a.setItem（b，JSON.stringify（c））}，删除：function（b）{a.removeItem（b）}}}函数d（a，b，c，d）{this。 parseCommand = function（e，g）{function h（）{var a = JSON.stringify（{messageId：k，value：l ||！1}）; window.parent.postMessage（a，“ ”） } var m = q [a]，n = e.action，p = e.key，k = e.messageId，f = e.siteId，f = d？p：p +“：” + f，l = e.value，r = e.expiresMinutes || 1440 （e.expiresDays || 365），s = function（）{var a = new Date; a.setTime（a.getTime（） + 6E4 * r）;返回a}（）; if（！function（）{var a = {_ hjSet：c，_hjGet：b，_hjRemove：c} [n] || [];返回0 <= a.indexOf （“ ”）|| 0 <= a.indexOf（g）}（））抛出错误（“键上不允许使用命令“ + n +”：“ + p）; switch（n）{case” _hjSet“ ：m.set（f，l，s）; break; case“ _hjGet”：l = m.get（f）; h（）; break; case“ _hjRemove”：m.remove（f）}}}}函数h （a）{try {var b = JSON.parse（a.data）; b.key && k [b.key] && k [b.key] .parseCommand（b，a.origin）} catch（c）{return null} } var q; try {q = {cookie：{get：function（a）{return（a = RegExp（“（？：^ |;）” + a +“ =（[^;] ）”）。 exec（document.cookie））？a [1]：void 0}，set：function（a，b，c）{document.cookie = a +“ =” + b +“; path = /; expires =” + c。 toUTCString（）}，删除：function（a）{document.cookie = a +“ =; expires = Tue，1979年3月13日，世界标准时间； path = /;”}}，localStorage：g（localStorage），sessionStorage ：g（sessionStorage）}} catch（t）{return} var k = {_ hjOptOut：new d（“ cookie”，[“ ”]，[“ https://www.hotjar.com”，“ https://local.hotjar.com “，” http://local.hotjar.com“，” https://insights-staging.hotjar.com“， “ http://insights-staging.hotjar.com”！，！0），grant_consent：new d（“ cookie”，[“ ”]，[“ ”]，!! 1），screenshot_retake：new d（“ localStorage “，[” “]，[” “] ,! 1），screenshot_active_retake：new d（” sessionStorage“，[” “]，[” *“] ,! 1） }; window.addEventListener？window.addEventListener（“ message”，h，！1）：window.attachEvent（“ onmessage”，h）}）（）;

Answer 1

要解决弹出问题，只需在加载页面后检查是否有可用的弹出窗口。如果是，则单击该按钮。希望获得帮助。

page = 1
while page <= max_pages:
  url = 'https://www.jaap.nl/koophuizen/noord+holland/groot-amsterdam/amsterdam/p{}'.format(page)
  browser.get(url)
  time.sleep(10)
  #Check here if there popup available
  if len(browser.find_elements_by_xpath("//a[@class='CookiesOK']"))>0:
        browser.find_element_by_xpath("//a[@class='CookiesOK']").click()
        time.sleep(5)
    #input('Press Enter after bypassing Captcha')
  soup = BeautifulSoup(browser.page_source, 'html.parser')
  info = soup.find_all('div', {'class':'property-info'})
  inside = soup.find_all('a', {'class': 'property-inner'},{'href'})

第6/7页之后处理Cookie弹出窗口

1 个答案: