I wrote a crawler to scrape Foursquare. The problem is that it only returns 30 restaurant names and cannot read the paginated data.
I tried plain scraping first and then Selenium as well, but in a headless browser it doesn't work and the click sometimes fails. How can I read all 90 names on this page, preferably without Selenium?
import scrapy
from time import sleep
from selenium import webdriver
from scrapy.selector import Selector
from Crawlers import settings
from selenium.webdriver.common.by import By


class Foursquare(scrapy.Spider):
    name = "learningexercise5"
    fs_base_url = 'https://foursquare.com'
    start_urls = ['https://foursquare.com/']
    chromedriver_path = settings.chromedriver_path

    def parse(self, response):
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        driver = webdriver.Chrome(self.chromedriver_path, chrome_options=options)
        urls = ['https://foursquare.com/explore?cat=food&mode=url&near=New%20York%2C%20NY%2C%20United%20States&nearGeoId=72057594043056517']
        for url in urls:
            driver.get(url)
            sleep(30)
            x = driver.find_element(By.XPATH, '//button[@class="blueButton"]')
            driver.execute_script("arguments[0].click();", x)
            data = driver.page_source
            scrapy_html_response = Selector(text=data)
            data_row = scrapy_html_response.xpath(
                '//div[@id="results"]/ul/li/div[@class="contentHolder"]/div[@class="infoCol"]/div[@class="venueBlock"]/div[@class="venueDetails"]')
            for row in data_row:
                name = row.css('.venueName').xpath('h2/a/text()').get()
                location = row.css('.venueAddress::text').get()
                print(name, location)