I was just wondering whether there is a way to prevent Scrapy from crawling the start_urls, and to allow or force Selenium to crawl them instead.

My script currently looks like this:
import time

from scrapy.spiders import CrawlSpider
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


class IndiegogoSpider(CrawlSpider):
    name = 'indiegogo3'
    allowed_domains = ['indiegogo.com']
    start_urls = ['https://www.indiegogo.com/explore/all?project_type=all&project_timing=all&sort=trending']

    def parse(self, response):
        def get_time():
            return time.strftime('%I:%M:%S %p')

        if response.status != 404:
            options = Options()
            options.add_argument('-headless')
            browser = webdriver.Firefox(firefox_options=options)
            browser.get(self.start_urls[0])

            # keep clicking "show more" until it can no longer be clicked
            show_more = WebDriverWait(browser, 3).until(
                EC.element_to_be_clickable((By.XPATH, '//div[@class="text-center"]/a'))
            )
            while True:
                try:
                    show_more.click()
                except Exception:
                    break

            # wait for the cards to render, then collect the campaign links
            wait = WebDriverWait(browser, 10).until(
                EC.visibility_of_all_elements_located((By.XPATH, '//discoverable-card'))
            )
            hrefs = WebDriverWait(browser, 60).until(
                EC.visibility_of_all_elements_located((By.XPATH, '//div[@class="discoverableCard"]/a'))
            )

            for href in hrefs:
                browser.get(href.get_attribute('href'))

                ###############################
                #                             #
                #   scrape individual pages   #
                #                             #
                ###############################

            browser.close()
What I was hoping to do is something more like this:
class IndiegogoSpider(CrawlSpider):
    name = 'indiegogo'
    allowed_domains = ['indiegogo.com']
    start_urls = get_links()

    def get_links():
        options = Options()
        options.add_argument('-headless')
        browser = webdriver.Firefox(firefox_options=options)
        browser.get('https://www.indiegogo.com/explore/all?project_type=all&project_timing=all&sort=trending')

        # keep clicking "show more" until it can no longer be clicked
        show_more = WebDriverWait(browser, 60).until(
            EC.element_to_be_clickable((By.XPATH, '//div[@class="text-center"]/a'))
        )
        while True:
            try:
                show_more.click()
            except Exception:
                break

        # collect the campaign links once everything has loaded
        hrefs = WebDriverWait(browser, 60).until(
            EC.visibility_of_all_elements_located((By.XPATH, '//div[@class="discoverableCard"]/a'))
        )
        campaigns = []
        for href in hrefs:
            campaigns.append(href.get_attribute('href'))

        browser.close()
        return campaigns

    ############################################
    #                                          #
    #   use Selenium to retrieve responses     #
    #   instead of Scrapy for start_requests   #
    #                                          #
    ############################################
    def start_requests(self):
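
For reference, here is a minimal sketch of how this second version could be wired up so that it at least runs; it assumes get_links() is moved to module level (so it can be called while the class body is being evaluated) and that start_requests() simply yields one Request per collected link. Note that with this layout Scrapy, not Selenium, would still download the campaign pages:

import scrapy
from scrapy.spiders import CrawlSpider
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


def get_links():
    """Collect campaign URLs with headless Firefox before the spider starts."""
    options = Options()
    options.add_argument('-headless')
    browser = webdriver.Firefox(options=options)
    browser.get('https://www.indiegogo.com/explore/all?project_type=all&project_timing=all&sort=trending')
    hrefs = WebDriverWait(browser, 60).until(
        EC.visibility_of_all_elements_located((By.XPATH, '//div[@class="discoverableCard"]/a'))
    )
    campaigns = [href.get_attribute('href') for href in hrefs]
    browser.close()
    return campaigns


class IndiegogoSpider(CrawlSpider):
    name = 'indiegogo'
    allowed_domains = ['indiegogo.com']
    start_urls = get_links()

    def start_requests(self):
        # Scrapy still fetches these URLs; Selenium was only used to discover them.
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        # scrape individual pages here
        pass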
In other words, is there a way to force the spider to use the Selenium WebDriver to fetch the responses for the start_urls, instead of letting Scrapy handle those responses automatically through its start_requests function? Or is it better to keep everything together in the parse function, as in the first version, to achieve this?
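
If the goal is specifically to have Selenium fetch the responses rather than Scrapy's downloader, one common pattern (roughly what the scrapy-selenium package bundles up) is a downloader middleware whose process_request() drives the browser and returns an HtmlResponse built from the rendered page source. Returning a response from process_request() short-circuits Scrapy's own download, so parse() receives the Selenium-rendered page. A rough sketch, where the myproject module name and the middleware priority are placeholders:

from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.webdriver.firefox.options import Options


class SeleniumDownloaderMiddleware:
    """Fetch every request with headless Firefox instead of Scrapy's downloader."""

    def __init__(self):
        options = Options()
        options.add_argument('-headless')
        self.browser = webdriver.Firefox(options=options)

    def process_request(self, request, spider):
        # Hand the rendered HTML back to Scrapy; its downloader never
        # touches this request because a Response is returned here.
        self.browser.get(request.url)
        return HtmlResponse(
            url=self.browser.current_url,
            body=self.browser.page_source,
            encoding='utf-8',
            request=request,
        )


# settings.py
# DOWNLOADER_MIDDLEWARES = {
#     'myproject.middlewares.SeleniumDownloaderMiddleware': 543,
# }

With something like this enabled, the spider can keep a plain start_urls list and do the "show more" clicking either in the middleware or in parse(); the alternative of doing everything inside parse(), as in the first version, also works but mixes the crawling and rendering logic in one place.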