我目前正在使用Scrapy和Selenium的组合来快速搜索USPTO TradeMark数据库。这些页面附加了会话令牌。
我读到过的各种方案似乎集成得都不够紧密——也就是说,虽然Selenium可以把找到的网址传递给Scrapy,但Scrapy会向该网页重新发出请求,从而使会话令牌失效。所以我需要让Selenium把HTML直接交给Scrapy进行解析。这可能吗?
# -*- coding: utf-8 -*-
# from terminal run: scrapy crawl trademarks -o items.csv -t csv
import time
import scrapy
from scrapy.http import Request
from scrapy.item import Item, Field
from scrapy.selector import Selector
from scrapy.spiders import CrawlSpider
from selenium import webdriver
class TrademarkscrapeItem(Item):
    """Scrapy item declaring the fields collected for a trademark entry.

    Declared fields: ``category``, ``wordmark``, ``registrant``,
    ``registration_date`` and ``description``.
    """

    category = scrapy.Field()
    wordmark = scrapy.Field()
    registrant = scrapy.Field()
    registration_date = scrapy.Field()
    description = scrapy.Field()
class TradeMarkSpider(CrawlSpider):
    """Spider that navigates the USPTO site with Selenium and parses the result.

    The target pages carry a session token, so re-requesting a URL through
    Scrapy would invalidate it. Instead, a Firefox WebDriver performs the
    navigation and the HTML it has already loaded (``driver.page_source``)
    is handed to a Scrapy ``Selector`` for XPath extraction.
    """

    name = "trademarks"
    allowed_domains = ["uspto.gov"]
    start_urls = ['http://www.uspto.gov']

    # Seconds to pause after each click so in-page JavaScript can render.
    render_delay = 2

    def __init__(self, *args, **kwargs):
        """Initialise the base spider, then start the Firefox WebDriver.

        Scrapy builds spiders with ``cls(*args, **kwargs)``, so the arguments
        are accepted and forwarded rather than dropped.
        """
        super(TradeMarkSpider, self).__init__(*args, **kwargs)
        self.driver = webdriver.Firefox()

    def parse_start_url(self, response, **kwargs):
        """Route start-URL responses to :meth:`parse`.

        ``CrawlSpider`` documents ``parse_start_url`` as the hook for
        responses to ``start_urls`` and warns against relying on an
        overridden ``parse``; delegating here keeps both paths equivalent.
        """
        return self.parse(response)

    def parse(self, response):
        """Drive the browser to the target page and extract its description.

        Args:
            response: The Scrapy response for a start URL; only its ``url``
                is used, to point the browser at the same page.

        Returns:
            dict: ``{'description': [...]}`` where the value is the list of
            text nodes matched on the page Selenium ended up on.
        """
        # Navigate through the site to get to the page to scrape.
        self.driver.get(response.url)
        self._click_and_wait("//*[@id='menu-84852-1']/a")
        self._click_and_wait(
            "//*[@id='content']/article/ul[1]/li[1]/article/h4/a"
        )

        # Parse the HTML the browser already holds instead of issuing a new
        # Scrapy request, which would invalidate the session token.
        rendered = Selector(text=self.driver.page_source)
        description = rendered.xpath(
            "//*[@id='content']/article/div/p/text()"
        ).extract()
        return {'description': description}

    def closed(self, reason):
        """Shut the browser down when the spider closes.

        Doing this here rather than at the end of ``parse`` releases the
        browser even if parsing raised, and keeps the driver usable if more
        than one response is parsed.
        """
        self.driver.quit()

    def _click_and_wait(self, xpath):
        """Click the element located by *xpath*, then pause for rendering."""
        # The generic find_element("xpath", ...) form is used because the
        # find_element_by_* helpers are absent from recent Selenium releases.
        link = self.driver.find_element("xpath", xpath)
        link.click()
        time.sleep(self.render_delay)  # Let any js render in page