我的目标是从县政府的公开网站抓取房产税数据。使用 Scrapy 和 Selenium 时,我能够采集到地块编号(parcel ID)、地址和业主姓名,但表格中的元素在输出里却是 null;而且当我禁用 JavaScript 时,该表格就会消失。
在该县网站上的每次搜索都会返回一个房产(property)列表,其中每一项都带有指向其税务详情页的链接。我正尝试从这些链接指向的页面中抓取数据。
采集 land_value 和 total_appr 的最佳方法是什么?另外,当我在 Chrome 开发者工具中禁用 JavaScript 后,这些值就不可见了,那么我是否必须在另一个 for 循环中使用 Selenium?具体该怎么做呢?谢谢。
from abc import ABC
from time import sleep
from urllib.parse import urljoin

import scrapy
from scrapy import Spider
from scrapy.http import Request
from scrapy.selector import Selector
from scrapy.spiders import CrawlSpider
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
class PropertiesSpider(Spider, ABC):
    """Scrape property tax details from the Gwinnett County assessor site.

    The search form is driven with Selenium; every result link is then
    scheduled as a regular Scrapy request and handled by
    ``parse_property``.  The appraisal values are read from a
    Selenium-rendered copy of the detail page, because the plain HTTP
    response reportedly lacks that table when JavaScript is not executed.

    Class attributes ``search_url``, ``chromedriver_path``, ``search_term``
    and ``render_timeout`` can be overridden per run with Scrapy's
    ``-a name=value`` spider arguments.
    """

    name = 'properties'
    # Host names only: Scrapy's offsite filtering matches against the request
    # host, so an entry that includes a URL path never matches anything.
    allowed_domains = ['gwinnettassessor.manatron.com']

    search_url = 'http://gwinnettassessor.manatron.com/IWantTo/PropertyGISSearch.aspx'
    chromedriver_path = '/Users/wenzeljoe/CondaProjects/chromedriver'
    search_term = 'smith'
    # Seconds to wait for JavaScript-rendered elements to appear.
    render_timeout = 10

    def start_requests(self):
        """Run the search in Chrome and yield one request per result link.

        Yields:
            Request: one per ``href`` found in the result list, with
            ``parse_property`` as its callback.
        """
        # NOTE(review): the positional driver path is the Selenium 3 calling
        # convention; newer Selenium releases expect a Service object instead.
        self.driver = webdriver.Chrome(self.chromedriver_path)
        self.driver.get(self.search_url)

        search_bar = self.driver.find_element(
            By.XPATH, "//table/tbody/tr/td/div/input")
        search_bar.send_keys(self.search_term)
        self.driver.find_element(By.XPATH, "//button/span").click()

        # The result list is filled in after the click, so reading
        # page_source immediately can capture the page before any links exist.
        self._wait_for("//ul[2]/li/a")

        results = Selector(text=self.driver.page_source)
        # Resolve each link the way a browser would, relative to the page
        # that is currently displayed, instead of gluing strings together.
        base_url = self.driver.current_url
        for href in results.xpath("//ul[2]/li/a/@href").extract():
            yield Request(urljoin(base_url, href), callback=self.parse_property)

    def parse_property(self, response):
        """Extract one property's details from its detail page.

        Args:
            response: Scrapy response for a property detail page.

        Yields:
            dict: ``parcel_id``, ``address``, ``owner_name``, ``land_value``
            and ``total_appraisal``; any field whose XPath matches nothing
            is ``None``.
        """
        general_info = '//table[@class="ui-widget-content ui-table generalinfo"]/tbody'
        parcel_id = response.xpath(general_info + '/tr[2]/td/text()').extract_first()
        address = response.xpath(general_info + '/tr[4]/td/text()').extract_first()
        name = response.xpath(general_info + '/tr/td/text()').extract_first()

        # The value table only shows up once JavaScript has run, so these
        # two fields are looked up in the browser-rendered page.
        rendered = self._render(response)
        land_value = rendered.xpath(
            "//table[@class='ui-table']/tbody/tr[3]/td/text()").extract_first()
        total_appr = rendered.xpath(
            "//td[@class='ui-widget String ui-state-default']/text()").extract_first()

        yield {
            'parcel_id': parcel_id,
            'address': address,
            'owner_name': name,
            'land_value': land_value,
            'total_appraisal': total_appr
        }

    def closed(self, reason):
        """Quit the browser when the spider closes so Chrome is not leaked."""
        driver = getattr(self, 'driver', None)
        if driver is not None:
            driver.quit()

    def _render(self, response):
        """Return a selector over the Selenium-rendered version of *response*.

        Falls back to *response* itself when no driver has been started
        (for example when ``parse_property`` is invoked without
        ``start_requests`` having run).
        """
        driver = getattr(self, 'driver', None)
        if driver is None:
            return response
        # This call blocks the crawl while the browser loads the page; the
        # single shared driver is reused for every detail page.
        driver.get(response.url)
        self._wait_for("//table[@class='ui-table']")
        return Selector(text=driver.page_source)

    def _wait_for(self, xpath):
        """Wait up to ``render_timeout`` seconds for *xpath* to match.

        Returns:
            bool: ``True`` if a matching element appeared, ``False`` if the
            wait timed out (a warning is logged and scraping continues).
        """
        try:
            WebDriverWait(self.driver, self.render_timeout).until(
                lambda drv: drv.find_elements(By.XPATH, xpath))
        except TimeoutException:
            self.logger.warning(
                "Timed out waiting for %s on %s", xpath, self.driver.current_url)
            return False
        return True