Scrapy从表中的链接获取数据

时间:2016-05-16 15:43:25

标签: python scrapy scrapy-spider

我正在尝试从 Texas Death Row 页面的 HTML 表格中抓取数据。

我可以使用下面的蜘蛛脚本从表中提取现有数据:

from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector

from texasdeath.items import DeathItem

class DeathSpider(BaseSpider):
    """Spider that turns each row of the listing table into a ``DeathItem``.

    Only the cells of the table itself are read; links inside the table are
    not followed by this version.
    """

    name = "death"
    allowed_domains = ["tdcj.state.tx.us"]
    start_urls = [
        "https://www.tdcj.state.tx.us/death_row/dr_executed_offenders.html"
    ]

    def parse(self, response):
        """Yield one ``DeathItem`` per row matched by ``//table/tbody/tr``.

        Every field is stored as the raw result of ``extract()`` on the
        text nodes of the corresponding cell.
        """
        # (item field, XPath relative to the row), in the order the fields
        # are assigned on the item.
        cell_paths = (
            ('firstName', 'td[5]/text()'),
            ('lastName', 'td[4]/text()'),
            ('Age', 'td[7]/text()'),
            ('Date', 'td[8]/text()'),
            ('Race', 'td[9]/text()'),
            ('County', 'td[10]/text()'),
        )

        selector = HtmlXPathSelector(response)
        for row in selector.select('//table/tbody/tr'):
            record = DeathItem()
            for field_name, cell_xpath in cell_paths:
                record[field_name] = row.select(cell_xpath).extract()
            yield record

问题是表格中还有一些链接,我想请求这些链接,并把从链接页面中获取的数据附加到我的 item 中。

Scrapy 教程(Scrapy Tutorial)中似乎有一个如何从目录页面中提取数据的指南。但我无法弄清楚如何既从主页面获取数据,又从表格中的链接所指向的页面获取数据并一起返回。

1 个答案:

答案 0 :(得分:1)

不要直接 yield 一个 item,而是 yield 一个 Request,并把 item 放在 meta 中传递。文档(here)中介绍了这一点。

下面是蜘蛛的示例实现:如果"罪犯信息"链接指向罪犯的"详情"页面,蜘蛛就会跟进该链接(有时该链接指向的是一张图片 - 在这种情况下,蜘蛛会直接输出当前已有的数据):

from urlparse import urljoin

import scrapy


class DeathItem(scrapy.Item):
    """Scrapy item holding the data collected for one row of the listing table.

    The spider in this file assigns every field the result of an
    ``extract()`` call on an XPath selection.
    """

    # Fields assigned in DeathSpider.parse from the cells of a table row.
    firstName = scrapy.Field()
    lastName = scrapy.Field()
    Age = scrapy.Field()
    Date = scrapy.Field()
    Race = scrapy.Field()
    County = scrapy.Field()
    # Assigned in DeathSpider.parse_details from the linked details page;
    # left unset when the row's link is not followed.
    Gender = scrapy.Field()


class DeathSpider(scrapy.Spider):
    """Scrape the listing table and enrich each row from its details page.

    Every table row becomes a ``DeathItem``. When the link in the row's
    second cell points to an HTML page, that page is requested and
    ``parse_details`` adds ``Gender`` before the item is emitted. Rows whose
    link is missing or points elsewhere (e.g. to an image) are emitted with
    the table data only.
    """

    name = "death"
    allowed_domains = ["tdcj.state.tx.us"]
    start_urls = [
        "https://www.tdcj.state.tx.us/death_row/dr_executed_offenders.html"
    ]

    def parse(self, response):
        """Yield an item, or a details-page request carrying it, per row.

        :param response: response for one of ``start_urls``.
        :returns: generator of ``DeathItem`` and ``scrapy.Request`` objects.
        """
        sites = response.xpath('//table/tbody/tr')
        for site in sites:
            item = DeathItem()

            item['firstName'] = site.xpath('td[5]/text()').extract()
            item['lastName'] = site.xpath('td[4]/text()').extract()
            item['Age'] = site.xpath('td[7]/text()').extract()
            item['Date'] = site.xpath('td[8]/text()').extract()
            item['Race'] = site.xpath('td[9]/text()').extract()
            item['County'] = site.xpath('td[10]/text()').extract()

            href = site.xpath("td[2]/a/@href").extract_first()
            if href is None:
                # No link in this row. Joining a missing href with the page
                # URL would give back the listing URL itself, which ends in
                # "html" and would be requested as if it were a details
                # page instead of emitting the item.
                yield item
                continue

            # response.urljoin resolves the href against this page's base
            # URL and works the same on Python 2 and 3.
            url = response.urljoin(href)
            if url.endswith("html"):
                # Carry the partially filled item to the details callback.
                yield scrapy.Request(url, meta={"item": item}, callback=self.parse_details)
            else:
                # Link does not point to an HTML page; emit what we have.
                yield item

    def parse_details(self, response):
        """Add ``Gender`` from the details page to the item and emit it.

        :param response: details-page response whose ``meta["item"]`` holds
            the item built in ``parse``.
        :returns: generator yielding the completed ``DeathItem``.
        """
        item = response.meta["item"]
        # The value sits in the cell right after the one whose text is
        # exactly 'Gender'.
        item["Gender"] = response.xpath("//td[. = 'Gender']/following-sibling::td[1]/text()").extract()
        yield item