I want Scrapy to read an HTML table and return a CSV containing the information I need from the table. Each item field corresponds to a <th> element within the <tr>. I think my XPath is where I'm going wrong, but I'm not sure whether I'm using the selectors correctly either.
# -*- coding: utf-8 -*-
from scrapy.spiders import Spider
from scrapy.selector import Selector

from scraper_app.items import GenomeCanadaGrants


class GCSpider(Spider):
    """Spider for Genome Canada Awards page."""

    name = 'GCSpider'
    allowed_domains = ["http://genomereports.ca"]
    start_urls = ["http://genomereports.ca/section.php?Action=List2&Lang=En&addnew=&Report=consolidated_commitments.php&Report_Text=Funding+Commitments&Nav=Section&ID=3&Login=&Password=&Consolidated_Centre=ALL&Consolidated_Category=ALL&Consolidated_Sector=ALL&Consolidated_Competition=ALL&Consolidated_FY=ALL&Consolidated_Status=ALL"]

    def parse(self, response):
        sel = Selector(response)
        grants = sel.xpath('//html/body/table[1]/tr[]')
        items = []
        for response in grants:
            item = GenomeCanadaGrants()
            item['Province'] = response.xpath('./th[1]/text()').extract()  # Column Header: CENTRE
            item['Sector'] = response.xpath('./th[2]/text()').extract()  # Column Header: SECTOR
            item['Fund'] = response.xpath('./th[3]/text()').extract()  # Column Header: PROGRAM & Fiscal Yr Awarded
            item['Date'] = response.xpath('./th[3]/text()').re('\d+\d-\d+\d')  # Column Header: PROGRAM & Fiscal Yr Awarded
            item['Status'] = response.xpath('./th[4]/text()').extract  # Column Header: STATUS
            item['Principal_Investigator'] = response.xpath('./th[5]/text()').extract()  # Column Header: PROJECT LEADER(S)
            item['Project_Title'] = response.xpath("./th[6]/text()").extract  # Column Header: PROJECT TITLE
            item['Additional_Information'] = response.xpath("./th[7]//a[@href='url']").extract  # Link to PDF with Project Details
            item['Amount'] = response.xpath('./th[8]/text()').extrac  # Column Header: APPROVED BUDGET
            item['GC_Contribution'] = response.xpath('./th[9]/text()').extract  # Column Header: GC CONTRIBUTION
            items.append(item)
        return items
Answer 0 (score: 0)
I think it's just that you are not locating the desired table correctly: it sits inside a div with id="content_frame".
Here is sample code that works for me:
def parse(self, response):
    for row in response.css('div#content_frame table tr'):
        try:
            cells = row.xpath(".//th//text()")
            first_cell = cells[0].extract()
            print(first_cell)
        except IndexError:
            continue  # just to skip rows that don't contain a CENTRE value
This prints the contents of the first column:
CENTRE
Genome British Columbia
Genome British Columbia
Genome British Columbia
Genome Alberta
Genome Alberta
Genome Alberta
...
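
For completeness, here is a rough sketch of how the answer's div#content_frame selector could be folded back into the original GCSpider so that each data row becomes a GenomeCanadaGrants item. The column-to-field mapping is taken from the comments in the question, and the assumption that every cell (including the header row) is a <th> comes from the printed output above; both may need adjusting against the live page. The method below would replace the original parse(), and the module needs import re at the top.

    def parse(self, response):
        # The awards table sits inside div#content_frame; every row,
        # including the header row, appears to use <th> cells.
        for row in response.css('div#content_frame table tr'):
            ths = row.xpath('./th')
            # Join all text nodes inside each cell so column positions stay stable.
            cells = [' '.join(th.xpath('.//text()').extract()).strip() for th in ths]
            if len(cells) < 9 or cells[0] == 'CENTRE':
                continue  # skip the header row and anything that is not a full data row
            item = GenomeCanadaGrants()
            item['Province'] = cells[0]                # CENTRE
            item['Sector'] = cells[1]                  # SECTOR
            item['Fund'] = cells[2]                    # PROGRAM & Fiscal Yr Awarded
            match = re.search(r'\d+-\d+', cells[2])    # fiscal year such as "2009-10" (pattern is a guess)
            item['Date'] = match.group(0) if match else ''
            item['Status'] = cells[3]                  # STATUS
            item['Principal_Investigator'] = cells[4]  # PROJECT LEADER(S)
            item['Project_Title'] = cells[5]           # PROJECT TITLE
            links = ths[6].xpath('.//a/@href').extract()
            item['Additional_Information'] = links[0] if links else ''  # PDF link, if present
            item['Amount'] = cells[7]                  # APPROVED BUDGET
            item['GC_Contribution'] = cells[8]         # GC CONTRIBUTION
            yield item

Because the items are yielded, running scrapy crawl GCSpider -o grants.csv should write the CSV directly through Scrapy's built-in feed export.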