The following Scrapy CrawlSpider works as expected, except for the URL output (response.url):
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class Spider2(CrawlSpider):
    # name of the spider
    name = 'newstl'

    # list of allowed domains
    allowed_domains = ['graphics.stltoday.com']

    # starting url for scraping
    start_urls = ['http://graphics.stltoday.com/apps/payrolls/salaries/agencies/']

    rules = [
        Rule(LinkExtractor(
            allow=['/apps/payrolls/salaries/.*/$']),
            callback='parse_item',
            follow=True),
    ]

    # setting the location of the output csv file
    custom_settings = {
        'FEED_FORMAT': "csv",
        'FEED_URI': 'tmp/stltoday1.csv'
    }

    def parse_item(self, response):
        # Remove XML namespaces
        response.selector.remove_namespaces()

        # Extract article information
        name = response.xpath('//th[@scope="row"]/text()').extract()
        position = response.xpath('//th[@scope="row"]/following-sibling::*[1]/text()').extract()
        salary = response.xpath('//th[@scope="row"]/following-sibling::*[2]/text()').extract()
        hiredate = response.xpath('//th[@scope="row"]/following-sibling::*[3]/text()').extract()
        url = response.url

        for item in zip(name, position, salary, hiredate, url):
            scraped_info = {
                'url': item[4],
                'name': item[0],
                'position': item[1],
                'salary': item[2],
                'hiredate': item[3]
            }
            yield scraped_info
The output shows only one character of the URL in each row of the CSV. Is there a way to repeat the entire URL for each record?
Answer 0 (score: 2)
You should not zip the url; just set it directly:
url = response.url
for item in zip(name, position, salary, hiredate):
    yield {
        'url': url,
        'name': item[0],
        'position': item[1],
        'salary': item[2],
        'hiredate': item[3]
    }
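For reference, the one-character-per-row behavior comes from zip() iterating the URL string character by character. A minimal standalone sketch with placeholder values (not real scraped data) illustrates the difference:

# Hypothetical extracted lists and a single URL string
names = ['Alice', 'Bob']
positions = ['Clerk', 'Analyst']
url = 'http://example.com/page'

# Zipping the string pairs each row with one character of the URL:
print(list(zip(names, positions, url)))
# [('Alice', 'Clerk', 'h'), ('Bob', 'Analyst', 't')]

# Setting the url directly keeps the full string on every record:
print([{'url': url, 'name': n, 'position': p} for n, p in zip(names, positions)])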
Also, instead of traversing the whole tree multiple times, iterate over the result rows and get the desired information from each row's context:
for row in response.xpath('//th[@scope="row"]'):
    yield {
        "url": url,
        "name": row.xpath('./text()').extract_first(),
        "position": row.xpath('./following-sibling::*[1]/text()').extract_first(),
        "salary": row.xpath('./following-sibling::*[2]/text()').extract_first(),
        "hiredate": row.xpath('./following-sibling::*[3]/text()').extract_first(),
    }
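Putting it together, the revised callback inside the original spider could look like the sketch below. It reuses the same XPath structure and field names as the question; nothing else in the spider needs to change:

    def parse_item(self, response):
        # Remove XML namespaces so the XPath expressions match
        response.selector.remove_namespaces()
        url = response.url

        # Iterate the result rows once and read each field relative to the row
        for row in response.xpath('//th[@scope="row"]'):
            yield {
                'url': url,
                'name': row.xpath('./text()').extract_first(),
                'position': row.xpath('./following-sibling::*[1]/text()').extract_first(),
                'salary': row.xpath('./following-sibling::*[2]/text()').extract_first(),
                'hiredate': row.xpath('./following-sibling::*[3]/text()').extract_first(),
            }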