我在显示我想要的项目时遇到问题。我的代码如下:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import request
from scrapy.selector import HtmlXPathSelector
from texashealth.items import TexashealthItem
class texashealthspider(CrawlSpider):
    """Spider for the job search pages on jobs.texashealth.org.

    Follows every link whose URL matches ``search/`` and passes each
    fetched page to :meth:`parse_health`.
    """

    name = "texashealth"
    allowed_domains = ['jobs.texashealth.org']
    start_urls = ['http://jobs.texashealth.org/search/?&q=&title=Filter%3A%20title&facility=Filter%3A%20facility&location=Filter%3A%20city&date=Filter%3A%20date']
    rules = (
        Rule(SgmlLinkExtractor(allow=("search/",)), callback="parse_health", follow=True),
        #Rule(SgmlLinkExtractor(allow=("startrow=\d",)),callback="parse_health",follow=True),
    )

    def parse_health(self, response):
        """Build one ``TexashealthItem`` per table cell found in *response*.

        The XPath selects every ``td`` under ``//tbody/tr``, and the field
        XPaths are evaluated relative to that single cell, so each item
        only receives the fields whose ``span`` lives in that cell.

        Returns:
            The list of items created for the page.
        """
        hxs = HtmlXPathSelector(response)
        cells = hxs.select('//tbody/tr/td')
        items = []
        # A distinct loop variable avoids rebinding the collection being
        # iterated (the original used ``for titles in titles``).
        for cell in cells:
            item = TexashealthItem()
            item['title'] = cell.select('span[@class="jobTitle"]/a/text()').extract()
            item['link'] = cell.select('span[@class="jobTitle"]/a/@href').extract()
            item['shifttype'] = cell.select('span[@class="jobShiftType"]/text()').extract()
            item['location'] = cell.select('span[@class="jobLocation"]/text()').extract()
            items.append(item)
        # Debug dump of the scraped items; the parenthesised form prints the
        # same text under Python 2 and is also valid Python 3.
        print(items)
        return items
目前得到的输出(JSON 格式)如下所示:
[
TexashealthItem(location=[], link=[u'/job/Fort-Worth-ULTRASONOGRAPHER-II-Job-TX-76101/31553900/'], shifttype=[], title=[u'ULTRASONOGRAPHER II Job']),
TexashealthItem(location=[], link=[], shifttype=[u'Texas Health Fort Worth'], title=[]),
TexashealthItem(location=[u'Fort Worth, TX, US'], link=[], shifttype=[], title=[]),
TexashealthItem(location=[], link=[], shifttype=[], title=[]),
TexashealthItem(location=[], link=[u'/job/Kaufman-RN-Acute-ICU-Full-Time-Kaufman-Job-TX-75142/35466900/'], shifttype=[], title=[u'RN--Telemetry--Full Time--Kaufman Job']),
TexashealthItem(location=[], link=[], shifttype=[u'Texas Health Kaufman'], title=[]),
TexashealthItem(location=[u'Kaufman, TX, US'], link=[], shifttype=[], title=[]),
TexashealthItem(location=[], link=[], shifttype=[], title=[]),
TexashealthItem(location=[], link=[u'/job/Fort-Worth-NURSE-PRACTITIONER-Occ-Med-Full-Time-Alliance-Job-TX-76101/35465400/'], shifttype=[], title=[u'NURSE PRACTITIONER-Occ Med-Full Time-Alliance Job']),
TexashealthItem(location=[], link=[], shifttype=[u'Texas Health Alliance'], title=[]),
TexashealthItem(location=[u'Fort Worth, TX, US'], link=[], shifttype=[], title=[]),
TexashealthItem(location=[], link=[], shifttype=[], title=[])
]
如上所示,每个条目的各个字段被分散显示:标题和链接显示在一行中,其余字段则分别显示在其他单独的行中。
有没有办法让每个条目的所有字段一次性显示在同一行中?
感谢您的帮助
答案 0 :(得分:1)
您应该遍历表格行(tr 元素),而不是表格单元格(td 元素)。
我建议您使用 hxs.select('//table[@id="searchresults"]/tbody/tr') 选取所有行,然后在每次循环迭代中使用以 .//span... 开头的相对 XPath 提取各个字段:
def parse_health(self, response):
    """Build one ``TexashealthItem`` per row of the search-results table.

    Each ``tr`` of the table with id ``searchresults`` is selected, and the
    field XPaths start with ``.//`` so they search anywhere inside that row
    rather than only among its direct children.

    Returns:
        The list of items created for the page.
    """
    hxs = HtmlXPathSelector(response)
    rows = hxs.select('//table[@id="searchresults"]/tbody/tr')
    items = []
    for row in rows:
        # A fresh item per row; without this the loop would either fail on
        # an undefined name or keep appending the same object.
        item = TexashealthItem()
        item['title'] = row.select('.//span[@class="jobTitle"]/a/text()').extract()
        item['link'] = row.select('.//span[@class="jobTitle"]/a/@href').extract()
        item['shifttype'] = row.select('.//span[@class="jobShiftType"]/text()').extract()
        item['location'] = row.select('.//span[@class="jobLocation"]/text()').extract()
        items.append(item)
    return items