I have written a Scrapy project to pull some data from the Congress.gov website. Initially I wanted to collect data on all bills. The code ran and downloaded the data I wanted, but only for roughly half of the bills. So I started troubleshooting. I turned on AutoThrottle in the settings and added handling for "too many requests" responses (roughly along the lines of the sketch below). Then I limited the search to a single Congress (the 97th), Senate bills only, and re-ran the code. It downloaded most of the bills, but some were still missing. I then tried to grab the missing pages directly; in particular, I tried page 32, and I was able to scrape it successfully. So why aren't all the pages scraped when I use the recursive code?
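Illustrative settings.py excerpt (a sketch of the kind of thing I mean, not my exact file):

# enable AutoThrottle and retry on HTTP 429 "Too Many Requests" responses
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 1.0
AUTOTHROTTLE_MAX_DELAY = 10.0
RETRY_ENABLED = True
RETRY_HTTP_CODES = [429, 500, 502, 503, 504]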
Can anyone help me figure out what the problem is? Here is the code I use to get information on all bills from the 97th Congress:
from scrapy.spider import BaseSpider
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.selector import Selector
from senatescraper.items import senatescraperSampleItem
from scrapy.http.request import Request


class SenatebotSpider(BaseSpider):
    name = 'recursivesenatetablebot2'
    allowed_domains = ['www.congress.gov']

    def start_requests(self):
        baseurl = "https://www.congress.gov/search?q=%7B%22source%22%3A%22legislation%22%2C%22chamber%22%3A%22Senate%22%2C%22congress%22%3A%5B%2297%22%5D%2C%22type%22%3A%5B%22bills%22%5D%7D&page="
        for i in xrange(1, 32):
            beginurl = baseurl + `i`
            yield Request(beginurl, self.parse_bills)

    def parse_bills(self, response):
        sel = Selector(response)
        bills = sel.xpath("//span[5][@class='result-item']")
        for bill in bills:
            bill_url = bill.css("span.result-item a::attr(href)").extract()[0]
            yield Request(url=bill_url, callback=self.parse_items)

    def parse_items(self, response):
        sel = Selector(response)
        rows = sel.css('table.item_table tbody tr')
        items = []
        for row in rows:
            item = senatescraperSampleItem()
            item['bill'] = response.css('h1.legDetail::text').extract()
            item['dates'] = row.xpath('./td[1]/text()').extract()[0]
            item['actions'] = row.css('td.actions::text').extract()
            item['congress'] = response.css('h2.primary::text').extract()
            items.append(item)
        return items
Here is the code I used to scrape just page 32 of the search, filtered to the 97th Congress and Senate bills only:
import scrapy
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from scrapy.http.request import Request
from startingover.items import startingoverSampleItem


class DebuggingSpider(BaseSpider):
    name = 'debugging'
    allowed_domains = ['www.congress.gov']

    def start_requests(self):
        yield scrapy.Request(
            'https://www.congress.gov/search?q=%7B%22source%22%3A%22legislation%22%2C%22chamber%22%3A%22Senate%22%2C%22congress%22%3A%5B%2297%22%5D%2C%22type%22%3A%5B%22bills%22%5D%7D&page=32',
            self.parse_page)

    def parse_page(self, response):
        sel = Selector(response)
        bills = sel.xpath("//span[5][@class='result-item']")
        for bill in bills:
            bill_url = bill.css("span.result-item a::attr(href)").extract()[0]
            yield Request(url=bill_url, callback=self.parse_items)

    def parse_items(self, response):
        sel = Selector(response)
        rows = sel.css('table.item_table tbody tr')
        items = []
        for row in rows:
            item = startingoverSampleItem()
            item['bill'] = response.css('h1.legDetail::text').extract()
            item['dates'] = row.xpath('./td[1]/text()').extract()[0]
            item['actions'] = row.css('td.actions::text').extract()
            item['congress'] = response.css('h2.primary::text').extract()
            items.append(item)
        return items
My items:
from scrapy.item import Item, Field


class senatescraperSampleItem(Item):
    bill = Field()
    actions = Field(serializer=str)
    congress = Field(serializer=str)
    dates = Field()
Answer 0 (score: 0)
I think you are not seeing about half of the items you want to scrape because you are not resolving relative URLs. Using response.urljoin fixes it:
yield Request(url=response.urljoin(bill_url), callback=self.parse_items)
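To illustrate why that matters (a rough sketch with a hypothetical href, not one copied from the site): if the result links are relative paths, Request() never receives an absolute URL, while response.urljoin resolves them against the URL of the page being parsed:

from urllib.parse import urljoin  # response.urljoin does the same thing against response.url

page_url = 'https://www.congress.gov/search?page=1'               # the page being parsed
relative_href = '/bill/97th-congress/senate-bill/1/all-actions'   # hypothetical relative href
print(urljoin(page_url, relative_href))
# -> https://www.congress.gov/bill/97th-congress/senate-bill/1/all-actions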
You may also run into this exception:
2018-01-30 17:27:13 [scrapy.core.scraper] ERROR: Spider error processing <GET https://www.congress.gov/search?q=%7B%22source%22%3A%22legislation%22%2C%22chamber%22%3A%2
2Senate%22%2C%22congress%22%3A%5B%2297%22%5D%2C%22type%22%3A%5B%22bills%22%5D%7D&page=5> (referer: None)
Traceback (most recent call last):
File "/home/jorge/.virtualenvs/scrapy/lib/python3.6/site-packages/scrapy/utils/defer.py", line 102, in iter_errback
yield next(it)
File "/home/jorge/.virtualenvs/scrapy/lib/python3.6/site-packages/scrapy/spidermiddlewares/offsite.py", line 30, in process_spider_output
for x in result:
File "/home/jorge/.virtualenvs/scrapy/lib/python3.6/site-packages/scrapy/spidermiddlewares/referer.py", line 339, in <genexpr>
return (_set_referer(r) for r in result or ())
File "/home/jorge/.virtualenvs/scrapy/lib/python3.6/site-packages/scrapy/spidermiddlewares/urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "/home/jorge/.virtualenvs/scrapy/lib/python3.6/site-packages/scrapy/spidermiddlewares/depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "/tmp/stackoverflow/senatescraper/senatescraper/spiders/senatespider.py", line 25, in parse_bills
bill_url = bill.css("span.result-item a::attr(href)").extract()[0]
IndexError: list index out of range
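That IndexError comes from calling .extract()[0] on a selection that matched nothing. As a rough sketch (an optional guard, and unnecessary once the XPath below is used), .extract_first() returns None instead of raising, so empty result items can simply be skipped:

def parse_bills(self, response):
    for bill in response.css('span.result-item'):
        bill_url = bill.css('a::attr(href)').extract_first()
        if bill_url:  # skip result items without a link instead of crashing
            yield Request(url=response.urljoin(bill_url), callback=self.parse_items)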
To make sure you grab the URL from the element with the text "All Actions", and don't pick up anything odd that might precede it, you should combine the XPath queries as follows:
def parse_bills(self, response):
    sel = Selector(response)
    bills = sel.xpath(
        '//a[contains(@href, "all-actions")]/@href').extract()
    for bill in bills:
        yield Request(
            url=response.urljoin(bill),
            callback=self.parse_items,
            dont_filter=True)
Note the dont_filter=True argument: I added it because Scrapy was filtering out links it had already crawled (that is the default configuration). You can remove it if you handle duplicate-link filtering in some other way.
When you get exceptions, you can always wrap the code in try and except and start the debugging shell of Scrapy in the except block; it will help you inspect the response and see what is going on.
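A rough sketch of what I mean (not part of your spider as posted), using scrapy.shell.inspect_response:

from scrapy.shell import inspect_response

def parse_bills(self, response):
    try:
        bills = response.xpath('//a[contains(@href, "all-actions")]/@href').extract()
        for bill in bills:
            yield Request(url=response.urljoin(bill), callback=self.parse_items, dont_filter=True)
    except Exception:
        # drops you into an interactive shell with `response` in scope for inspection
        inspect_response(response, self)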
Answer 1 (score: 0)
I made the following change to my code and it works well:
def parse_bills(self, response):
    bills = Selector(response)
    billlinks = bills.xpath('//a[contains(@href,"/all-actions")]/@href')
    for link in billlinks:
        urllink = link.extract()
        yield Request(url=urllink, callback=self.parse_items)