I hope my request is fairly straightforward for experienced Scrapy users.
Essentially, the code below works for scraping a second level of pages from the links found on the first-level page. I would like to extend it so that it also scrapes a third level, using links found on the second-level pages. With the code below, def parse_items handles the landing page (level 1), which contains 50 listings, and the code is set up to recursively scrape each of those 50 links. def parse_listing_page specifies which items to scrape from each listing page. Within each listing page, I would like my script to follow a link to one more page and scrape one or two items there before returning to the listing page and then back to the landing page.
The code below works for 2-level recursive scraping. How can I extend it to 3 levels?
from scrapy import log
from scrapy.log import ScrapyFileLogObserver
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from firstproject.items import exampleItem
from scrapy.http import Request
import urlparse
logfile_info = open('example_INFOlog.txt', 'a')
logfile_error = open('example_ERRlog.txt', 'a')
log_observer_info = log.ScrapyFileLogObserver(logfile_info, level=log.INFO)
log_observer_error = log.ScrapyFileLogObserver(logfile_error, level=log.ERROR)
log_observer_info.start()
log_observer_error.start()
class MySpider(CrawlSpider):
    name = "example"
    allowed_domains = ["example.com.au"]

    rules = (
        Rule(SgmlLinkExtractor(allow=("",), restrict_xpaths=('//li[@class="nextLink"]',)),
             callback="parse_items", follow=True),
    )

    def start_requests(self):
        start_urls = reversed([
            "http://www.example.com.au/1?new=true&list=10-to-100",
            "http://www.example.com.au/2?new=true&list=10-to-100",
            "http://www.example.com.au/2?new=true&list=100-to-200",
        ])
        return [Request(url=start_url) for start_url in start_urls]

    def parse_start_url(self, response):
        return self.parse_items(response)

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        listings = hxs.select("//h2")
        items = []
        for listing in listings:
            item = exampleItem()
            item["title"] = listing.select("a/text()").extract()[0]
            item["link"] = listing.select("a/@href").extract()[0]
            items.append(item)
            url = "http://example.com.au%s" % item["link"]
            yield Request(url=url, meta={'item': item}, callback=self.parse_listing_page)

    def parse_listing_page(self, response):
        hxs = HtmlXPathSelector(response)
        item = response.meta['item']
        item["item_1"] = hxs.select('#censored Xpath').extract()
        item["item_2"] = hxs.select('#censored Xpath').extract()
        item["item_3"] = hxs.select('#censored Xpath').extract()
        item["item_4"] = hxs.select('#censored Xpath').extract()
        return item
Many thanks.
Answer 0 (score: 1)
This is how the code flow works. It starts with the Rule constructor in the MySpider class being invoked. The Rule constructor has its callback set to parse_items. There is a yield at the end of parse_items, which recurses the function into parse_listing_page. If you want to recurse down to a third level from parse_listing_page, there has to be a Request yielded from parse_listing_page.
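As an illustration, a minimal sketch of what that third-level hop could look like. The parse_listing_counter callback name and the link XPath are placeholders I am assuming for the example, not part of the original code; both methods would live inside the spider class.

def parse_listing_page(self, response):
    hxs = HtmlXPathSelector(response)
    item = response.meta['item']
    item["item_1"] = hxs.select('#censored Xpath').extract()
    # hypothetical: the link on the listing page that leads to the third-level page
    counter_link = hxs.select('#censored Xpath').extract()[0]
    url = "http://example.com.au%s" % counter_link
    # yield a Request instead of returning the item, so the crawl descends one more level;
    # the partially-filled item travels along in meta
    yield Request(url=url, meta={'item': item}, callback=self.parse_listing_counter)

def parse_listing_counter(self, response):
    hxs = HtmlXPathSelector(response)
    item = response.meta['item']
    item["counter"] = hxs.select('#censored Xpath').extract()
    # deepest level: the completed item is returned from here
    return item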
Answer 1 (score: 1)
Here is my updated code. The code below is able to extract counter_link in the proper format (tested), but it seems to go down the else branch, so parse_listing_counter never yields anything. If I remove the if and else clauses and force the code to call back to parse_listing_counter, it doesn't yield any items (not even the ones from parse_items or the listing pages).
What have I got wrong in my code? I've checked the XPaths as well - everything seems fine.
from scrapy import log
from scrapy.log import ScrapyFileLogObserver
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from firstproject.items import exampleItem
from scrapy.http import Request
import urlparse
logfile_info = open('example_INFOlog.txt', 'a')
logfile_error = open('example_ERRlog.txt', 'a')
log_observer_info = log.ScrapyFileLogObserver(logfile_info, level=log.INFO)
log_observer_error = log.ScrapyFileLogObserver(logfile_error, level=log.ERROR)
log_observer_info.start()
log_observer_error.start()
class MySpider(CrawlSpider):
    name = "example"
    allowed_domains = ["example.com.au"]

    rules = (
        Rule(SgmlLinkExtractor(allow=("",), restrict_xpaths=('//li[@class="nextLink"]',)),
             callback="parse_items", follow=True),
    )

    def start_requests(self):
        start_urls = reversed([
            "http://www.example.com.au/1?new=true&list=10-to-100",
            "http://www.example.com.au/2?new=true&list=10-to-100",
            "http://www.example.com.au/2?new=true&list=100-to-200",
        ])
        return [Request(url=start_url) for start_url in start_urls]

    def parse_start_url(self, response):
        return self.parse_items(response)

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        listings = hxs.select("//h2")
        items = []
        for listing in listings:
            item = exampleItem()
            item["title"] = listing.select("a/text()").extract()[0]
            item["link"] = listing.select("a/@href").extract()[0]
            items.append(item)
            url = "http://example.com.au%s" % item["link"]
            yield Request(url=url, meta={'item': item}, callback=self.parse_listing_page)

    def parse_listing_page(self, response):
        hxs = HtmlXPathSelector(response)
        item = response.meta['item']
        item["item_1"] = hxs.select('#censored Xpath').extract()
        item["item_2"] = hxs.select('#censored Xpath').extract()
        item["item_3"] = hxs.select('#censored Xpath').extract()
        item["item_4"] = hxs.select('#censored Xpath').extract()
        item["counter_link"] = hxs.select('#censored Xpath').extract()[0]
        counter_link = response.meta.get('counter_link', None)
        if counter_link:
            url2 = "http://example.com.au%s" % item["counter_link"]
            yield Request(url=url2, meta={'item': item}, callback=self.parse_listing_counter)
        else:
            yield item

    def parse_listing_counter(self, response):
        hxs = HtmlXPathSelector(response)
        item = response.meta['item']
        item["counter"] = hxs.select('#censored Xpath').extract()
        return item
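For context on the if/else check above: in Scrapy, response.meta in a callback contains only the keys that were placed in the previous Request's meta dict (plus Scrapy's own bookkeeping), not fields stored on the item. A minimal sketch of that mechanism, with illustrative values and names, assuming both methods sit inside the spider class:

def parse_listing_page(self, response):
    item = response.meta['item']            # present because 'item' was passed in the previous Request's meta
    item["counter_link"] = "/counter/123"   # hypothetical value extracted from the listing page
    # any value the next callback should read from response.meta has to be passed here explicitly
    yield Request(
        url="http://example.com.au%s" % item["counter_link"],
        meta={'item': item, 'counter_link': item["counter_link"]},
        callback=self.parse_listing_counter,
    )

def parse_listing_counter(self, response):
    item = response.meta['item']                       # passed in meta above
    counter_link = response.meta.get('counter_link')   # set only because it was passed in meta above
    return item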