Please show me how to write the rules for SgmlLinkExtractor.
I'm confused and can't make sense of the English documentation.
I want to crawl a site with many pages; the URL pattern is:
http://abctest.com/list.php?c=&&page=1
http://abctest.com/list.php?c=&&page=2
http://abctest.com/list.php?c=&&page=3 ...
Here is my code:
from scrapy.selector import Selector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
import re

class Spider(CrawlSpider):
    name = "find"
    start_urls = ["http://abctest.com/list.php?c=&&page=1", ]

    # crawl 2 pages to test if the data is normal  allow=('?c=&&page=/d+')
    rules = [Rule(SgmlLinkExtractor(allow=('?c=&&page=2')), callback='parse_item', follow=True)]

    # get the page1 item
    def parse(self, response):
        sel = Selector(response)
        sites = sel.css("div#list table tr")
        for site in sites:
            item = LAItem()
            item['day'] = site.css("td.date::text").extract()
            item['URL'] = site.css("td.subject a::attr(href)").extract()
            yield item

    # get the page2 item
    def parse_item(self, response):
        sel = Selector(response)
        sites = sel.css("div#list table tr")
        for site in sites:
            item = LAItem()
            item['day'] = site.css("td.date::text").extract()
            item['URL'] = site.css("td.subject a::attr(href)").extract()
            yield item
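LAItem is not defined in the snippet above; it is presumably a Scrapy Item declared in the project's items.py. A minimal sketch matching the two fields the spider fills in (day and URL):

from scrapy.item import Item, Field

# Hypothetical item definition matching the fields used by the spider.
class LAItem(Item):
    day = Field()
    URL = Field()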
Answer 0 (score: 1)
You don't really need a LinkExtractor and CrawlSpider here, just a regular Spider. What you need is to define a start_requests() method and yield the requests from it:
from scrapy import Request, Spider
from scrapy.exceptions import CloseSpider
from scrapy.selector import Selector

URL = 'http://abctest.com/list.php?c=&&page={page}'

class Spider(Spider):
    handle_httpstatus_list = [404]
    name = "find"

    def start_requests(self):
        index = 1
        while True:
            yield Request(URL.format(page=index))
            index += 1

    def parse(self, response):
        if response.status == 404:
            raise CloseSpider("Met the page which doesn't exist")

        sel = Selector(response)
        sites = sel.css("div#list table tr")
        for site in sites:
            item = LAItem()
            item['day'] = site.css("td.date::text").extract()
            item['URL'] = site.css("td.subject a::attr(href)").extract()
            yield item
Note that the trick here is to keep generating page requests until we hit the first 404 response, i.e. a page that doesn't exist (handle_httpstatus_list = [404] is what lets parse() see that response instead of Scrapy's HTTP-error middleware dropping it). This should make it work for an arbitrary number of pages.
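If the site responds with an ordinary 200 page containing an empty listing instead of a 404 once the page number runs past the end, the same stop condition can be keyed on the row count rather than the status code. A minimal sketch of that variation, assuming the same URL pattern and selectors as above and that LAItem can be imported from the project's items module (the module path here is hypothetical):

from scrapy import Request, Spider
from scrapy.exceptions import CloseSpider
from scrapy.selector import Selector

from myproject.items import LAItem  # hypothetical path; LAItem as sketched earlier

URL = 'http://abctest.com/list.php?c=&&page={page}'

class FindSpider(Spider):
    name = "find"

    def start_requests(self):
        # Keep generating page URLs; the crawl is stopped from parse() below.
        index = 1
        while True:
            yield Request(URL.format(page=index))
            index += 1

    def parse(self, response):
        sel = Selector(response)
        sites = sel.css("div#list table tr")
        if not sites:
            # An empty listing means we went past the last page, so stop the crawl.
            raise CloseSpider("Reached an empty listing page")
        for site in sites:
            item = LAItem()
            item['day'] = site.css("td.date::text").extract()
            item['URL'] = site.css("td.subject a::attr(href)").extract()
            yield item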