这是我的Scrapy
蜘蛛。我试图从网上搜集一些数据。但我不知道如何强制Scrapy
递归地跟踪链接。我的错误在哪里?
import re
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from YellowPagesOfMoldova.items import YellowpagesofmoldovaItem
from scrapy.item import Item
class YellowSpider(CrawlSpider):
    """Crawl the English section of yellowpages.md and scrape company pages.

    BUG FIX: the original version overrode ``parse``, but ``CrawlSpider``
    implements its link-following logic inside ``parse`` itself — overriding
    it silently disables the ``rules`` machinery, so the spider never
    followed any links.  The callback is renamed to ``parse_items`` and
    registered on the ``Rule`` instead.
    """

    name = 'yellow'
    allowed_domains = ['yellowpages.md']
    start_urls = ['http://www.yellowpages.md/eng/companies/info/8939-arc-publishing-house']
    rules = (
        # follow=True keeps crawling outward from every matched page;
        # callback must NOT be named 'parse' (see class docstring).
        Rule(SgmlLinkExtractor(allow=('eng.+')), callback='parse_items', follow=True),
    )

    def parse_items(self, response):
        """Extract one company item from a crawled page.

        :param response: the downloaded page (scrapy Response).
        :returns: a populated ``YellowpagesofmoldovaItem``.
        """
        sel = Selector(response)
        i = YellowpagesofmoldovaItem()
        i['url'] = response.url
        # XPaths are position-based and tied to the site's table layout.
        i['locality'] = sel.xpath("//tr[3]/td/p[1]/span[1]/text()").extract()
        i['title'] = sel.xpath('//title/text()').extract()
        i['title2'] = sel.xpath("//td/h1/text()").extract()
        i['website'] = sel.xpath("//p[2]/a/text()").extract()
        i['activity'] = sel.xpath("//tbody/tr[4]/td/p/text()").extract()
        i['street'] = sel.xpath("//tr/td/p[1]/span[2]/text()").extract()
        return i
感谢。
我解决了这个问题。现在它完美无缺。它看起来如此:
import re
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from YellowPagesOfMoldova.items import YellowpagesofmoldovaItem
from scrapy.item import Item
class YellowSpider(CrawlSpider):
    """Spider for the English section of yellowpages.md.

    Follows every link matching ``eng.+`` and scrapes company details
    from each visited page via ``parse_items``.
    """

    name = 'yellow'
    allowed_domains = ['yellowpages.md']
    start_urls = ['http://www.yellowpages.md/eng/companies/info/8939-arc-publishing-house']
    rules = (
        Rule(SgmlLinkExtractor(allow=('eng.+')), callback='parse_items', follow=True),
    )

    # Item field -> XPath expression used to populate it.
    _FIELD_XPATHS = (
        ('locality', "//tr[3]/td/p[1]/span[1]/text()"),
        ('title', '//title/text()'),
        ('title2', "//td/h1/text()"),
        ('website', "//p[2]/a/text()"),
        ('activity', "//tbody/tr[4]/td/p/text()"),
        ('street', "//tr/td/p[1]/span[2]/text()"),
    )

    def parse_items(self, response):
        """Build one ``YellowpagesofmoldovaItem`` from a company page."""
        selector = Selector(response)
        item = YellowpagesofmoldovaItem()
        item['url'] = response.url
        for field, xpath in self._FIELD_XPATHS:
            item[field] = selector.xpath(xpath).extract()
        return item
答案 0（得分：0）：
不应该覆盖 CrawlSpider 的 parse
方法，因为 CrawlSpider 的链接跟踪逻辑（所有“魔法”）都实现在其中。
（参见 Scrapy 官方文档 Crawling rules 一节中关于 parse 方法的警告。）
将def parse
更改为def parse_page
并在您的规则中引用此回调:Rule(SgmlLinkExtractor(allow=('eng.+')), callback='parse_page', follow=True),