I am new to Python. I am trying to print (and save) all of the blog posts on a website using scrapy. I want the spider to crawl only within the main content section. Here is my code:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from people.items import PeopleCommentItem

class people(CrawlSpider):
    name = "people"
    allowed_domains = ["http://blog.sina.com.cn/"]
    start_urls = ["http://blog.sina.com.cn/s/blog_53d7b5ce0100e7y0.html"]
    rules = [Rule(SgmlLinkExtractor(allow=("http://blog.sina.com.cn/",)), callback='parse_item', follow=True),
             #restrict the crawling in the articalContent section only
             Rule(SgmlLinkExtractor(restrict_xpaths=('//div[@class="articalContent "]//a/@href')))
             ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        print hxs.select('//div[@class="articalContent "]//a/text()').extract()
Nothing is printed afterwards, only this log output:
DEBUG: Crawled (200) <GET http://blog.sina.com.cn/s/blog_53d7b5ce0100e7y0.html> (referer: None)
ScrapyDeprecationWarning: scrapy.selector.HtmlXPathSelector is deprecated, instantiate scrapy.Selector instead.
hxs=HtmlXPathSelector(response)
ScrapyDeprecationWarning: Call to deprecated function select. Use .xpath() instead.
titles= hxs.select('//div[@class="articalContent "]//a/text()').extract()
2015-03-09 15:46:47-0700 [people] INFO: Closing spider (finished)
Can anyone point out the error?
Thanks!
Answer 0 (score: 2)
I had some success with this:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from scrapy.http import Request

class people(CrawlSpider):
    name = "people"
    allowed_domains = ["http://blog.sina.com.cn/"]
    start_urls = ["http://blog.sina.com.cn/s/blog_53d7b5ce0100e7y0.html"]
    rules = (Rule(SgmlLinkExtractor(allow=("http://blog.sina.com.cn/",)), callback='parse_item', follow=True),
             #restrict the crawling in the articalContent section only
             Rule(SgmlLinkExtractor(restrict_xpaths=('//div[contains(@class, "articalContent")]'))),
             )

    def parse(self, response):
        links = Selector(text=response.body).xpath('//div[contains(@class, "articalContent")]//a//text()')
        for link in links:
            print link.extract()
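
As a side note on the deprecation warnings in the log: on more recent Scrapy releases (roughly 1.0 and later) SgmlLinkExtractor and HtmlXPathSelector are deprecated or removed, and allowed_domains is expected to hold bare domain names rather than full URLs. Below is a minimal, untested sketch of how the same spider might look on the current API, using LinkExtractor and response.xpath(); the parse_item callback name is just a placeholder, chosen because CrawlSpider uses parse() internally and overriding it interferes with the rules.

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class PeopleSpider(CrawlSpider):
    name = "people"
    # allowed_domains takes domain names, not URLs
    allowed_domains = ["blog.sina.com.cn"]
    start_urls = ["http://blog.sina.com.cn/s/blog_53d7b5ce0100e7y0.html"]
    rules = (
        # only follow links found inside the articalContent div
        Rule(LinkExtractor(restrict_xpaths='//div[contains(@class, "articalContent")]'),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        # response.xpath() replaces HtmlXPathSelector/.select()
        for text in response.xpath('//div[contains(@class, "articalContent")]//a//text()').extract():
            print(text)

You would run it with scrapy crawl people as usual; since restrict_xpaths already confines the link extractor to the article body, a separate Rule for that is not needed.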