Scrapy: parse only pages that have a meta noindex

Date: 2014-02-12 00:58:31

Tags: python web-crawler scrapy

I am trying to crawl a website and parse only the pages that have a meta noindex tag. What happens is that the crawler crawls the first level but finishes after the first page; it does not seem to follow the links. Here is my code:

from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

from wallspider.items import Website


class mydomainSpider(CrawlSpider):
    name = "0resultsTest"
    allowed_domains = ["www.mydomain.com"]
    start_urls = ["http://www.mydomain.com/cp/3944"]

    rules = (
        Rule(SgmlLinkExtractor(allow=(), deny=()), callback="parse_items", follow=True),
    )

    def _response_downloaded(self, response):
        sel = HtmlXPathSelector(response)
        if sel.xpath('//meta[@content="noindex"]'):
            return super(mydomainSpider, self).parse_items(response)
        return

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//html')
        items = []

        for site in sites:
            item = Website()
            item['url'] = response.url
            item['referer'] = response.request.headers.get('Referer')
            item['title'] = site.xpath('/html/head/title/text()').extract()
            item['robots'] = site.select('//meta[@name="robots"]/@content').extract()
            items.append(item)

        yield items

2 Answers:

Answer 0 (score: 1)

The original _response_downloaded calls the _parse_response function, which, in addition to invoking the callback, also follows links. From the Scrapy source code:

def _parse_response(self, response, callback, cb_kwargs, follow=True):
    if callback:
        cb_res = callback(response, **cb_kwargs) or ()
        cb_res = self.process_results(response, cb_res)
        for requests_or_item in iterate_spider_output(cb_res):
            yield requests_or_item

    if follow and self._follow_links:
        for request_or_item in self._requests_to_follow(response):
            yield request_or_item

You could add the link-following part to your override, though I don't think that's the best way to do it (the leading _ may be hinting at that). Why not check for the meta tag at the beginning of your parse_items function instead? If you don't want to repeat that test, you could even write a Python decorator.
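
For reference, here is a minimal sketch of such a decorator. The name noindex_only and the XPath test are assumptions that mirror the check from the question, written against the same Scrapy 0.x selector API the question uses:

from functools import wraps

from scrapy.selector import HtmlXPathSelector


def noindex_only(callback):
    """Run the wrapped callback only for pages carrying a noindex meta tag."""
    @wraps(callback)
    def wrapper(self, response):
        hxs = HtmlXPathSelector(response)
        # No noindex marker: return an empty result so no items are
        # produced, while CrawlSpider still follows the page's links.
        if not hxs.select('//meta[@content="noindex"]'):
            return []
        return callback(self, response)
    return wrapper

Decorating parse_items with @noindex_only would then keep the meta check separate from the parsing logic itself.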

Answer 1 (score: 0)

I believe that checking for the meta tag at the beginning of my parse_items, as @Guy Gavriely suggested, will be my best option. I will test the code below to see.

from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

from wallspider.items import Website


class mydomainSpider(CrawlSpider):
    name = "0resultsTest"
    allowed_domains = ["www.mydomain.com"]
    start_urls = ["http://www.mydomain.com/cp/3944"]

    rules = (
        Rule(SgmlLinkExtractor(allow=(), deny=()), callback="parse_items", follow=True),
    )

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//html')
        items = []

        if hxs.xpath('//meta[@content="noindex"]'):
            for site in sites:
                item = Website()
                item['url'] = response.url
                item['referer'] = response.request.headers.get('Referer')
                item['title'] = site.xpath('/html/head/title/text()').extract()
                item['robots'] = site.select('//meta[@name="robots"]/@content').extract()
                items.append(item)

            yield items

Update with working code: I needed to return the items instead of yielding them:

from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

from wallspider.items import Website


class mydomainSpider(CrawlSpider):
    name = "0resultsTest"
    allowed_domains = ["www.mydomain.com"]
    start_urls = ["http://www.mydomain.com/cp/3944"]

    rules = (
        Rule(SgmlLinkExtractor(allow=(), deny=()), callback="parse_items", follow=True),
    )

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//html')
        items = []

        if hxs.xpath('//meta[@content="noindex"]'):
            for site in sites:
                item = Website()
                item['url'] = response.url
                item['referer'] = response.request.headers.get('Referer')
                item['title'] = site.xpath('/html/head/title/text()').extract()
                item['robots'] = site.select('//meta[@name="robots"]/@content').extract()
                items.append(item)

            return items
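
A side note on why yield items failed: Scrapy expects every object yielded from a callback to be an Item or a Request, not a list, whereas a returned list is simply iterated item by item. Yielding each item individually works just as well, as in this sketch of a drop-in replacement for the parse_items method in the spider above:

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        if hxs.select('//meta[@content="noindex"]'):
            for site in hxs.select('//html'):
                item = Website()
                item['url'] = response.url
                item['referer'] = response.request.headers.get('Referer')
                item['title'] = site.select('/html/head/title/text()').extract()
                item['robots'] = site.select('//meta[@name="robots"]/@content').extract()
                # yield one Website item at a time instead of a list
                yield item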