I want to crawl a news site and I need to collect the links.
Here is my code:
import scrapy
import codecs
import re
from urlparse import urlparse
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy import Request
from scrapy.selector import HtmlXPathSelector
from hurriyet.items import HurriyetItem
class hurriyet_spider(CrawlSpider):
    name = 'hurriyet'
    start_domains = ['hurriyet.com.tr']
    start_urls = ['http://www.hurriyet.com.tr/']
    rules = (Rule(SgmlLinkExtractor(allow=()), 'parse', follow=True),)

    def start_requests(self):
        return [Request(url, meta={'domain': domain}, callback=self.parse)
                for url, domain in zip(self.start_urls, self.start_domains)]

    def parse_start_url(self, response):
        return self.parse(response)

    def parse(self, response):
        links = response.xpath('//a/@href').extract()
        for link in links:
            if ("http://" in link):
                if ("hurriyet.com" in link):
                    if (".asp" in link):
                        start_urls.append(link)
                    else:
                        print link
                        return self.parse(link)
                    start_urls.append(link)
                else:
                    print link
                    return self.parse(link)

    def news_downloads(self, response):
        image = HurriyetItem()
        image['source'] = link
        image['title'] = response.xpath("//h1[@class = 'title selectionShareable'] | //h1[@itemprop = 'name']/text()").extract()
        image['body'] = response.xpath("//div[@class = 'detailSpot']").extract()
        image['body2'] = response.xpath("//div[@class = 'ctx_content']").extract()
        return image
But it doesn't work, and it says:

    links = response.xpath('//a/@href').extract()
    exceptions.AttributeError: 'unicode' object has no attribute 'xpath'

I tried removing ".extract()" but that didn't help.
Answer (score: 1):
You have two options here: use a CrawlSpider or use the base Spider; right now you are mixing the two together. The solution is to pick one. You can use the base Spider:
from scrapy import Spider

class hurriyet_spider(Spider):
    name = 'hurriyet'
    allowed_domains = ['hurriyet.com.tr']
    start_urls = ['http://www.hurriyet.com.tr/']

    def parse(self, response):
        links = response.xpath('//a/@href').extract()
        for link in links:
            # your code here
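Since the stated goal is just to collect links, one way to fill in that placeholder is to emit one item per matching link. This is only a minimal sketch: it reuses the HurriyetItem from the question (only its source field) and copies the "hurriyet.com" filter from the original code:

from scrapy import Spider
from hurriyet.items import HurriyetItem

class hurriyet_spider(Spider):
    name = 'hurriyet'
    allowed_domains = ['hurriyet.com.tr']
    start_urls = ['http://www.hurriyet.com.tr/']

    def parse(self, response):
        for link in response.xpath('//a/@href').extract():
            # keep only on-site links, mirroring the original filter
            if "hurriyet.com" in link:
                item = HurriyetItem()
                item['source'] = link
                yield item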
Or the CrawlSpider:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

class hurriyet_spider(CrawlSpider):
    name = 'hurriyet'
    allowed_domains = ['hurriyet.com.tr']
    start_urls = ['http://www.hurriyet.com.tr/']
    rules = (Rule(SgmlLinkExtractor(allow=()), callback='parse_item', follow=True),)

    def parse_item(self, response):
        links = response.xpath('//a/@href').extract()
        for link in links:
            # your code here
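With the CrawlSpider variant the Rule already follows every link and calls parse_item with a full Response for each crawled page, so that callback is the natural place to fill in the item from the question. A sketch of that, with the XPath expressions copied from the question and not verified against the current page layout:

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from hurriyet.items import HurriyetItem

class hurriyet_spider(CrawlSpider):
    name = 'hurriyet'
    allowed_domains = ['hurriyet.com.tr']
    start_urls = ['http://www.hurriyet.com.tr/']
    rules = (Rule(SgmlLinkExtractor(allow=()), callback='parse_item', follow=True),)

    def parse_item(self, response):
        # response is a full Response object here, so xpath() works
        item = HurriyetItem()
        item['source'] = response.url
        item['title'] = response.xpath("//h1[@itemprop = 'name']/text()").extract()
        item['body'] = response.xpath("//div[@class = 'detailSpot']").extract()
        item['body2'] = response.xpath("//div[@class = 'ctx_content']").extract()
        return item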
EDIT:
    return self.parse(link)

is the likely source of the error: you are passing a unicode object (the extracted href string) as the argument to parse(), and inside parse() you then call xpath() on that object. Since a unicode object has no xpath() function, you get this error. You have to provide a Response object / Scrapy Selector object instead of a unicode string in order to use xpath().
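Put differently: instead of calling parse() yourself with the extracted string, hand the URL back to Scrapy as a new Request and let the framework call your callback with a Response. A minimal sketch of that change inside the original parse(), reusing the Request import and the news_downloads callback already present in the question:

    def parse(self, response):
        for link in response.xpath('//a/@href').extract():
            if "http://" in link and "hurriyet.com" in link:
                # do NOT call self.parse(link) -- link is just a unicode string;
                # schedule a request instead, so the callback receives a Response
                yield Request(link, callback=self.news_downloads)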