I want to crawl a news site and I need to collect the links.
Here is my code:
import scrapy
import codecs
import re
from urlparse import urlparse
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy import Request
from scrapy.selector import HtmlXPathSelector
from hurriyet.items import HurriyetItem
class hurriyet_spider(CrawlSpider):
    name = 'hurriyet'
    start_domains = ['hurriyet.com.tr']
    start_urls = ['http://www.hurriyet.com.tr/']
    rules = (Rule(SgmlLinkExtractor(allow=()), 'parse', follow=True),)

    def start_requests(self):
        return [Request(url, meta={'domain': domain}, callback=self.parse)
                for url, domain in zip(self.start_urls, self.start_domains)]

    def parse_start_url(self, response):
        return self.parse(response)

    def parse(self, response):
        links = response.xpath('//a/@href').extract()
        for link in links:
            if ("http://" in link):
                if ("hurriyet.com" in link):
                    if (".asp" in link):
                        start_urls.append(link)
                    else:
                        print link
                        return self.parse(link)
                    start_urls.append(link)
                else:
                    print link
                    return self.parse(link)

    def news_downloads(self, response):
        image = HurriyetItem()
        image['source'] = link
        image['title'] = response.xpath("//h1[@class = 'title selectionShareable'] | //h1[@itemprop = 'name']/text()").extract()
        image['body'] = response.xpath("//div[@class = 'detailSpot']").extract()
        image['body2'] = response.xpath("//div[@class = 'ctx_content']").extract()
        return image
But it doesn't work, and it says:

    links = response.xpath('//a/@href').extract()
    exceptions.AttributeError: 'unicode' object has no attribute 'xpath'

I tried removing ".extract()" but that didn't help.
Answer (score: 1):
You have two options here: use a CrawlSpider or use the base Spider; right now you are mixing the two together. The solution is to pick one. You can use the base Spider:
from scrapy import Spider

class hurriyet_spider(Spider):
    name = 'hurriyet'
    allowed_domains = ['hurriyet.com.tr']
    start_urls = ['http://www.hurriyet.com.tr/']

    def parse(self, response):
        links = response.xpath('//a/@href').extract()
        for link in links:
            # your code here
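Since the stated goal is just to collect links, one way to fill in that placeholder is to emit one item per matching link. This is only a minimal sketch: it reuses the HurriyetItem from the question (only its source field) and copies the "hurriyet.com" filter from the original code:

from scrapy import Spider
from hurriyet.items import HurriyetItem

class hurriyet_spider(Spider):
    name = 'hurriyet'
    allowed_domains = ['hurriyet.com.tr']
    start_urls = ['http://www.hurriyet.com.tr/']

    def parse(self, response):
        for link in response.xpath('//a/@href').extract():
            # keep only on-site links, mirroring the original filter
            if "hurriyet.com" in link:
                item = HurriyetItem()
                item['source'] = link
                yield item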
Or the CrawlSpider:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

class hurriyet_spider(CrawlSpider):
    name = 'hurriyet'
    allowed_domains = ['hurriyet.com.tr']
    start_urls = ['http://www.hurriyet.com.tr/']
    rules = (Rule(SgmlLinkExtractor(allow=()), callback='parse_item', follow=True),)

    def parse_item(self, response):
        links = response.xpath('//a/@href').extract()
        for link in links:
            # your code here
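With the CrawlSpider variant the Rule already follows every link and calls parse_item with a full Response for each crawled page, so that callback is the natural place to fill in the item from the question. A sketch of that, with the XPath expressions copied from the question and not verified against the current page layout:

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from hurriyet.items import HurriyetItem

class hurriyet_spider(CrawlSpider):
    name = 'hurriyet'
    allowed_domains = ['hurriyet.com.tr']
    start_urls = ['http://www.hurriyet.com.tr/']
    rules = (Rule(SgmlLinkExtractor(allow=()), callback='parse_item', follow=True),)

    def parse_item(self, response):
        # response is a full Response object here, so xpath() works
        item = HurriyetItem()
        item['source'] = response.url
        item['title'] = response.xpath("//h1[@itemprop = 'name']/text()").extract()
        item['body'] = response.xpath("//div[@class = 'detailSpot']").extract()
        item['body2'] = response.xpath("//div[@class = 'ctx_content']").extract()
        return item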
EDIT:
    return self.parse(link)

is the likely source of the error: you are passing a unicode object (the extracted href string) as the argument to parse(), and inside parse() you then call xpath() on that object. Since a unicode object has no xpath() function, you get this error. You have to provide a Response object / Scrapy Selector object instead of a unicode string in order to use xpath().
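Put differently: instead of calling parse() yourself with the extracted string, hand the URL back to Scrapy as a new Request and let the framework call your callback with a Response. A minimal sketch of that change inside the original parse(), reusing the Request import and the news_downloads callback already present in the question:

    def parse(self, response):
        for link in response.xpath('//a/@href').extract():
            if "http://" in link and "hurriyet.com" in link:
                # do NOT call self.parse(link) -- link is just a unicode string;
                # schedule a request instead, so the callback receives a Response
                yield Request(link, callback=self.news_downloads)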