I want to capture the anchor text of referring links. How can I get the anchor text of the incoming link from the referring URL?
Thanks for your time!
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from wallspider.items import Website


class mydomainSpider(CrawlSpider):
    name = "mydomain"
    allowed_domains = ["www.mydomain"]
    start_urls = ["http://www.mydomain/cp/133162",]

    rules = (
        Rule(SgmlLinkExtractor(allow=('133162',),
                               deny=('/ip/', 'search_sort=', 'ic=60_0',
                                     'customer_rating', 'special_offers',)),
             callback="parse_items", follow=True),
    )

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//*')
        items = []
        for site in sites:
            item = Website()
            item['referer'] = response.request.headers.get('Referer')
            item['url'] = response.url
            item['title'] = site.xpath('/html/head/title/text()').extract()
            item['description'] = site.select('//meta[@name="Description"]/@content').extract()
            items.append(item)
        return items
Update: based on the suggestions from you guys, here is my new code:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from wallspider.items import Website
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor


class anchorspider(CrawlSpider):
    name = "anchor"
    allowed_domains = ["mydomain.com"]
    start_urls = ["http://www.mydomain.com/"]

    extractor = SgmlLinkExtractor()

    rules = (
        Rule(SgmlLinkExtractor(allow=('133162',),
                               deny=('/ip/', 'search_sort=', 'ic=60_0',
                                     'customer_rating', 'special_offers',)),
             callback="parse_items", follow=True),
    )

    def parse_start_url(self, response):
        list(self.parse_links(response))

    def parse_links(self, response):
        hxs = HtmlXPathSelector(response)
        links = hxs.select('//a')
        for link in links:
            anchor_text = ''.join(link.select('./text()').extract())
            title = ''.join(link.select('./@title').extract())
            url = ''.join(link.select('./@href').extract())
            meta = {'title': title,}
            meta = {'anchor_text': anchor_text,}
            yield Request(url, callback=self.parse_page, meta=meta,)

    def parse_page(self, response):
        hxs = HtmlXPathSelector(response)
        item = Website()
        item['anchor_text'] = response.meta['anchor_text']
        item['url'] = response.url
        item['title'] = response.meta['title']
        item['referer'] = response.request.headers.get('Referer')
        item['description'] = site.select('//meta[@name="Description"]/@content').extract()
        return item
I'm getting the following error: raise ValueError('Missing scheme in request url: %s' % self._url)
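That ValueError comes from Request itself: it refuses any URL that has no scheme, which is exactly what a relative href such as /cp/133162 looks like once it has been pulled straight out of ./@href. A tiny illustration of the failure (the path value here is made up):

from scrapy.http import Request

# A relative URL has no "http://" scheme, so the constructor rejects it:
Request('/cp/133162')  # ValueError: Missing scheme in request url: /cp/133162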
Answer 0 (score: 4)
There is actually response.meta.get('link_text') available on the response object.
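For illustration, a minimal sketch of a CrawlSpider reading that value. This assumes a Scrapy release recent enough that CrawlSpider passes the extracted link's text along in the request meta, and it uses the modern module paths rather than the scrapy.contrib ones from the question; the domain and URLs are placeholders:

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class AnchorTextSpider(CrawlSpider):
    name = "anchortext"
    allowed_domains = ["example.com"]          # placeholder domain
    start_urls = ["http://www.example.com/"]

    rules = (
        Rule(LinkExtractor(), callback="parse_items", follow=True),
    )

    def parse_items(self, response):
        yield {
            # text of the <a> element whose link led to this page
            "anchor_text": (response.meta.get("link_text") or "").strip(),
            "url": response.url,
            "referer": response.request.headers.get("Referer"),
        }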
Answer 1 (score: 2)
I'm afraid it is too late to get the anchor text at this stage; instead, you have to hook into the link-extraction stage, using something like this answer:
def parse_start_url(self, response):
    list(self.parse_links(response))

def parse_links(self, response):
    hxs = HtmlXPathSelector(response)
    links = hxs.select('//a')
    for link in links:
        anchor_text = ''.join(link.select('./text()').extract())
        meta = {'anchor_text': anchor_text,}
        ...
        yield Request(url, callback=self.parse_items, meta=meta,)

def parse_items(self, response):
    item['anchor_text'] = response.meta['anchor_text']
    yield items ...
Note the use of meta data in the request.
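Filled out for the setup in the question, a hedged sketch of this approach could look like the following. It keeps the anchor text and title in a single meta dict, and it joins each href against the page URL with urlparse.urljoin so that relative links become absolute (which is also what the "Missing scheme in request url" error in the question is complaining about). Website is the asker's item class and is assumed to have the fields used below; the urlparse import assumes the Python 2 / scrapy.contrib-era setup of the original code:

import urlparse

from scrapy.contrib.spiders import CrawlSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from wallspider.items import Website


class AnchorSpider(CrawlSpider):
    name = "anchor"
    allowed_domains = ["mydomain.com"]
    start_urls = ["http://www.mydomain.com/"]

    def parse_start_url(self, response):
        # CrawlSpider calls this for the start URLs; return the requests
        # built from the links on that page so they actually get scheduled.
        return list(self.parse_links(response))

    def parse_links(self, response):
        hxs = HtmlXPathSelector(response)
        for link in hxs.select('//a'):
            href = ''.join(link.select('./@href').extract())
            if not href:
                continue                                  # skip anchors without an href
            url = urlparse.urljoin(response.url, href)    # make relative hrefs absolute
            meta = {
                'anchor_text': ''.join(link.select('./text()').extract()),
                'title': ''.join(link.select('./@title').extract()),
            }
            yield Request(url, callback=self.parse_page, meta=meta)

    def parse_page(self, response):
        hxs = HtmlXPathSelector(response)
        item = Website()
        item['anchor_text'] = response.meta['anchor_text']
        item['title'] = response.meta['title']
        item['url'] = response.url
        item['referer'] = response.request.headers.get('Referer')
        item['description'] = hxs.select('//meta[@name="Description"]/@content').extract()
        yield item
        # keep crawling: also schedule the links found on this page
        # (allowed_domains keeps the crawl on mydomain.com)
        for request in self.parse_links(response):
            yield request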
Answer 2 (score: 2)
The Referer field in the HTTP request headers is not set automatically by Scrapy. Whether to add a Referer field to your request headers is up to you, and only if you have set the Referer field on each Request object can you then read it back the way you do in your code.
Referring to the Request doc, this is what you should do when yielding a Request object:
def parse_xxx(self, response):
    ......  # some other operations
    headers = {
        'Referer': response.url,
    }
    ......  # some other operations
    yield Request(..., headers=headers, ...)
If you want to add a Referer field to the initial URLs as well, you can use the start_requests method to generate the Request objects instead of relying on the start_urls variable. Here is the doc.
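For example, a minimal sketch of such a start_requests method on the spider above (the Referer value is only a placeholder; use whatever you want the very first requests to report):

def start_requests(self):
    # Build the initial requests by hand instead of relying on start_urls,
    # so that even the very first requests carry a Referer header.
    for url in self.start_urls:
        yield Request(url, headers={'Referer': 'http://www.mydomain.com/'})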