如何修改当前代码,使其返回链接的文本(即<a>标签中的文字),而不是整个链接?下面是我目前返回整个链接的方式,但我只想要<a>标签中的文字:
# Current behaviour: the item's 'Url' field is set to the full URL of the response.
item ['Url'] = response.url
例如,对于链接 http://international.southwales.ac.uk/country/iran/en/ ,我希望得到的是'伊朗'(Iran)。
from scrapy.spider import BaseSpider
from project.items import QualificationItem
from scrapy.selector import HtmlXPathSelector
from scrapy.http.request import Request
from urlparse import urljoin
# Firefox-on-Linux User-Agent string; the spider passes it as the
# 'User-Agent' header on the requests it issues for followed links.
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'
class recursiveSpider(BaseSpider):
    """Follow the links listed on the countries index page and scrape each one.

    ``parse`` collects the hrefs from the index page and schedules one request
    per link; ``parse_linkpage`` builds a ``QualificationItem`` from every
    page that was followed.
    """

    name = 'usw1'
    # Each domain must be a separate list element. Without the comma Python
    # concatenates the adjacent string literals into a single bogus domain.
    # Entries are bare host names, so the trailing slash is dropped as well.
    allowed_domains = [
        'international.southwales.ac.uk',
        'eu.southwales.ac.uk',
    ]
    start_urls = ['http://international.southwales.ac.uk/countries']

    def parse(self, response):
        """Yield one Request per link found in the index page's link list.

        Each request is routed to ``parse_linkpage``.
        """
        hxs = HtmlXPathSelector(response)
        xpath = '/html/body/div[1]/div[4]/div[2]/ul/li/a/@href'
        for link in hxs.select(xpath).extract():
            # hrefs may be relative, so resolve them against the page URL.
            yield Request(urljoin(response.url, link),
                          headers={'User-Agent': USER_AGENT},
                          callback=self.parse_linkpage,
                          dont_filter=True)

    def parse_linkpage(self, response):
        """Build a QualificationItem from a followed page.

        ``Qualification`` receives the first ``<li>`` under the
        "Entry Requirements - Undergraduate" heading (as a list of at most one
        element), and ``Url`` receives the URL of the page.
        """
        hxs = HtmlXPathSelector(response)
        item = QualificationItem()
        xpath = """
                //h4[normalize-space(.)="Entry Requirements - Undergraduate"]
                /following-sibling::ul/li
                """
        # Slicing instead of indexing keeps this safe when nothing matches:
        # the result is then an empty list rather than an IndexError.
        item['Qualification'] = hxs.select(xpath).extract()[0:1]
        item['Url'] = response.url
        return item
答案 0 :(得分:2)
这可以通过在创建Request时使用meta
参数来实现。Scrapy文档(Request.meta)中对此有说明。
将parse
方法更改为:
def parse(self, response):
    """Yield one Request per link, carrying the link's text in ``meta``.

    The href and the text are read from the same ``<a>`` node. Zipping two
    independent extractions (all texts, all hrefs) mis-pairs them as soon as
    one anchor has no text node or has several. The text is then available in
    the callback as ``response.meta['a_of_the_link']``.
    """
    hxs = HtmlXPathSelector(response)
    for anchor in hxs.select('/html/body/div[1]/div[4]/div[2]/ul/li/a'):
        hrefs = anchor.select('@href').extract()
        if not hrefs:
            # An <a> without an href has nothing to follow.
            continue
        # Join every text node of this anchor and trim surrounding whitespace
        # so the callback receives a single clean label.
        text = ''.join(anchor.select('text()').extract()).strip()
        yield Request(urljoin(response.url, hrefs[0]),
                      meta={'a_of_the_link': text},
                      headers={'User-Agent': USER_AGENT},
                      callback=self.parse_linkpage,
                      dont_filter=True)
然后您可以在parse_linkpage
方法中这样读取它:
# Store the link text that parse() attached to the request via meta,
# rather than the URL of the page.
item['Url'] = response.meta['a_of_the_link']
希望这有帮助