There is a website that contains links to other web pages. I want to follow those links and extract the text of each linked article.
To do this, I wrote the following parser:
def start_requests(self):
    date = " "
    for url in self.urls:
        yield Request(url=url, meta={'date': str(url)[-15:-5]}, callback=self.parse)

def parse(self, response):
    sel = Selector(response)
    sites = sel.xpath('//*[@id="content"]/div[2]/div/div/div[1]')
    passed_date = response.meta.get('date')
    items = []
    for site in sites:
        item = DmozItem()
        # item['title'] = site.xpath('.//div/a/text()').extract()
        item['link'] = str(site.xpath('.//a/@href').extract()).replace("u'", "")
        item['link'] = item['link'].replace("'", "")
        # item['time'] = site.xpath('.//div/text()').extract()
        # item['date'] = passed_date
        self.all_articles.append(item)
        item['text'] = [Request(url=link_of_the_article, callback=self.get_text_of_the_article) for link_of_the_article in item['link'].split(',')]
    return items

def get_text_of_the_article(self, response):
    sel = Selector(response)
    article_text = ""
    item = DmozItem()
    item['text'] = sel.xpath('//*[@id="articleText"]/p[1]').extract()
    return items.append(item)
However, when I print item['text'], the result is:
{[<GET %20http://www.domain.com/article/2014/01/06/libya-economy-idUSL6N0KD2L320140106%5D>]}
How can I get the text between specific tags?
Answer 0 (score: 2)
The reason you are seeing:
{[<GET %20http://www.domain.com/article/2014/01/06/libya-economy-idUSL6N0KD2L320140106%5D>]}
is that you are storing the Request objects in item['text'] instead of yielding them. Requests that are only stored inside an item are never scheduled by Scrapy, so printing that field just shows their repr rather than the article text.
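For illustration, here is a minimal, self-contained sketch (with a made-up URL) of what stringifying a stored-but-never-yielded list of requests produces:

from scrapy.http import Request

# Hypothetical URL, used only to show the repr you get when Request
# objects are stored in an item field instead of being yielded.
reqs = [Request(url='http://www.example.com/article/1')]
print(str(reqs))  # [<GET http://www.example.com/article/1>]  <- the request's repr, not the page text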
Here is what you should do instead; the explanation is in the inline comments.
def parse(self, response):
    sel = Selector(response)
    sites = sel.xpath('//*[@id="content"]/div[2]/div/div/div[1]')
    passed_date = response.meta.get('date')
    items = []
    for site in sites:
        item = DmozItem()
        # No need to replace u'. It is an internal representation for unicode
        # strings. It won't affect your data.
        links = site.xpath('.//a/@href').extract()
        self.all_articles.append(item)
        # For each link, yield a Request and send the item object along with it
        for link in links:
            yield Request(url=link, callback=self.get_text_of_the_article, meta={'item': item})

def get_text_of_the_article(self, response):
    sel = Selector(response)
    # Get the item object sent from the parse method
    item = response.request.meta['item']
    # You can save the link url here
    item['link'] = response.url
    item['text'] = sel.xpath('//*[@id="articleText"]/p[1]').extract()
    return item
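For completeness, below is a minimal sketch of the DmozItem that the spider assumes. The field names are taken from the question's code; the declaration itself is an assumed items.py, not necessarily the asker's actual one.

from scrapy.item import Item, Field

class DmozItem(Item):
    # Assumed item definition; field names come from the question's code.
    title = Field()
    link = Field()
    time = Field()
    date = Field()
    text = Field()

Passing the item through meta={'item': item} is what ties the two callbacks together: each article request carries its own item, and get_text_of_the_article fills in the remaining fields before returning the item to the pipeline.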