<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<entry>
<name>Some Author</name>
<category term="sports" label="Sports" />
<content type="html">This is the news text.</content>
<link href="http://thenews.com/article/123abc/comments" />
<title>The Title</title>
</entry>
<entry>
<name>Some other Author</name>
<category term="sports" label="Sports" />
<content type="html">This is another news text.</content>
<link href="http://thenews.com/article/123abd/comments" />
<title>The other Title</title>
</entry>
</feed>
现在我想把 <link href="http://thenews.com/article/123abc/comments" /> 替换为该 URL 所指向的内容。在 URL 末尾添加 /rss 即可获取对应的 RSS 提要。所以最后，单个条目看起来像这样：
<name>Some Author</name>
<category term="sports" label="Sports" />
<content type="html">This is the news text.</content>
<author>A commenter</author>
<text>Cool story, yo!</text>
<author>Another commenter</author>
<text>This is interesting news.</text>
<title>The Title</title>
我对任何编程语言都持开放态度。我用 Python 和 lxml 尝试过，但没能走多远：我能够提取评论网址并下载评论 Feed，但无法替换实际的 <link> 标签。在不下载实际 RSS 的情况下，这是我目前的进展：
import lxml.etree as et
import urllib2
import re
# Work-in-progress script: walk every <entry> of the Atom news feed and pick
# up the URL of its comments feed.  The inputs below are placeholders; they
# will be downloaded from the RSS feed source once the code works.
xmltext = """[The above news feed, too long to paste]"""
commentsRSS = """[The above comments feed]"""
# Browser-like User-Agent header to send with the comment-feed requests.
hdr = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'),
}
# The feed declares the Atom namespace as its default namespace, so every
# element lives in it.  XPath has no notion of a default namespace: an
# unprefixed step such as "feed" only matches elements in *no* namespace.
# Bind an explicit prefix and use it in every step.
atom_ns = {'atom': 'http://www.w3.org/2005/Atom'}
article = et.fromstring(xmltext)
for elem in article.xpath('//atom:feed/atom:entry', namespaces=atom_ns):
    # List of href attribute values of the entry's <link> children.
    commentsURL = elem.xpath('atom:link/@href', namespaces=atom_ns)
    # Real download, disabled while working with the placeholder feed:
    # request = urllib2.Request(commentsURL[0] + '.rss', headers=hdr)
    # comments = urllib2.urlopen(request).read()
    comments = commentsRSS
    # TODO: replace the <link> tag with the comments feed (without its
    # <?xml ...> declaration).
答案 0 :(得分:1)
# Replace every entry's <link> element with the root element of the comments
# feed that the link points to, then print the merged document.
article = et.fromstring(xmltext)
# The feed uses the Atom default namespace; XPath needs an explicit prefix
# bound to it, otherwise the element names do not match.
ns = {'d': 'http://www.w3.org/2005/Atom'}
for elem in article.xpath('//d:feed/d:entry/d:link', namespaces=ns):
    request = urllib2.Request(elem.attrib['href'] + '.rss', headers=hdr)
    response = urllib2.urlopen(request)
    try:
        comments = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()
    # fromstring() returns only the root element, so the downloaded feed's
    # <?xml ...?> declaration is not carried into the article.
    newElem = et.fromstring(comments)
    # In lxml the text following an element (its tail) belongs to that
    # element; copy it over so the whitespace after <link> is preserved.
    newElem.tail = elem.tail
    elem.getparent().replace(elem, newElem)
# print the result
print(et.tostring(article))