我想构建一个带有下载链接的自定义 RSS。
但是该 RSS feed 只提供指向整季页面的链接。
我打开这个指向整季页面的链接,希望从中提取某一集本身的下载链接(uploaded / ul)。
这是我目前写出的代码。有什么可行的办法吗?!
import feedparser, urllib2, re
from BeautifulSoup import BeautifulSoup
# NOTE(review): this snippet is Python 2 (print statements, urllib2) and its
# indentation was lost, so the loop nesting described below is inferred from
# statement order only -- confirm against the original post.

# Series name fragments; a feed entry is kept only if its title contains one.
episodenliste = ['Game.of.Thrones','Arrow']
# Parallel lists filled in the feed loop: season-page link and cleaned title.
episode_link = []
episode_title = []
d = feedparser.parse('http://serienjunkies.org/xml/feeds/episoden.xml')
for post in d.entries:
# Keep entries whose title contains 'DEUTSCH', one of the series names and '720p'.
if ('DEUTSCH' in post.title) and any (word in post.title for word in episodenliste) and ('720p' in post.title):
post.title = post.title.replace('[DEUTSCH] ','')
# Cut the title after its last SxxEyy token; everything behind it is dropped.
post.title = re.sub(r'(.*S\d+E\d+)(.*)',r'\1' ,post.title)
episode_link.append(post.link)
episode_title.append(post.title)
print post.title + ": " + post.link + "\n"
# NOTE(review): as written this appears to pair every title with every link
# (one page download per combination) rather than each title with its own
# link -- confirm the intended nesting.
for search_title in episode_title:
for get_dlLink in episode_link:
page_ = urllib2.Request(get_dlLink)
page = urllib2.urlopen(page_).read()
soup = BeautifulSoup(page)
print search_title
# NOTE(review): search_title was truncated after the SxxEyy token above, so it
# is presumably shorter than the full release name shown on the page; a text
# lookup with the shortened title may be why nothing is found -- verify.
title = soup.find('strong', text=search_title)
if title is not None:
print title
# Unfinished draft kept commented out by the author: walk from the title to
# its parent, list the <a> tags and filter their href values by a pattern.
# `ul` is not defined anywhere in the visible code.
# link = title.parent
# links = link.find_all('a')
# print links
# for link2 in links:
# url = link2['href']
# print url
# pattern = 'http:\/\/download\.serienjunkies\.org.*%s_.*\.html' % ul
# if re.match(pattern, url):
# print url
据我判断,问题出在我在网页上搜索标题的这一步。
程序能够打开从 RSS 解析出的页面,但找不到标题。
我的想法是:
先找到标题,然后从它的子元素中提取链接。
感谢任何帮助,提前谢谢!
答案 0 :(得分:1)
在未启用 JavaScript 的情况下,HTML 看起来完全不同:
<p><strong>Arrow.S02E14.Gegen.die.Zeit.German.DD51.Dubbed.DL.720p.iTunesHD.AVC-TVS</strong><br>
<strong>Download:</strong> <a target="_blank" href="http://download.serienjunkies.org/f-55bc328624d93658/fm_tvs-arrow-dd51-ded-dl-7p-ithd-avc-214.html">hier</a> | filemonkey.in<br>
<strong>Download:</strong> <a target="_blank" href="http://download.serienjunkies.org/f-25023a87144345f9/so_tvs-arrow-dd51-ded-dl-7p-ithd-avc-214.html">hier</a> | share-online.biz<br>
<strong>Download:</strong> <a target="_blank" href="http://download.serienjunkies.org/f-3e8ea978a2cf7bda/ul_tvs-arrow-dd51-ded-dl-7p-ithd-avc-214.html">hier</a> | uploaded.to</p>
由于 RSS 源中的标题在去掉 [DEUTSCH]
前缀之后,正是剧集页面上某个段落中的第一段文本,因此可以用它作为搜索并提取条目的依据。从该文本向上两层的元素是 <p>
标记,其中包含该集的所有数据,即各个下载链接,每个链接后面跟着文件托管站点的名称。
import feedparser
import requests
from bs4 import BeautifulSoup
# Feed URL that main() hands to process_feed().
FEED_URL = 'http://serienjunkies.org/xml/feeds/episoden.xml'
def is_interesting_entry(entry, title_prefix, series_names):
    """Tell whether a feed entry should be processed.

    An entry qualifies when its ``title`` starts with *title_prefix* and
    contains at least one of the strings in *series_names*.

    :param entry: object exposing a ``title`` string attribute.
    :param title_prefix: prefix the title has to start with.
    :param series_names: iterable of substrings, one of which must occur
        in the title.
    :returns: ``True`` if both conditions hold, ``False`` otherwise.
    """
    title = entry.title
    if not title.startswith(title_prefix):
        return False
    return any(name in title for name in series_names)
def process_entry(entry, title_prefix):
    """Split a feed entry into its prefix-free title and its link.

    :param entry: object exposing ``title`` and ``link`` attributes.
    :param title_prefix: prefix expected at the start of ``entry.title``.
    :returns: tuple ``(title without the prefix, entry.link)``.
    :raises ValueError: if the title does not start with *title_prefix*.
    """
    full_title = entry.title
    if full_title.startswith(title_prefix):
        return (full_title[len(title_prefix):], entry.link)
    message = 'expected prefix {0!r} not found in {1!r}'.format(
        title_prefix, full_title
    )
    raise ValueError(message)
def process_feed(feed_url, title_prefix, series_names):
    """Produce ``(title, link)`` pairs for the interesting feed entries.

    The feed is parsed when this function is called; filtering with
    ``is_interesting_entry`` and conversion with ``process_entry`` happen
    lazily while the returned generator is consumed.

    :param feed_url: URL passed to ``feedparser.parse``.
    :param title_prefix: prefix required on, and stripped from, each title.
    :param series_names: substrings, one of which a title must contain.
    :returns: generator of ``(title, link)`` tuples.
    """
    entries = feedparser.parse(feed_url).entries
    wanted = (
        candidate for candidate in entries
        if is_interesting_entry(candidate, title_prefix, series_names)
    )
    return (process_entry(candidate, title_prefix) for candidate in wanted)
# Process-wide memo of parsed series pages, keyed by URL.  It replaces the
# former mutable default argument that served as a hidden global cache.
_SERIES_SOUP_CACHE = {}


def get_series_soup(url, cache=None):
    """Return the parsed page for *url*, downloading it at most once.

    :param url: address of the series page.
    :param cache: optional mapping used to memoize results; when omitted,
        a module-level cache shared by all calls is used.
    :returns: the ``BeautifulSoup`` object stored in the cache for *url*.

    NOTE(review): no parser is named in the ``BeautifulSoup`` call and the
    HTTP status is not checked; both are left as in the original.
    """
    # Compare against None so that an explicitly passed empty dict is used.
    if cache is None:
        cache = _SERIES_SOUP_CACHE
    if url not in cache:
        cache[url] = BeautifulSoup(requests.get(url).text)
    return cache[url]
def get_download_urls(soup, title):
    """Collect the download links listed for one episode title.

    The text node equal to *title* is looked up in *soup*; the element two
    levels above it is then searched for ``<a>`` tags.  For each tag the
    text following it (stripped of ``'|'`` and spaces) becomes the key and
    the tag's ``href`` the value.

    :param soup: parsed series page.
    :param title: exact episode title text to search for.
    :returns: dict mapping hoster name to download URL; empty if the title
        is not found.
    """
    title_node = soup.find(text=title)
    if not title_node:
        return {}
    episode_block = title_node.parent.parent
    hoster2url = {}
    for anchor in episode_block('a'):
        hoster = anchor.next_sibling.strip('| ')
        hoster2url[hoster] = anchor['href']
    return hoster2url
def main():
    """Print the download links of every interesting episode in the feed.

    For each ``(title, url)`` pair produced by ``process_feed`` a blank
    line and the title are printed, followed by one right-aligned
    ``hoster: url`` line per download sorted by hoster name, or a
    placeholder line when no downloads were found.
    """
    series_names = ['Game.of.Thrones', 'Arrow']
    for title, url in process_feed(FEED_URL, '[DEUTSCH] ', series_names):
        # Single-argument print(...) calls produce the same output under
        # Python 2's print statement and Python 3's print function.
        print('')
        print(title)
        hoster2url = get_download_urls(get_series_soup(url), title)
        if not hoster2url:
            print(' --- No downloads ---')
            continue
        # items() exists on both Python 2 and 3; iteritems() only on 2.
        for hoster, download_url in sorted(hoster2url.items()):
            print('{0:>20s}: {1}'.format(hoster, download_url))


if __name__ == '__main__':
    main()
答案 1 :(得分:0)
<item>
<title>[DEUTSCH] Arrow.S02E14.Gegen.die.Zeit.GERMAN.DUBBED.720p.HDTV.x264-ZZGtv</title>
<description>[DEUTSCH] Arrow.S02E14.Gegen.die.Zeit.GERMAN.DUBBED.720p.HDTV.x264-ZZGtv</description>
<pubDate>Fri, 18 Jul 2014 00:00:00 +0200</pubDate>
<link>http://serienjunkies.org/arrow/arrow-staffel-2-hdtvweb-dl-sd720p1080p/</link>
</item>
抱歉,不知道
<p><strong>Arrow.S02E14.Gegen.die.Zeit.German.DD51.Dubbed.DL.720p.iTunesHD.AVC-TVS</strong><br><div id="download_mirrors" class="download_main"><strong>Download:</strong> <a href="http://download.serienjunkies.org/f-3e8ea978a2cf7bda/ul_tvs-arrow-dd51-ded-dl-7p-ithd-avc-214.html" target="_blank" style="font-size:14px;font-weight:bold;">uploaded.net</a> <span style="font-size:10px">(best speed) </span><br><strong style="margin-left:14px">Mirrors:</strong> <img src="http://serienjunkies.org/media/img/stream/application_cascade.png" style="cursor:pointer;" title="Mirrors zeigen" onclick="toggle("Arrow.S02E14.Gegen.die.Zeit.German.DD51.Dubbed.DL.720p.iTunesHD.AVC-TVS");"><div id="Arrow.S02E14.Gegen.die.Zeit.German.DD51.Dubbed.DL.720p.iTunesHD.AVC-TVS" style="display: none;">
<strong style="margin-left:20px">Mirror:</strong> <a href="http://download.serienjunkies.org/f-55bc328624d93658/fm_tvs-arrow-dd51-ded-dl-7p-ithd-avc-214.html" target="_blank">filemonkey.in</a><br>
<strong style="margin-left:20px">Mirror:</strong> <a href="http://download.serienjunkies.org/f-25023a87144345f9/so_tvs-arrow-dd51-ded-dl-7p-ithd-avc-214.html" target="_blank">share-online.biz</a><br>
</div><div><strong style="margin-left:18px">Usenet:</strong> <a href="http://www.firstload.com/affiliate/log.php?log=50393&fn=Arrow.S02E14.Gegen.die.Zeit.German.DD51.Dubbed.DL.720p.iTunesHD.AVC-TVS" target="_blank">Highspeed Mirror</a></div></div></p>