我正在尝试通过以下代码从YouTube播放列表中抓取链接:
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import pyperclip
import time
# Local import: find_element(By.TAG_NAME, ...) replaces the
# find_element_by_tag_name helper, which newer Selenium releases removed.
from selenium.webdriver.common.by import By

# Ask for the playlist URL and open it in a Selenium-driven Firefox.
url = input('Please enter youtube playlist url: ')
driver = webdriver.Firefox()
try:
    driver.get(url)

    # Send END to the page twice, pausing in between, so the browser scrolls
    # down before the DOM is captured.
    elem = driver.find_element(By.TAG_NAME, 'html')
    elem.send_keys(Keys.END)
    time.sleep(3)
    elem.send_keys(Keys.END)

    # Parse the DOM as rendered by the browser, not the raw HTTP response.
    innerHTML = driver.execute_script("return document.body.innerHTML")
    soup = bs(innerHTML, 'html.parser')
    res = soup.select('div#content.style-scope.ytd-playlist-video-renderer a.yt-simple-endpoint.style-scope.ytd-playlist-video-renderer')

    whole_list = ''
    for i in res:
        # The three href lookups being compared. They only differ when the
        # matched <a> has no href attribute:
        print(i.get('href'))    # prints None for a missing attribute
        print(i['href'])        # raises KeyError for a missing attribute
        print(i.attrs['href'])  # raises KeyError for a missing attribute
        # whole_list = whole_list + " '" + i.get('href') + "', \n"
    print(whole_list)
    pyperclip.copy(whole_list)
finally:
    # Close the browser window even when one of the lookups above raises.
    driver.close()
在 Chrome 开发者工具中,YouTube 播放列表里的视频组件如下所示:
<a class="yt-simple-endpoint style-scope ytd-playlist-video-renderer" href="/watch?v=QXeEoD0pB3E&list=PLsyeobzWxl7poL9JTVyndKe62ieoN-MZ3&index=2&t=0s">
<ytd-thumbnail id="thumbnail" height="68" width="120" class="style-scope ytd-playlist-video-renderer">
<a id="thumbnail" class="yt-simple-endpoint inline-block style-scope ytd-thumbnail" aria-hidden="true" tabindex="-1" rel="null" href="/watch?v=QXeEoD0pB3E&list=PLsyeobzWxl7poL9JTVyndKe62ieoN-MZ3&index=2&t=0s">
<yt-img-shadow class="style-scope ytd-thumbnail no-transition" style="background-color: transparent;" loaded=""><img id="img" class="style-scope yt-img-shadow" alt="" width="120" src="https://i.ytimg.com/vi/QXeEoD0pB3E/hqdefault.jpg?sqp=-oaymwEZCPYBEIoBSFXyq4qpAwsIARUAAIhCGAFwAQ==&rs=AOn4CLCsnnE_5VNrXFHejH29sP0T7NSSmw"></yt-img-shadow>
<div id="overlays" class="style-scope ytd-thumbnail"><ytd-thumbnail-overlay-resume-playback-renderer class="style-scope ytd-thumbnail"><div id="progress" class="style-scope ytd-thumbnail-overlay-resume-playback-renderer" style="width: 100%;"></div></ytd-thumbnail-overlay-resume-playback-renderer><ytd-thumbnail-overlay-time-status-renderer class="style-scope ytd-thumbnail" overlay-style="DEFAULT"><span class="style-scope ytd-thumbnail-overlay-time-status-renderer" aria-label="66 seconds">
1:06
</span></ytd-thumbnail-overlay-time-status-renderer><ytd-thumbnail-overlay-now-playing-renderer class="style-scope ytd-thumbnail">
<span class="style-scope ytd-thumbnail-overlay-now-playing-renderer">Now playing</span>
</ytd-thumbnail-overlay-now-playing-renderer></div>
<div id="mouseover-overlay" class="style-scope ytd-thumbnail"></div>
<div id="hover-overlays" class="style-scope ytd-thumbnail"></div>
</a>
</ytd-thumbnail>
<div id="meta" class="style-scope ytd-playlist-video-renderer">
<h3 class="style-scope ytd-playlist-video-renderer">
<ytd-badge-supported-renderer class="style-scope ytd-playlist-video-renderer">
<dom-repeat id="repeat" as="badge" class="style-scope ytd-badge-supported-renderer"><template is="dom-repeat"></template></dom-repeat>
</ytd-badge-supported-renderer>
<span id="video-title" class="style-scope ytd-playlist-video-renderer" aria-label="#0 Python Tutorial | Python Programming Tutorial for Beginners | Course Introduction by Telusko 1 year ago 66 seconds 1,108,432 views" title="#0 Python Tutorial | Python Programming Tutorial for Beginners | Course Introduction">
#0 Python Tutorial | Python Programming Tutorial for Beginners | Course Introduction
</span>
</h3>
<ytd-video-meta-block class="playlist style-scope ytd-playlist-video-renderer">
<div id="metadata" class="style-scope ytd-video-meta-block">
<div id="byline-container" class="style-scope ytd-video-meta-block">
<ytd-channel-name id="channel-name" class="style-scope ytd-video-meta-block">
<div id="container" class="style-scope ytd-channel-name">
<div id="text-container" class="style-scope ytd-channel-name">
<yt-formatted-string id="text" class="style-scope ytd-channel-name complex-string" ellipsis-truncate="" title="Telusko" has-link-only_=""><a class="yt-simple-endpoint style-scope yt-formatted-string" spellcheck="false" href="/user/javaboynavin">Telusko</a></yt-formatted-string>
</div>
</div>
<ytd-badge-supported-renderer class="style-scope ytd-channel-name" disable-upgrade="" hidden="">
</ytd-badge-supported-renderer>
</ytd-channel-name>
<div id="separator" class="style-scope ytd-video-meta-block">•</div>
</div>
<div id="metadata-line" class="style-scope ytd-video-meta-block">
<dom-repeat strip-whitespace="" class="style-scope ytd-video-meta-block"><template is="dom-repeat"></template></dom-repeat>
</div>
</div>
<div id="additional-metadata-line" class="style-scope ytd-video-meta-block">
<dom-repeat class="style-scope ytd-video-meta-block"><template is="dom-repeat"></template></dom-repeat>
</div>
</ytd-video-meta-block>
</div>
<ytd-badge-supported-renderer id="badges" class="style-scope ytd-playlist-video-renderer" disable-upgrade="" hidden="">
</ytd-badge-supported-renderer>
<yt-formatted-string id="contributor" class="style-scope ytd-playlist-video-renderer" hidden=""></yt-formatted-string>
</a>
如您所见,我尝试了在网上找到的全部三种写法:使用 i.get('href') 得到的是 None,而另外两种写法则直接报错。从昨天起我就一直卡在这个问题上,找不出自己哪里做错了。
答案 0 :(得分:0)
有时 `<a>` 可能没有 `href`,所以我会用 `if` 跳过它。
for i in res:
    href = i.get('href')
    # .get() yields None for an <a> without an href attribute; skip those so
    # the string concatenation below never receives None.
    if href:
        whole_list += " '" + href + "', \n"
下面这段代码为我取到了某个播放列表的所有 href。您还可以看到,第一个 `i` 得到的是 `None`,但我跳过了这个值。
from bs4 import BeautifulSoup as BS
from selenium import webdriver
import pyperclip
import time
# Playlist to scrape. Uncomment the input() line to prompt for a URL instead
# of using the hard-coded one.
#url = input('Please enter youtube playlist url: ')
url = 'https://www.youtube.com/playlist?list=PLmNPvQr9Tf-a4MrEG5thq3qzlkrF5NFbC'

driver = webdriver.Firefox()
try:
    driver.get(url)
    # Fixed pause before reading the page source, giving the page time to
    # finish building the playlist.
    time.sleep(3)
    html = driver.page_source
finally:
    # Close the browser window even if loading the page fails.
    driver.close()

soup = BS(html, 'html.parser')
res = soup.select('a.yt-simple-endpoint.style-scope.ytd-playlist-video-renderer')

# Collect only the anchors that actually carry an href.
all_hrefs = []
for i in res:
    href = i.get('href')
    # Printed unconditionally so anchors without href show up as None.
    print(href)
    if href:
        all_hrefs.append(href)

# One quoted href per line, comma-separated, ready to paste as list items.
text = ',\n'.join(" '{}'".format(x) for x in all_hrefs)
print(text)
pyperclip.copy(text)