嗨,我正在尝试使用一个 GitHub 上的脚本从 YouTube 上抓取评论,但是在大约抓取 1000 条评论后,YouTube 就停止加载评论了。以下是我的代码:
import csv
import io
from selenium import webdriver
from selenium.common import exceptions
import sys
import time
def scrape():
    """Scrape username/comment pairs from a YouTube video page into results.csv.

    YouTube loads comments lazily as the page is scrolled, so this scrolls
    to the bottom repeatedly until the document height stops growing.  A
    patience counter tolerates several consecutive "no growth" checks before
    giving up, because YouTube sometimes pauses for a few seconds between
    comment batches — breaking on the first unchanged height is the usual
    reason scraping stops after roughly 1000 comments.
    """
    # Raw string: a plain 'D:\...' literal treats backslashes as escape
    # sequences ('\S' is an invalid escape; '\t' would silently corrupt).
    driver = webdriver.Chrome(r'D:\Scraping-Youtube-Comments-master\webdrivers\chromedriver.exe')
    try:
        driver.get('https://www.youtube.com/watch?v=kffacxfA7G4')
        driver.maximize_window()
        time.sleep(5)

        try:
            # Elements holding the video title and the comment container.
            title = driver.find_element_by_xpath(
                '//*[@id="container"]/h1/yt-formatted-string').text
            comment_section = driver.find_element_by_xpath('//*[@id="comments"]')
        except exceptions.NoSuchElementException:
            # YouTube may have changed its HTML layout for videos.  Abort
            # here: the original code fell through and then crashed with a
            # NameError on the undefined comment_section.
            error = "Error: Double check selector OR "
            error += "element may not yet be on the screen at the time of the find operation"
            print(error)
            return

        # Scroll the comment section into view so YouTube starts loading
        # it, then allow some time for the first batch to arrive.
        driver.execute_script("arguments[0].scrollIntoView();", comment_section)
        time.sleep(7)

        # Scroll all the way down until the page height stops growing.
        # Require several consecutive unchanged-height checks before
        # stopping, so one slow network round-trip does not end the loop.
        last_height = driver.execute_script("return document.documentElement.scrollHeight")
        stall_count = 0
        max_stalls = 5  # tolerate up to ~5 consecutive pauses before quitting
        while True:
            # Scroll down 'til "next load".
            driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
            # Wait to load everything thus far (a bit longer than 2 s —
            # large comment sections are served more slowly).
            time.sleep(3)
            # Compare new scroll height with the last scroll height.
            new_height = driver.execute_script("return document.documentElement.scrollHeight")
            if new_height == last_height:
                stall_count += 1
                if stall_count >= max_stalls:
                    break
            else:
                stall_count = 0
            last_height = new_height

        # One last scroll just in case.
        driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")

        # find_elements_* returns an empty list when nothing matches — it
        # never raises NoSuchElementException — so no try/except is needed.
        username_elems = driver.find_elements_by_xpath('//*[@id="author-text"]')
        comment_elems = driver.find_elements_by_xpath('//*[@id="content-text"]')

        print("> VIDEO TITLE: " + title + "\n")

        # utf-16 so non-ASCII usernames/comments survive; QUOTE_ALL keeps
        # commas and newlines inside comments from breaking the CSV.
        with io.open('results.csv', 'w', newline='', encoding="utf-16") as file:
            writer = csv.writer(file, delimiter=",", quoting=csv.QUOTE_ALL)
            writer.writerow(["Username", "Comment"])
            for username, comment in zip(username_elems, comment_elems):
                writer.writerow([username.text, comment.text])
    finally:
        # quit() shuts down the browser AND the chromedriver process;
        # close() only closes the current window and can leak the driver,
        # and the original call was skipped entirely on any exception.
        driver.quit()
scrape()
注意:代码是在抓取约 1000 条评论之后才停止工作的,而且网络连接很稳定,我检查过两次。请问这是因为 YouTube 阻止了这种抓取,还是代码本身存在其他问题?