如何使用 selenium 在 youtube 中抓取评论 [仅加载 1000 条评论]?

时间:2021-03-26 04:20:16

标签: python selenium web-scraping youtube

嗨,我正在尝试使用一个 GitHub 脚本从 YouTube 上抓取评论,但是在抓取了大约 1000 条评论之后,YouTube 就停止加载评论了。

这是我的代码

import csv
import io
from selenium import webdriver
from selenium.common import exceptions
import sys
import time

def scrape():
    """Scrape usernames and comments of one YouTube video into results.csv.

    Opens the hard-coded video URL in Chrome, scrolls the page until the
    document height stops growing, prints the video title and writes one
    (username, comment) row per comment to ``results.csv`` (UTF-16, every
    field quoted).

    If the title or the comment section cannot be located, an error
    message is printed and the function returns without writing the CSV.
    The browser session is always shut down, even when an error occurs.
    """
    # Raw string: the Windows path is full of backslashes that must not be
    # read as (invalid) escape sequences.
    driver = webdriver.Chrome(r'D:\Scraping-Youtube-Comments-master\webdrivers\chromedriver.exe')
    try:
        driver.get('https://www.youtube.com/watch?v=kffacxfA7G4')
        driver.maximize_window()
        time.sleep(5)

        try:
            # Extract the elements storing the video title and
            # comment section.
            title = driver.find_element_by_xpath('//*[@id="container"]/h1/yt-formatted-string').text
            comment_section = driver.find_element_by_xpath('//*[@id="comments"]')
        except exceptions.NoSuchElementException:
            # Note: Youtube may have changed their HTML layouts for
            # videos, so the elements may not be found anymore.
            print("this is error")
            error = "Error: Double check selector OR "
            error += "element may not yet be on the screen at the time of the find operation"
            print(error)
            # Everything below needs both elements; continuing would only
            # fail later on the unbound names.
            return

        # Scroll into view the comment section, then allow some time
        # for everything to be loaded as necessary.
        driver.execute_script("arguments[0].scrollIntoView();", comment_section)
        time.sleep(7)

        # Scroll all the way down to the bottom in order to get all the
        # elements loaded (since Youtube dynamically loads them).
        # A single unchanged height is not treated as "the end": the next
        # batch may simply be slow, so several consecutive unchanged checks
        # are required before giving up.
        max_stalled_checks = 3
        stalled_checks = 0
        last_height = driver.execute_script("return document.documentElement.scrollHeight")
        while stalled_checks < max_stalled_checks:
            # Scroll down 'til "next load".
            driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")

            # Wait to load everything thus far.
            time.sleep(2)

            # Calculate new scroll height and compare with last scroll height.
            new_height = driver.execute_script("return document.documentElement.scrollHeight")
            if new_height == last_height:
                stalled_checks += 1
            else:
                stalled_checks = 0
                last_height = new_height

        # One last scroll just in case.
        driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")

        # Extract the elements storing the usernames and comments.
        # find_elements_* returns an empty list when nothing matches, so
        # there is no NoSuchElementException to handle here.
        username_elems = driver.find_elements_by_xpath('//*[@id="author-text"]')
        comment_elems = driver.find_elements_by_xpath('//*[@id="content-text"]')

        print("> VIDEO TITLE: " + title + "\n")

        # The element texts are read while the session is still alive.
        with io.open('results.csv', 'w', newline='', encoding="utf-16") as file:
            writer = csv.writer(file, delimiter=",", quoting=csv.QUOTE_ALL)
            writer.writerow(["Username", "Comment"])
            for username, comment in zip(username_elems, comment_elems):
                writer.writerow([username.text, comment.text])
    finally:
        # quit() ends the whole session (browser and chromedriver process),
        # on success and on failure alike.
        driver.quit()


scrape()

注意:代码是在抓取了大约 1000 条评论之后才停止工作的;网络连接也很稳定,我检查了两次。这是因为 YouTube 阻止了这种抓取,还是代码本身存在其他问题?

0 个答案:

没有答案