I wrote this code myself, partly based on existing code I found on the internet. It takes YouTube channel IDs as input, read from a list of channel IDs in an xlsx file (I have thousands of channel IDs), and returns a dataframe with the columns Channel Name, Channel URL, total videos, and the highly used words on that channel (found by scraping all of the channel's video descriptions and counting the most common words).
An example channel URL is https://www.youtube.com/channel/UC-0BMIN-fcsFA1DwkCXS8xw, where UC-0BMIN-fcsFA1DwkCXS8xw is the channel ID.
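For context, this is roughly how the channel IDs come out of the spreadsheet; it assumes the same "CRAWL URL" column in channelReach.xlsx that the full script below uses:

# Sketch only: read the spreadsheet and pull the channel ID out of each URL.
# Assumes a "CRAWL URL" column, as in the full script below.
import pandas as pd

df = pd.read_excel("channelReach.xlsx", sheet_name=1)
for url in df["CRAWL URL"][:3]:
    channel_id = url.split("/channel/", 1)[1]
    print(channel_id)  # e.g. UC-0BMIN-fcsFA1DwkCXS8xw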
import errno
import json
import os
import re
import sys
import urllib.request
from collections import namedtuple

import numpy as np
import pafy
import pandas as pd
import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
df = pd.read_excel("channelReach.xlsx", sheet_name=1)
Video = namedtuple("Video", "Description")
def parse_video_div(div):
    video_id = div.get("data-context-item-id", "")
    # title = div.find("a", "yt-uix-tile-link").text
    # duration = div.find("span", "video-time").contents[0].text
    # views = str(div.find("ul", "yt-lockup-meta-info").contents[0].text.rstrip(" views").replace(",", ""))
    videoDescription = pafy.new("https://www.youtube.com/watch?v=" + video_id)
    Description = videoDescription.description
    return pd.DataFrame(data=[Video(Description)])
def parse_videos_page(page):
    video_divs = page.find_all("div", "yt-lockup-video")
    return [parse_video_div(div) for div in video_divs]
def find_load_more_url(page):
    for button in page.find_all("button"):
        url = button.get("data-uix-load-more-href")
        if url:
            return "http://www.youtube.com" + url
def download_page(url):
    print("Downloading {0}".format(url))
    return urllib.request.urlopen(url).read()
def get_videos(username):
    page_url = "http://www.youtube.com/channel/{0}/videos".format(username)
    page = BeautifulSoup(download_page(page_url), "html.parser")
    videos = parse_videos_page(page)
    page_url = find_load_more_url(page)
    while page_url:
        json_data = json.loads(download_page(page_url).decode("utf-8"))
        page = BeautifulSoup(json_data.get("content_html", ""), "html.parser")
        videos.extend(parse_videos_page(page))
        page_url = find_load_more_url(BeautifulSoup(json_data.get("load_more_widget_html", ""), "html.parser"))
    videos_df = pd.concat(videos, ignore_index=True)
    return videos_df
def rawCleanfile(videos_df):
    ## saving all the raw descriptions
    np.savetxt(r'Rawcorpora.txt', videos_df.Description, fmt='%s', encoding='utf-8')
    ## time to clean them: strip URLs from every line
    filepath1 = 'Rawcorpora.txt'
    file = open("CleanCorpora.txt", "w", encoding="utf-8")
    with open(filepath1, encoding="utf-8") as fp:
        line = fp.readline()
        cnt = 1
        while line:
            result = re.sub(r"http\S+", "", line)
            file.write(result.strip() + "\n")
            line = fp.readline()
            cnt += 1
    file.close()
    return file
def sorted_word():
    filepath2 = "CleanCorpora.txt"
    if not os.path.isfile(filepath2):
        print("File path {} does not exist. Exiting...".format(filepath2))
        sys.exit()
    bag_of_words = {}
    with open(filepath2, encoding="utf-8") as fp:
        cnt = 0
        for line in fp:
            record_word_cnt(line.strip().split(' '), bag_of_words)
            cnt += 1
    sorted_words = order_bag_of_words(bag_of_words, desc=True)
    return sorted_words
def stop_words(allwords):
    filtered_words = [word for word in allwords if word[0] not in stopwords.words('english')]
    return filtered_words[:10]
def order_bag_of_words(bag_of_words, desc=False):
    words = [(word, cnt) for word, cnt in bag_of_words.items()]
    return sorted(words, key=lambda x: x[1], reverse=desc)
def record_word_cnt(words, bag_of_words):
    for word in words:
        if word != '':
            if word.lower() in bag_of_words:
                bag_of_words[word.lower()] += 1
            else:
                bag_of_words[word.lower()] = 1
def silentremove():
    try:
        os.remove("CleanCorpora.txt")
        os.remove("Rawcorpora.txt")
    except OSError as e:  # this would be "except OSError, e:" before Python 2.6
        if e.errno != errno.ENOENT:  # errno.ENOENT = no such file or directory
            raise  # re-raise exception if a different error occurred
data = []
final = pd.DataFrame()
if __name__ == '__main__':
    for i in df['CRAWL URL'][:10]:
        channel_id = i.split("/channel/", 1)[1]
        videos = get_videos(channel_id)
        totalVideos = len(videos)
        print("Here..0")
        f = rawCleanfile(videos)
        print("Finished writing to Raw and Cleaning them....")
        allwords = sorted_word()
        x = stop_words(allwords)
        words = []
        count = []
        print("Here ..1 ")
        for j in x:
            words.append(j[0])
            count.append(j[1])
        request = requests.get(i)
        content = request.content
        soup = BeautifulSoup(content, "html.parser")
        title = soup.find_all(attrs={"name": "title"})
        titlecontent = title[0]['content']
        print(titlecontent)
        print("got the Channel title..2 ")
        print("URL no: ", i)
        data.append((titlecontent, i, totalVideos, words[:10]))
        cols = ['Channel Name', 'Channel URL', 'totalVideos', 'Highly Used Words from Video Description']
        a1 = pd.DataFrame(data, columns=cols)
        result = pd.concat([final, a1], ignore_index=True)
        print("Finished appending in a dataframe ..3 ")
        silentremove()
        print("Finished deleting raw and cleaned files....")
A sample output dataframe looks like the following:
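To keep it self-contained in text, here is a minimal sketch of that structure; only the column names and the example channel URL come from above, every other value is made up:

# Hypothetical illustration of the output columns; all values other than the
# example channel URL are made up.
import pandas as pd

cols = ['Channel Name', 'Channel URL', 'totalVideos', 'Highly Used Words from Video Description']
sample = pd.DataFrame(
    [("Some Channel",                                              # hypothetical name
      "https://www.youtube.com/channel/UC-0BMIN-fcsFA1DwkCXS8xw",  # example URL from above
      120,                                                         # hypothetical video count
      ["music", "new", "video"])],                                 # hypothetical top words
    columns=cols,
)
print(sample)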
Query: the code is not optimized; for example, if I need to scrape the details of 100+ YouTube channels, it takes far too much time.
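For what it is worth, the kind of restructuring I have in mind is sketched below; process_channel is a hypothetical helper standing in for the body of the main loop above, and the fixed Rawcorpora.txt / CleanCorpora.txt temp files would have to become per-channel before anything like this is actually safe:

# Sketch only: run several channels concurrently with a thread pool.
# "process_channel" is hypothetical and would wrap one channel's work
# (get_videos, word counting, title lookup) and return one row tuple.
from concurrent.futures import ThreadPoolExecutor

import pandas as pd

df = pd.read_excel("channelReach.xlsx", sheet_name=1)

def process_channel(url):
    ...  # hypothetical: one channel's scraping + word counting

with ThreadPoolExecutor(max_workers=5) as pool:
    rows = list(pool.map(process_channel, df["CRAWL URL"][:10]))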
Question:
Could someone help me optimize and clean up this code so that it runs faster and more efficiently without breaking? For example, if an unexpected URL pops up, there is no try/except to catch that exception and skip it, so one bad URL can break the entire run.
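Roughly the kind of guard I mean (a minimal sketch around the existing loop body, reusing df and get_videos from the script above; the body is abbreviated):

# Sketch only: skip a channel that fails instead of letting one bad URL
# abort the whole run.
for i in df['CRAWL URL'][:10]:
    try:
        channel_id = i.split("/channel/", 1)[1]
        videos = get_videos(channel_id)
        # ... rest of the per-channel work from the main loop above ...
    except Exception as e:
        print("Skipping {0}: {1}".format(i, e))
        continue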
Thanks in advance :)