YouTube data scraping with BeautifulSoup and pafy

Date: 2018-07-09 10:06:23

Tags: python pandas youtube beautifulsoup pafy

I am running this code, which I wrote myself, partly by following existing code found on the internet.

This code takes YouTube channel IDs as input from a list of channel IDs in an xlsx file (I have thousands of channel IDs) and returns a dataframe with the columns Channel Name, Channel URL, total videos in that channel, and highly used words (found by scraping all of that channel's video descriptions and counting the most common words).

An example channel URL is https://www.youtube.com/channel/UC-0BMIN-fcsFA1DwkCXS8xw, where UC-0BMIN-fcsFA1DwkCXS8xw is the channel ID.
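For clarity, the main loop below recovers the channel ID from each URL with a simple string split; a minimal sketch of that step, using the example URL above:

# Sketch of how the channel ID is pulled out of a channel URL,
# mirroring the split used in the main loop below.
channel_url = "https://www.youtube.com/channel/UC-0BMIN-fcsFA1DwkCXS8xw"
channel_id = channel_url.split("/channel/", 1)[1]
print(channel_id)  # UC-0BMIN-fcsFA1DwkCXS8xw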

import errno
import json
import os
import re
import sys
import urllib.request
from collections import namedtuple

import numpy as np  # used by np.savetxt in rawCleanfile
import pandas as pd
import pafy
import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

df = pd.read_excel("channelReach.xlsx", sheet_name=1)

Video = namedtuple("Video", "Description")

def parse_video_div(div):
    video_id = div.get("data-context-item-id", "")
    # title = div.find("a", "yt-uix-tile-link").text
    # duration = div.find("span", "video-time").contents[0].text
    # views = str(div.find("ul", "yt-lockup-meta-info").contents[0].text.rstrip(" views").replace(",", ""))
    # pafy fetches the video's metadata; only the description is kept here.
    video = pafy.new("https://www.youtube.com/watch?v=" + video_id)

    return pd.DataFrame(data=[Video(video.description)])

def parse_videos_page(page):
    video_divs = page.find_all("div", "yt-lockup-video")
    return [parse_video_div(div) for div in video_divs]

def find_load_more_url(page):
    for button in page.find_all("button"):
        url = button.get("data-uix-load-more-href")
        if url:
            return "http://www.youtube.com" + url

def download_page(url):
    print("Downloading {0}".format(url))
    return urllib.request.urlopen(url).read()

def get_videos(username):
    page_url = "http://www.youtube.com/channel/{0}/videos".format(username)
    page = BeautifulSoup(download_page(page_url), "html.parser")
    videos = parse_videos_page(page)
    page_url = find_load_more_url(page)
    while page_url:
        json_data = json.loads(download_page(page_url).decode("utf-8"))
        page = BeautifulSoup(json_data.get("content_html", ""), "html.parser")
        videos.extend(parse_videos_page(page))
        page_url = find_load_more_url(
            BeautifulSoup(json_data.get("load_more_widget_html", ""), "html.parser"))
    # Concatenate once after all pages are fetched, so channels whose
    # video list fits on a single page still return a dataframe.
    return pd.concat(videos, ignore_index=True)

def rawCleanfile(videos_df):
    # Save all the raw descriptions, then strip URLs from them line by line.
    np.savetxt(r'Rawcorpora.txt', videos_df.Description.values, fmt='%s', encoding='utf-8')
    with open('Rawcorpora.txt', encoding="utf-8") as fp, \
         open("CleanCorpora.txt", "w", encoding="utf-8") as cleaned:
        for line in fp:
            cleaned.write(re.sub(r"http\S+", "", line).strip() + "\n")
    return "CleanCorpora.txt"

def sorted_word():
    filepath2 ="CleanCorpora.txt"
    if not os.path.isfile(filepath2):
        print("File path {} does not exist. Exiting...".format(filepath2))
        sys.exit()
    bag_of_words = {}
    with open(filepath2, encoding="utf-8") as fp:
        cnt = 0
        for line in fp:
            record_word_cnt(line.strip().split(' '), bag_of_words)
            cnt += 1
    sorted_words = order_bag_of_words(bag_of_words, desc=True)
    return(sorted_words)

def stop_words(allwords):
    filtered_words = [word for word in allwords if word[0] not in stopwords.words('english')]
    return(filtered_words[:10])

def order_bag_of_words(bag_of_words, desc=False):  
    words = [(word, cnt) for word, cnt in bag_of_words.items()]
    return sorted(words, key=lambda x: x[1], reverse=desc)

def record_word_cnt(words, bag_of_words):  
    for word in words:
        if word != '':
            if word.lower() in bag_of_words:
                bag_of_words[word.lower()] += 1
            else:
                bag_of_words[word.lower()] = 1

def silentremove():
    try:
        os.remove("CleanCorpora.txt")
        os.remove("Rawcorpora.txt")

    except OSError as e: # this would be "except OSError, e:" before Python 2.6
        if e.errno != errno.ENOENT: # errno.ENOENT = no such file or directory
            raise # re-raise exception if a different error occurred

data = [] 

final = pd.DataFrame()

if __name__ == '__main__':
    for i in df['CRAWL URL'][:10]:
        channel_id = i.split("/channel/", 1)[1]
        videos = get_videos(channel_id)
        totalVideos = (len(videos))
        print("Here..0")
        f = rawCleanfile(videos)
        print("Finished writing to Raw and Cleaning them....")

        allwords = sorted_word()
        x = stop_words(allwords)
        words= []
        count= []
        print("Here ..1 ")
        for j in x:
            words.append(j[0])
            count.append(j[1])

        request = requests.get(i)
        content = request.content
        soup = BeautifulSoup(content, "html.parser")
        title = soup.find_all(attrs={"name":"title"})
        titlecontent = title[0]['content']
        print(titlecontent)
        print("got the Channel title..2 ")
        print("URL no: ", i)
        data.append((titlecontent, i, totalVideos, words[:10]))
        cols = ['Channel Name', 'Channel URL', 'totalVideos',
                'Highly Used Words from Video Description']
        # data already holds one tuple per processed channel, so the
        # final dataframe can be rebuilt from it directly each pass.
        final = pd.DataFrame(data, columns=cols)
        print("Finished appending in a dataframe ..3 ")

        silentremove()
        print("Finished deleting raw and cleaned files....")

A sample of the output dataframe was included as a screenshot (not reproduced here).
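As a stand-in for that screenshot, this is a minimal sketch of the output's shape only; the channel name, video count, and word list are hypothetical placeholders, and the URL is the example channel given above:

# Hypothetical illustration of the output dataframe's structure only.
cols = ['Channel Name', 'Channel URL', 'totalVideos',
        'Highly Used Words from Video Description']
example = pd.DataFrame(
    [("Some Channel Name",                                          # placeholder
      "https://www.youtube.com/channel/UC-0BMIN-fcsFA1DwkCXS8xw",   # example URL from above
      123,                                                          # placeholder count
      ["word1", "word2", "word3"])],                                # placeholder words
    columns=cols)
print(example)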

Query: The code is not optimized; for example, if I need to scrape the details of 100+ YouTube channels, it takes far too long.

Question: Can someone help me optimize and clean up this code so that it runs faster and more efficiently without breaking? For example, if an invalid URL pops up, there is no try/except to catch the exception and skip it, and that can break the entire run.
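For reference only, and not as the author's solution, here is a minimal sketch of the kind of robustness being asked about: each channel is wrapped in try/except so a broken URL is logged and skipped instead of stopping the whole run, and a thread pool handles the network-bound work. get_channel_row is a hypothetical name for a function bundling the per-channel steps above (get_videos, description cleaning, word counting, title scraping); because Rawcorpora.txt and CleanCorpora.txt are shared temp files, those steps would need to be made per-channel (for example kept in memory) before channels can safely run in parallel.

# Minimal sketch of per-channel error handling plus simple parallelism.
from concurrent.futures import ThreadPoolExecutor, as_completed

def safe_row(url):
    # get_channel_row is a hypothetical wrapper around the per-channel
    # steps shown above; any failure is logged and the channel skipped.
    try:
        return get_channel_row(url)
    except Exception as exc:
        print("Skipping {0}: {1}".format(url, exc))
        return None

with ThreadPoolExecutor(max_workers=4) as pool:
    futures = [pool.submit(safe_row, url) for url in df['CRAWL URL'][:10]]
    rows = [f.result() for f in as_completed(futures)]

# cols is the same column list used in the main loop above.
final = pd.DataFrame([r for r in rows if r is not None], columns=cols)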

Thanks in advance :)

0 Answers:

No answers yet.