Python: improving BeautifulSoup parsing speed

Time: 2016-12-08 19:58:30

Tags: python performance python-3.x beautifulsoup

I've currently written my first Python script to loop through some URLs listed in a CSV. There are over 14,000 links. I'm trying to 1) get all of the keyword meta tags, 2) check the page status (404 links need to be flagged), and 3) convert YouTube video URLs into embedded YouTube links (which may involve going to the YouTube page to get the keywords and then converting it to the embed link).

It has become painfully slow, but I can't find a faster way to do it. I suspect the requests.get() call is the culprit, but I don't know how to speed it up. I only need the metadata, so is there a way to fetch just the beginning of each page instead of all of its data? How can I make this code better/faster/more optimized?

Also, I'm running into collection problems when compiling with pyinstaller. I suspect I'm using Python 2 code in Python 3, since I'm writing in Python 3.5.

import requests
from bs4 import BeautifulSoup
import csv
import re
import time

linkLocation = r'C:\Users\JCModern\Desktop\content_links.csv'
source_code = ''
myURL = ''
timestr = time.strftime("%Y%m%d_%H%M%S")
newfilename = r'C:\Users\JCModern\Desktop\content_links_and_keywords_' + timestr + '.csv'

with open(newfilename, "w", newline='') as file:
    writer = csv.writer(file, delimiter=',')
    writer.writerow(('cmsid', 'filepath', 'metatags', 'pageurl', 'pageurlchange'))
file.close()
with open(linkLocation, "r", encoding="utf-8-sig") as f:
    csv_f = csv.reader(f, delimiter=",")
    next(csv_f, None)
    for row in csv_f:
        if len(row) != 0:
            # init variables
            myKeywords = ""
            myTitle = ''
            myURL = ''
            pageUrlChange = ''
            pageStatus = ''
            pageUrl = ''
            myCmsid = (row[0])
            myURL = (row[2])
            if "https://www.youtube.com/embed/" in myURL:
                youtubeurl = myURL.split('/')
                youtubeurl = youtubeurl[4]
                youtubeurl = re.sub(
                    r'\?|\&|\=|re(l=\d+|l)|featur(e=sub|e)|playnext|video(s=\w+|s)|watch|_?((youtu(\.be|be))|fro(m=TL|m)|gdata|player|lis(t=\w+|t)|(inde(x=\w+|x)))_?|(v|vi)=|channel|ytscreeningroom','', youtubeurl)
                myURL = 'https://www.youtube.com/watch?v=' + youtubeurl
            try:    
                source_code = requests.get(myURL)
            except Exception:
                with open('errors.txt', 'a', newline='') as file:
                    writer = csv.writer(file, delimiter=',')
                    writer.writerow((myCmsid, myURL))
                file.close()
            pageStatus = source_code.status_code
            plain_text = source_code.text
            soup = BeautifulSoup(plain_text, 'html.parser')
            pageStatus = str(pageStatus)
            pageStatus = pageStatus[:1]
            pageStatus = int(pageStatus)
            if pageStatus == 2:
                pageUrlChange = 0
            else:
                pageUrlChange = 1
            if pageStatus == 3:
                pageUrl = source_code.url
            l = soup.findAll("meta", attrs={"name": "keywords"})
            if l is None:
                myKeywords = ""
            else:
                try:
                    myKeywords = l[0]['content']
                except:
                    myKeywords = myKeywords
                myKeywords = myKeywords.replace(', ', '~')
                myKeywords = myKeywords.replace(',', '~')
                myKeywords = myKeywords.replace('(', '')
                myKeywords = myKeywords.replace(')', '')
            if soup.find('title'):
                myTitle = soup.find('title').string
            if "https://www.youtube.com/" in myURL:
                youtubeurl = myURL.split('/')
                youtubeurl = youtubeurl[3]
                youtubeurl = re.sub(r'\?|\&|\=|re(l=\d+|l)|featur(e=sub|e)|playnext|video(s=\w+|s)|watch|_?((youtu(\.be|be))|fro(m=TL|m)|gdata|player|lis(t=\w+|t)|(inde(x=\w+|x)))_?|(v|vi)=|channel|ytscreeningroom','', youtubeurl)
                myURL = 'https://www.youtube.com/embed/' + youtubeurl
#                print(youtubeurl)
            if "https://youtu.be/" in myURL:
                youtubeurl = myURL.split('/')
                youtubeurl = youtubeurl[3]
                youtubeurl = re.sub(
                    r'\?|\&|\=|re(l=\d+|l)|featur(e=sub|e)|playnext|video(s=\w+|s)|watch|_?((youtu(\.be|be))|fro(m=TL|m)|gdata|player|lis(t=\w+|t)|(inde(x=\w+|x)))_?|(v|vi)=|channel|ytscreeningroom','', youtubeurl)
                myURL = 'https://www.youtube.com/embed/' + youtubeurl
#                print(youtubeurl)
#            print((myCmsid, myURL, myKeywords, pageUrl, pageUrlChange))
            with open(newfilename, "a", newline='') as file:
                writer = csv.writer(file, delimiter=',')
                writer.writerow((myCmsid, myURL, myKeywords, pageUrl, pageUrlChange))
            file.close()
f.close()

2 answers:

Answer 0 (score: 5)

html.parser is a pure-Python implementation built on regular expressions. You really don't want to use it. Install lxml and have the parsing done in C code instead (and remember to then use BeautifulSoup(plain_text, 'lxml')).
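
For illustration, a minimal sketch of that parser swap (assuming lxml has been installed, e.g. with pip install lxml; the URL here is just a placeholder):

import requests
from bs4 import BeautifulSoup

response = requests.get('https://www.youtube.com/')
# 'lxml' hands the actual HTML parsing off to C code instead of pure Python
soup = BeautifulSoup(response.text, 'lxml')
keywords = soup.find_all("meta", attrs={"name": "keywords"})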

You also don't want to keep reopening the CSV files. Open the output file once, outside the loop, and just write new rows to the csv.writer() object from inside the loop.
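
A minimal sketch of that restructuring (the file name and rows below are placeholders, not the questioner's data):

import csv

rows = [('id1', 'https://example.com/a'), ('id2', 'https://example.com/b')]

with open('output.csv', 'w', newline='') as out:
    writer = csv.writer(out, delimiter=',')
    writer.writerow(('cmsid', 'pageurl'))       # header written once
    for cmsid, url in rows:
        # ... per-row fetching/parsing would happen here ...
        writer.writerow((cmsid, url))           # reuse the same writer on every iteration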

Otherwise, there is little you can do to speed up loading the URLs themselves. Network speed will always be the bottleneck. You could switch to the very low-level PyCurl library, but I doubt the speed-up it can offer would make a difference here.
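
For completeness, a minimal sketch of fetching a single page with pycurl (assuming the pycurl package is installed; as said above, this is unlikely to change the overall runtime, which is dominated by the network):

from io import BytesIO
import pycurl

buf = BytesIO()
c = pycurl.Curl()
c.setopt(pycurl.URL, 'https://www.youtube.com/')    # placeholder URL
c.setopt(pycurl.WRITEDATA, buf)                     # collect the response body
c.perform()
status = c.getinfo(pycurl.RESPONSE_CODE)            # HTTP status code
c.close()
html = buf.getvalue().decode('utf-8', errors='replace')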

Answer 1 (score: 1)

In addition to the excellent suggestion to move to a faster XML parser, this is a good candidate for parallelization with the multiprocessing module. I've rearranged your code so that the request/parse work is done in a worker that can be delegated to a subprocess. The worker returns the row that needs to be added to the CSV. A 0 / -1 error code is prepended to the returned row so the parent process knows which CSV the result belongs in.

import requests
from bs4 import BeautifulSoup
import csv
import re
import time
import multiprocessing
import traceback

def grabber(myCmsid, myURL):
    try:
        return grabber_impl(myCmsid, myURL)
    except:
        return (-1, myCmsid, myURL, traceback.format_exc())

def grabber_impl(myCmsid, myURL):
    # init variables (note: myURL is a parameter here and must not be reset)
    myKeywords = ""
    myTitle = ''
    pageUrlChange = ''
    pageStatus = ''
    pageUrl = ''
    if "https://www.youtube.com/embed/" in myURL:
        youtubeurl = myURL.split('/')
        youtubeurl = youtubeurl[4]
        youtubeurl = re.sub(
            r'\?|\&|\=|re(l=\d+|l)|featur(e=sub|e)|playnext|video(s=\w+|s)|watch|_?((youtu(\.be|be))|fro(m=TL|m)|gdata|player|lis(t=\w+|t)|(inde(x=\w+|x)))_?|(v|vi)=|channel|ytscreeningroom','', youtubeurl)
        myURL = 'https://www.youtube.com/watch?v=' + youtubeurl

    source_code = requests.get(myURL)
    pageStatus = source_code.status_code
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, 'html.parser')
    pageStatus = str(pageStatus)
    pageStatus = pageStatus[:1]
    pageStatus = int(pageStatus)
    if pageStatus == 2:
        pageUrlChange = 0
    else:
        pageUrlChange = 1
    if pageStatus == 3:
        pageUrl = source_code.url
    l = soup.findAll("meta", attrs={"name": "keywords"})
    if l is None:
        myKeywords = ""
    else:
        try:
            myKeywords = l[0]['content']
        except:
            myKeywords = myKeywords
        myKeywords = myKeywords.replace(', ', '~')
        myKeywords = myKeywords.replace(',', '~')
        myKeywords = myKeywords.replace('(', '')
        myKeywords = myKeywords.replace(')', '')
    if soup.find('title'):
        myTitle = soup.find('title').string
    if "https://www.youtube.com/" in myURL:
        youtubeurl = myURL.split('/')
        youtubeurl = youtubeurl[3]
        youtubeurl = re.sub(r'\?|\&|\=|re(l=\d+|l)|featur(e=sub|e)|playnext|video(s=\w+|s)|watch|_?((youtu(\.be|be))|fro(m=TL|m)|gdata|player|lis(t=\w+|t)|(inde(x=\w+|x)))_?|(v|vi)=|channel|ytscreeningroom','', youtubeurl)
        myURL = 'https://www.youtube.com/embed/' + youtubeurl
#                print(youtubeurl)
    if "https://youtu.be/" in myURL:
        youtubeurl = myURL.split('/')
        youtubeurl = youtubeurl[3]
        youtubeurl = re.sub(
            r'\?|\&|\=|re(l=\d+|l)|featur(e=sub|e)|playnext|video(s=\w+|s)|watch|_?((youtu(\.be|be))|fro(m=TL|m)|gdata|player|lis(t=\w+|t)|(inde(x=\w+|x)))_?|(v|vi)=|channel|ytscreeningroom','', youtubeurl)
        myURL = 'https://www.youtube.com/embed/' + youtubeurl
#                print(youtubeurl)
#            print((myCmsid, myURL, myKeywords, pageUrl, pageUrlChange))
    return (0, myCmsid, myURL, myKeywords, pageUrl, pageUrlChange)


linkLocation = r'C:\Users\JCModern\Desktop\content_links.csv'
source_code = ''
myURL = ''
timestr = time.strftime("%Y%m%d_%H%M%S")
newfilename = r'C:\Users\JCModern\Desktop\content_links_and_keywords_' + timestr + '.csv'

with open(linkLocation, "r", encoding="utf-8-sig") as f:
    csv_f = csv.reader(f, delimiter=",")
    next(csv_f, None)
    pool = multiprocessing.Pool()

    with open(newfilename, 'a', newline='') as out, open('errors.txt', 'a', newline='') as err:
        writer = csv.writer(out, delimiter=',')
        err_writer = csv.writer(err, delimiter=',')
        for result in pool.imap_unordered(grabber, ((row[0], row[2]) for row in csv_f if row), chunksize=1):
            if result[0] == 0:
                # success: (cmsid, url, keywords, pageurl, pageurlchange) goes to the main CSV
                writer.writerow(result[1:])
            else:
                # failure (-1): print the traceback and log (cmsid, url) to errors.txt
                print(result[3])
                err_writer.writerow(result[1:3])
pool.close()
pool.join()
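
One caveat not stated in the original answer: on Windows (which the C:\Users paths suggest), multiprocessing starts workers by re-importing the script, so the pool creation and the driver loop should be wrapped in an if __name__ == '__main__': guard, roughly like this:

def main():
    # the CSV reading, pool creation and imap_unordered loop from above go here
    ...

if __name__ == '__main__':
    main()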