Web scraper script - how can I make it run faster?

Time: 2015-09-15 22:34:26

Tags: python performance web-scraping

I just started using Python 3 and I enjoy reading light novels, so my first Python project is a web scraper that downloads my favorite light novels.

So far everything works, but it is really slow, especially the step that checks whether a chapter is already in the folder and downloads it if not.

Right now the script takes 17.8 minutes to check and download 694 chapters.

Is there a way to at least speed up the checking process? The chapters themselves only need to be downloaded once.

https://github.com/alpenmilch411/LN_scrape/blob/master/LN_scraper.py

import requests
from bs4 import BeautifulSoup
import os
import getpass

#Gets chapter links
def get_chapter_links(index_url):
    r = requests.get(index_url)
    soup = BeautifulSoup(r.content, 'html.parser')
    links = soup.find_all('a')
    url_list = []
    for url in links:
        if 'http://www.wuxiaworld.com/cdindex-html/book' in str(url):
            url_list.append((url.get('href')))
    return url_list

#Gets chapter content
def get_chapters(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    chapter_text = soup.find_all('div',{'class':"entry-content"})
    #Puts chapter text into 'chapter'-variable
    chapter = ''
    for c in chapter_text:
        # Removes whitespace and the 'Previous Chapter'/'Next Chapter' navigation text
        # Note: str.strip(chars) strips characters from both ends, not a whole substring
        content = c.text.strip()
        chapter += content.strip('Previous Next Chapter')
    return chapter

#Gets title of chapter
def get_title(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    title = soup.find_all('h1',{'class':'entry-title'})
    chapter_title = ''
    for l in title:
        chapter_title += l.text
    return chapter_title

#Gets title of story
def get_story_title(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    story = soup.find_all('h1',{'class':"entry-title"})
    story_title = ''
    for content in story:
        story_title += content.text
    return story_title



#url on which links can be found
links = 'http://www.wuxiaworld.com/cdindex-html/'


#Checks whether a directory already exists and creates a new one if necessary
story_title = get_story_title(links)
path = '/users/{}/documents/'.format(getpass.getuser())+'{}'.format(story_title)
if not os.path.isdir(path):
    os.mkdir(path)
link_list = get_chapter_links(links)
#Copies chapters into text files
for x in link_list:
    #Checks whether chapter already exists
    #TODO Make checking process quicker
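    #Note: get_title() downloads the chapter page just to build the filename,
    #so even the 'already exists' check costs one HTTP request per chapter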
    chapter_title = get_title(str(x)).replace(',','') + '.txt'
    if not os.path.isfile(path + '/' + chapter_title):
        story_title = get_story_title(links)
        chapter_text = get_chapters(str(x))
        file = open(path + '/' + chapter_title, 'w')
        file.write(chapter_text)
        file.close()
        print('{} saved.'.format(chapter_title.replace(',','')))

print('All chapters are up to date.')

1 Answer:

Answer 0 (score: 0)

Use lxml with BeautifulSoup. It is faster than html.parser.
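
For example, a minimal sketch of that swap applied to one of the functions above (this assumes lxml has been installed, e.g. with pip install lxml):

import requests
from bs4 import BeautifulSoup

#Gets chapter links, parsing with lxml instead of html.parser
def get_chapter_links(index_url):
    r = requests.get(index_url)
    # 'lxml' is a drop-in replacement for 'html.parser' and typically parses faster
    soup = BeautifulSoup(r.content, 'lxml')
    url_list = []
    for url in soup.find_all('a'):
        if 'http://www.wuxiaworld.com/cdindex-html/book' in str(url):
            url_list.append(url.get('href'))
    return url_list

The rest of the script stays the same; only the parser argument passed to BeautifulSoup changes.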