HTML parsing in Python 3 to download a website

Posted: 2018-10-20 02:20:26

Tags: python-3.x

In my spare time I read a lot of webnovels online, and I follow dozens that update regularly. Mobile data is very expensive in Belgium and we barely get enough of it, so I decided to write a Python 3 program that downloads only the specified chapters of a given novel, which I then transfer to my phone manually for reading. Note: my program is most likely messy and inefficient, and there are probably many cases where it fails, since I have only a basic understanding of Python and none of HTML. Half of the code was also copied together by searching around on this site.

I enter the URL of a page containing the index of all the chapters (like this), and the program searches every tag and its data (using "def handle_starttag" and "handle_data") to find the hyperlinks that point to the specified chapters, then downloads the pages containing those chapters and saves them in a directory.
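
In case it helps to see the idea in isolation, here is a minimal sketch of that anchor-pairing pattern (the class name and the HTML snippet are made up for illustration):

from html.parser import HTMLParser

class ChapterLinkParser(HTMLParser):
    # Pair each anchor's href with the "Chapter N" text found inside it.
    def __init__(self):
        super().__init__()
        self.current_href = None
        self.chapters = []  # list of (chapter_number, href) tuples

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            self.current_href = dict(attrs).get("href")

    def handle_data(self, data):
        parts = data.split()
        if (self.current_href and len(parts) >= 2
                and parts[0] == "Chapter" and parts[1].isdigit()):
            self.chapters.append((parts[1], self.current_href))

parser = ChapterLinkParser()
parser.feed('<a href="/novel/chapter-145.html">Chapter 145</a>')
print(parser.chapters)  # [('145', '/novel/chapter-145.html')]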

The problem I run into is that it does not work for every website, for example this one, and I suspect that is because the chapters are not stored on the site itself but are only referenced there.
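
One way to test that suspicion (just a diagnostic sketch; the URLs below are hypothetical) is to resolve each scraped href against the index page and check whether it points to a different host:

from urllib.parse import urljoin, urlparse

# Hypothetical values for illustration: an index page and one href scraped from it.
index_url = "https://example.com/novel/index.html"
href = "https://cdn.some-other-host.com/chapters/145.html"

resolved = urljoin(index_url, href)  # resolves relative hrefs, leaves absolute ones alone
if urlparse(resolved).netloc != urlparse(index_url).netloc:
    print("chapter is hosted elsewhere:", resolved)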

You can read the code here:

from html.parser import HTMLParser
from urllib.request import Request, urlopen
from urllib.parse import urljoin
import os
import re

all_chapter_and_html_id = []
failed_to_download = []
chapter_start = int(input("verify start chapter: "))
chapter_end = int(input("verify end chapter: "))
index_url = input("verify the url that contains index: ").replace(" ", "")


class MyHTMLParser(HTMLParser):

    html_id = None  # href of the most recently opened anchor tag

    def handle_starttag(self, tag, attrs):
        # Only parse the 'anchor' tag and remember where it points.
        if tag == "a":
            for name, value in attrs:
                if name == "href":  # and ".html" in value
                    self.html_id = value

    def handle_data(self, data):
        # Check whether the text names a chapter and extract its number,
        # e.g. "Chapter 145 some text about the chapter name".
        match = re.search(r"Chapter\s+(\d+)", data)
        if match and self.html_id is not None:
            all_chapter_and_html_id.append((match.group(1), self.html_id))


def req(url):
    request = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    return urlopen(request).read()


def director():
    # Create (and return) a download folder named after the website,
    # placed next to this script.
    dir_path = os.path.dirname(os.path.realpath(__file__))
    target = os.path.join(dir_path, name_website(index_url))
    if not os.path.exists(target):
        os.makedirs(target)
    return target


def name_website(url):
    # Strip the scheme ("https://" etc.) and turn the rest into a folder name.
    return url.split("//", 1)[-1].replace("/", "")


def download():
    parser = MyHTMLParser()
    parser.feed(req(index_url).decode("utf-8", errors="replace"))
    target = director()
    for number, html_id in all_chapter_and_html_id:
        if chapter_start <= int(number) <= chapter_end:
            path = os.path.join(target, "chapter " + number + ".html")
            try:
                # Fetch first so a failed request doesn't leave an empty file;
                # urljoin resolves relative hrefs against the index page.
                page = req(urljoin(index_url, html_id))
                with open(path, "wb") as file:
                    file.write(page)
                print("Downloaded chapter " + number)
            except OSError:
                failed_to_download.append("chapter " + number)
                print("-" * 25 + "\nFailed to download chapter " + number + "\n" + "*" * 25)
    if failed_to_download:
        print("*" * 40 + "\nFailed to download these chapters:\n"
              + "\n".join(failed_to_download) + "\n" + "*" * 40)
    else:
        print("\n" * 10 + "*" * 40 + "\nSuccessfully downloaded all chapters\n" + "*" * 40)

download()
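
For what it's worth, the chapter URL is built with urljoin rather than plain string concatenation, since concatenation breaks as soon as an href is absolute:

from urllib.parse import urljoin

print(urljoin("https://example.com/novel/", "chapter-1.html"))
# https://example.com/novel/chapter-1.html
print(urljoin("https://example.com/novel/", "https://other-host.com/ch1.html"))
# https://other-host.com/ch1.html  (absolute hrefs are returned unchanged)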
