使用BeautifulSoup更改URL

时间:2019-05-31 20:58:06

标签: python beautifulsoup

我需要更改HTML文件中链接的href值,但是由于某种原因,它没有发生。这是我要实现的部分:

a['href'] = internal

这是脚本的描述

此脚本创建文件夹中所有.html文件的列表。反向工程师相应的Web URL是什么(我还有另一个脚本可以下载网页并创建本地版本)。逐个打开每个文件,找到外部链接的URL,并将其替换为本地URL。

import re
import os
from bs4 import BeautifulSoup

def replace_links(destination):
    files = []
    external_links = [] 
    #r = root, d = directories, f = files
    for r, d, f in os.walk(destination):
        for file in f:
            if '.html' in file:
                files.append(os.path.join(r, file))

    for f in files:
        external_links.append(reverse_eng_url(f, destination))

    html(files, external_links)

def reverse_eng_url(origin_url, destination):
    return origin_url.replace(destination, "").replace("-","/").replace("\\","/").replace("html", "aspx")

def html(files, external_links):
    for local in files:
        #open and read content of each file
        with open(local, "r") as f:
            contents = f.read()
            soup = BeautifulSoup(contents, "lxml")
            # find links in each file that contain external links and replace them with internal
            for internal, ext in zip(files, external_links):
                #print ext
                for a in soup.find_all("a", href=re.compile(ext)):
                    #print a
                    print(a['href']) 
                    print(internal)
                    #------------------------The problem is here
                    a['href'] = internal #a[href] is found, internal also exists as a value, but for some reason internal doesn't get assigned to a[href]
                    #___________________________________________


if __name__ == "__main__":
    replace_links("C:\\Transition\\")

0 个答案:

没有答案