我需要修改HTML文件中链接的href值,但由于某种原因,修改没有生效。下面是我想要实现的那一行代码:
a['href'] = internal
这是脚本的描述
此脚本先列出文件夹中所有的.html文件,然后根据每个文件的本地路径反向推导出它对应的Web URL(我还有另一个脚本负责下载网页并创建本地版本)。接着逐个打开每个文件,找到指向这些外部URL的链接,并将其替换为对应的本地路径。
import re
import os
from bs4 import BeautifulSoup
def replace_links(destination):
    """Rewrite external links in every .html file under *destination*.

    Walks *destination* recursively, collects the path of each ``.html``
    file, derives the web URL each file is assumed to mirror (via
    ``reverse_eng_url``), and hands both lists to ``html`` so matching
    links can be pointed at the local copies.

    Args:
        destination: Root folder containing the downloaded pages. It is
            also passed to ``reverse_eng_url`` together with each path.
    """
    files = []
    for root, _dirs, names in os.walk(destination):
        for name in names:
            # Check the extension rather than a substring so files such as
            # "page.html.bak" are not treated as HTML pages.
            if name.endswith('.html'):
                files.append(os.path.join(root, name))

    # external_links[i] is the URL derived from files[i]; html() relies on
    # the two lists being in the same order.
    external_links = [reverse_eng_url(path, destination) for path in files]
    html(files, external_links)
def reverse_eng_url(origin_url, destination):
    """Derive the web URL that a local file path corresponds to.

    The leading *destination* folder is removed, ``-`` and ``\\`` are
    turned into ``/``, and a trailing ``.html`` extension becomes
    ``.aspx``.

    Args:
        origin_url: Path of the local .html file.
        destination: Root folder prefix to strip from *origin_url*.

    Returns:
        The reconstructed URL string.
    """
    relative = origin_url
    # Strip the folder only when it is the leading prefix, so the same text
    # appearing later in the path is left alone.
    if destination and relative.startswith(destination):
        relative = relative[len(destination):]

    relative = relative.replace("-", "/").replace("\\", "/")

    # Swap only the file extension; a blanket replace of "html" would also
    # corrupt directory or file names that merely contain that substring.
    suffix = ".html"
    if relative.endswith(suffix):
        relative = relative[:-len(suffix)] + ".aspx"
    return relative
def html(files, external_links):
    """Replace external link targets with local paths in each file.

    For every file in *files*, each ``<a>`` tag whose ``href`` contains one
    of *external_links* gets its ``href`` set to the local path at the same
    position in *files*. A file is rewritten on disk only if at least one
    link in it was changed.

    Args:
        files: Paths of the local HTML files to process.
        external_links: URLs paired positionally with *files*.
    """
    # Build the patterns once instead of once per file. re.escape makes
    # characters such as "." in a URL match literally, not as regex syntax.
    replacements = [
        (internal, re.compile(re.escape(ext)))
        for internal, ext in zip(files, external_links)
    ]

    for local in files:
        # Read and parse the current file.
        with open(local, "r") as f:
            contents = f.read()
        soup = BeautifulSoup(contents, "lxml")

        changed = False
        for internal, pattern in replacements:
            for a in soup.find_all("a", href=pattern):
                print(a['href'])
                print(internal)
                a['href'] = internal
                changed = True

        # Assigning to a['href'] only updates the in-memory tree; the
        # document has to be serialized back to disk for the change to stick.
        if changed:
            with open(local, "w") as f:
                f.write(str(soup))
if __name__ == "__main__":
    import sys

    # Allow the target folder to be given as the first command-line
    # argument; fall back to the original hard-coded location otherwise.
    replace_links(sys.argv[1] if len(sys.argv) > 1 else "C:\\Transition\\")