我正在用python编写网站克隆程序,对于大多数文件来说也不错,但是我发现在获取背景图片的网址方面遇到了挑战,例如
<div style="background-image: url(images/banner.jpg)" >
该脚本将背景图像检测为文件夹,并假定url为'background_image:url(images / banner.jpg'。如何设置它以获取实际的URL。
Python 2.7
import urllib2
import sys
import socket
import os
import re
socket.setdefaulttimeout(15)
dataTypesToDownload = [".jpg", ".jpeg", ".png", ".gif", ".ico", ".css", ".js", ".html"]
url = 'http://example.com/'
pathbase = 'theme'
if "http://" not in url and "https://" not in url:
url = "http://"+url
try:
os.mkdir(pathbase)
except OSError:
pass
file = open(pathbase + "/index.html", "w")
try:
content = urllib2.urlopen(url).read()
except urllib2.URLError as e:
print "An error occured: " + str(e.reason)
exit()
resources = re.split("=\"|='", content)
first = False
for resource in resources:
if first == False:
first = True
continue
resource = re.split("\"|'", resource)[0]
if any(s in resource for s in dataTypesToDownload):
print "Downloading " + resource
try:
path = resource.split("/")
if len(path) != 1:
path.pop(len(path) - 1)
trail = "./" + pathbase + "/"
for folder in path:
trail += folder+"/"
try:
os.mkdir(trail)
except OSError:
pass
except IOError:
pass
try:
if "?" in resource:
download = open(pathbase + "/"+resource.split("?")[len(resource.split("?")) - 2], "w")
else:
download = open(pathbase + "/"+resource, "w")
print url+"/"+resource
dContent = urllib2.urlopen(url+"/"+resource).read()
except urllib2.URLError as e:
print "An error occured: " + str(e.reason)
download.close()
continue
except IOError:
pass
continue
download.write(dContent)
download.close()
print "Downloaded!"
file.write(content)
file.close()
我希望它遇到style="background-image: url(images/banner.jpg)
,
它将资源设置为images / banner.jpg。但这会将资源设置为background-image: url(images/images.jpg