我编写了一个简单的脚本:它应该请求给定地址(在 sitedefs.txt 中定义)的根站点地图,解析 XML,然后下载其中列出的所有子站点地图,并将其保存为 gzip 压缩文件。其中一些站点地图本身已经是 gzip 压缩文件,这意味着我必须先下载这些文件,再将其移动到相应的目录。但是,Selenium 在尝试下载这些文件时表现得很奇怪:它有时可以成功下载文件,但在大多数情况下,它似乎只下载了文件的一部分(文件一直保持为 .crdownload,内容是不完整的垃圾数据——没有 EOF),或者似乎根本没有创建任何下载文件。脚本运行在一台资源充足的 Ubuntu 远程服务器上。
import time
import gzip
import lxml.etree as ET
import os
from selenium import webdriver
from pyvirtualdisplay import Display
from selenium.webdriver.chrome.options import Options
# Directory that Chrome is told to save downloads into; consume_dlfile()
# later looks for finished files in this same location.
GZIP_LOC = "/home/ubuntu/Downloads"

# Chrome preferences: save to GZIP_LOC without showing a "save as" prompt.
options = Options()
options.add_experimental_option(
    "prefs",
    {
        "download.default_directory": GZIP_LOC,
        "download.prompt_for_download": False,
        "download.directory_upgrade": True,
        "safebrowsing.enabled": True,
    },
)

# Start a virtual display (visible=0, 800x600) at import time, i.e. before
# the Chrome driver is created in the __main__ section.
display = Display(visible=0, size=(800, 600))
display.start()
def get_sitemap_list(root, list):
    """Collect the URLs of every ``<loc>`` element under *root*.

    The tree is walked in document order (``root`` included) and the stripped
    text of each element whose local name is exactly ``loc`` is appended to
    *list*.

    Args:
        root: Parsed XML element (lxml or ElementTree compatible).
        list: Output list, extended in place. The parameter name shadows the
            builtin but is kept so existing callers keep working.

    Returns:
        None. Results are delivered through *list*.
    """
    for element in root.iter():
        tag = element.tag
        # Comments and processing instructions carry a non-string tag; a
        # substring test on them would raise TypeError.
        if not isinstance(tag, str):
            continue
        # Compare the local name only, so that a namespace URI which happens
        # to contain "loc" cannot make unrelated elements match.
        if tag.rpartition("}")[2] != "loc":
            continue
        # Pretty-printed sitemaps pad the URL with whitespace/newlines, and an
        # empty <loc/> has text None; neither is usable as a URL.
        text = (element.text or "").strip()
        if text:
            list.append(text)
def get_content_from_webdriver(wbdriver, url):
    """Navigate *wbdriver* to *url* and return the resulting page source.

    Args:
        wbdriver: Driver object exposing ``get(url)`` and ``page_source``.
        url: Address to load.

    Returns:
        Whatever ``wbdriver.page_source`` holds after the navigation call.
    """
    wbdriver.get(url)
    page_markup = wbdriver.page_source
    return page_markup
def consume_dlfile(name, dest, wbdriver=None, url=None, *, timeout=200,
                   poll_interval=1, src_dir=None):
    """Wait for a downloaded file to appear, then move it to *dest*.

    Args:
        name: File name expected inside the download directory.
        dest: Path the file is renamed to once it exists.
        wbdriver: Optional driver used only for diagnostics on timeout.
        url: Optional URL; on timeout, if both *wbdriver* and *url* are given,
            the URL is requested and its page source printed.
        timeout: Seconds to wait for the file before giving up.
        poll_interval: Seconds to sleep between existence checks.
        src_dir: Directory to watch; defaults to the module-level GZIP_LOC.

    Raises:
        SystemExit: With exit status 1 when the file does not show up within
            *timeout* seconds, so the failure is visible to the calling shell.
    """
    download_dir = GZIP_LOC if src_dir is None else src_dir
    file_loc = os.path.join(download_dir, name)
    # Monotonic clock: the deadline is unaffected by wall-clock adjustments.
    deadline = time.monotonic() + timeout
    while not os.path.exists(file_loc):
        if time.monotonic() >= deadline:
            print("DOWNLOAD TIMED OUT @", file_loc)
            if wbdriver and url:
                # Show what the server answered to help diagnose the failure.
                print(get_content_from_webdriver(wbdriver, url))
            # Raise directly instead of calling the interactive exit() helper,
            # and use a non-zero status because this is a failure.
            raise SystemExit(1)
        time.sleep(poll_interval)
    os.rename(file_loc, dest)
def _archive_site(driver, url, out_dir):
    """Fetch the root sitemap at *url* and store each sub-sitemap in *out_dir*."""
    os.makedirs(out_dir, exist_ok=True)

    # For a gzipped root this request is what starts the browser download.
    content = get_content_from_webdriver(driver, url)
    if url.endswith(".gz"):
        root_path = os.path.join(out_dir, "rootf.xml.gz")
        consume_dlfile(url.split("/")[-1], root_path)
        with gzip.open(root_path, "rb") as gfile:
            textl = gfile.read()
    else:
        textl = content

    # lxml refuses str input that carries an XML encoding declaration, so
    # always hand it bytes.
    if isinstance(textl, str):
        textl = textl.encode("utf-8")
    root = ET.fromstring(textl)

    found = []
    get_sitemap_list(root, found)
    # Drop empty entries and surrounding whitespace so the suffix test and the
    # derived file name below are reliable.
    sitemaps = [entry.strip() for entry in found if entry and entry.strip()]

    for i, sitemap in enumerate(sitemaps, start=1):
        print(sitemap)
        path = os.path.join(out_dir, str(i) + ".xml.gz")
        if sitemap.endswith(".gz"):
            # Already compressed: request it so the browser starts the
            # download, then wait for the file named after THIS sitemap (not
            # the root URL) and move it into the site's directory.
            driver.get(sitemap)
            download = sitemap.split("/")[-1]
            consume_dlfile(download, path, wbdriver=driver, url=sitemap)
        else:
            with gzip.open(path, 'wb') as f:
                f.write(get_content_from_webdriver(driver, sitemap).encode('utf-8'))


def main(driver):
    """Archive the sitemaps of every site listed in ``sitedefs.txt``.

    Each non-comment line has the form ``origin,root_sitemap_url,directory``.
    Lines starting with ``#`` and blank lines are ignored; lines with fewer
    than three comma-separated fields are reported and skipped. For every
    remaining line the root sitemap is fetched and each sub-sitemap it lists
    is written to ``directory/<n>.xml.gz``.

    Args:
        driver: Selenium driver used for all page requests and downloads.
    """
    with open("sitedefs.txt") as sdf:
        for line in sdf:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            fields = line.split(",")
            if len(fields) < 3:
                print("SKIPPING MALFORMED LINE:", line)
                continue
            # fields[0] (the origin site) is not needed for the download.
            url = fields[1].strip()
            file_name = fields[2].strip()
            print(url)
            _archive_site(driver, url, file_name)
if __name__ == '__main__':
    # `options=` is the supported keyword; `chrome_options=` is deprecated
    # and rejected by current Selenium releases.
    driver = webdriver.Chrome(options=options)
    try:
        main(driver)
    finally:
        # quit() ends the whole browser session and its chromedriver process;
        # close() would only close the current window.
        driver.quit()
sitedefs.txt 是一个文本文件,每一行的格式如下(来源站点,根站点地图 URL,输出目录名;以 # 开头的行会被忽略):
https://www.shutterstock.com/,https://cdn.shutterstock.com/sitemaps/video/sitemap/sitemap-video-index.xml,shutterstock
不幸的是,由于种种原因,我们不得不使用 Selenium 来发起请求。一些网站虽然明确提供了站点地图,但似乎对脚本化的请求(例如通过 requests 库发出的请求)设有防护——即使提供了正确的请求头(headers)也是如此。