我目前正在以一个愚蠢的礼物给我女朋友的项目工作:一个自动下载小狗和小猫以及其他动物图片的程序。 (通过简单地改变一些东西,它将能够下载.pdf,书籍,电影或其他任何东西,所以这是严肃的目标......)。
程序正常运行:它在列表中存储的一系列网站上搜索图像,并随机自动下载。问题是某些图片没有完全下载:它们的大小比正确下载的图片小得多,并且在打开时不会显示任何图像。
为什么有些图片下载得很好,有些图片虽然文件在那里,但是尺寸太小而且在打开时没有显示图像?我在哪里弄错了?
我真的想纠正这个。我是Python的新手,所以任何其他的建议都非常受欢迎(主要是因为它们与减少我在look_images(x,z)函数上编写的一堆杂乱的代码有关,所有这些尝试,除了和while循环代码和东西...)。谢谢!这是脚本:
#! python3
import os, requests, bs4, random, shelve, wget
def create_folder(): #Creates the folder 'Puppies'
print('Comprobando pelusitas y pulguitas...')
os.makedirs('Puppies', exist_ok=True)
def download_image(url, request_response): #Saves the image in the folder 'Puppies'
image_file = open(os.path.join('Puppies', os.path.basename(url)), 'wb')
for chunk in request_response.iter_content(chunk_size=1024):
image_file.write(chunk)
image_file.flush()
image_file.close()
def create_saves(saves_list): #Creates a data file, if not yet created, storing the pictures already downloaded.
if os.path.isfile('C:\\Users\\usuario\\Documents\\santi\\saves.dat') == False:
page_files = shelve.open('saves')
page_files['saves'] = saves_list
page_files.close()
def look_images(pages_list, saves_list):
for i in pages_list: #For every item in the pages_list (for every page...)
res = requests.get(i)
res.raise_for_status()
soup = bs4.BeautifulSoup(res.text, "html.parser")
object = soup.select('img')
object_number = len(object)
random_number = random.randint(0, int(object_number)) #Creates a random number between 0 and the number of images the object variable found.
print('\nSe encontraron {} cachorritos potenciales...'.format(int(object_number)))
try:
image_url = object[random_number].get('srcset')
except IndexError:
print('No se hallaron vauitos, se continuará la búsqueda...')
print('\nEvaluando colmillitos y patitas...')
page_files = shelve.open('saves') #Opens the saves data file.
while str(image_url) in open('saves.dat', encoding='Latin-1').read(): #While the image found is stored in the saves.dat file, select another image.
try:
image_url = object[random.randint(0, int(object_number))].get('src')
except IndexError:
continue
while not '.jpg' in str(image_url):
try:
image_url = object[random.randint(0, int(object_number))].get('src')
if str(image_url) in open('saves.dat', encoding='Latin-1').read():
image_url = object[random.randint(0, int(object_number))].get('src')
continue
except IndexError:
print('No se hallaron vauitos, se continuará la búsqueda...')
continue
print('\nSe encontraron vauitos...')
print('\nAdoptando cachorrito...')
if str(image_url).endswith('.jpg 2x'): #Lot of images were downloaded as '.jpg 2x', so I made this if statement to erase the ' 2x' final part.
image_url = str(image_url.replace(' ', '')[:-2])
response = requests.get(image_url, stream=True)
res.raise_for_status()
saves_list.append(image_url) #Adds the image to the save_list, which is then saved on the .dat 'saves' file.
download_image(image_url, response)
print('¡Cachorrito adoptado!')
page_files[image_url] = page_files #Saves the image url on the 'saves'.dat file
page_files.close()
def get_page():
page = ['https://pixabay.com/es/photos/puppy/',
'https://www.petsworld.in/blog/cute-pictures-of-puppies-and-kittens-together.html',
'https://pixabay.com/es/photos/bear%20cubs/',
'https://pixabay.com/es/photos/?q=cute+little+animals&hp=&image_type=photo&order=popular&cat=',
'https://pixabay.com/es/photos/?q=baby+cows&hp=&image_type=photo&order=popular&cat=',
'https://www.boredpanda.com/cute-baby-animals/',
'http://abduzeedo.com/node/74367']
alr_dow = []
create_folder()
create_saves(alr_dow)
look_images(page, alr_dow)
get_page()
P.S。印刷的信息是西班牙语,与问题无关。请注意,目标是在某个时候将此作为一个严肃的程序,下载严肃的事情。
问题已经解决了。显然,在请求某些页面图像的URL时出现了问题。我没有注意到这个错误,因为我的错误,在look_images函数的末尾:“res.raise_for_status()”应该是“response.raise_for_status()”,因为“response”是存储的变量图片网址的请求。如果更正了,我注意到在页面列表的第二页和最后一页上发出了一个请求错误:由于显然与某些端口和防火墙问题有关的错误,这些站点中的文件未正确下载。这是现在的代码,功能正常,也比第一个更干净:
#! python3
import os, requests, bs4, random, shelve, shutil
def create_folder():
print('Comprobando pelusitas y pulguitas...')
os.makedirs('Puppies', exist_ok=True)
def download_image_shutil(url, request_response):
local_filename = url.split('/')[-1]
with open(os.path.join('Puppies', local_filename), 'wb') as f:
shutil.copyfileobj(request_response.raw, f, [False])
return local_filename
def create_saves(saves_list):
if os.path.isfile('C:\\Users\\usuario\\Documents\\santi\\saves.dat') == False:
page_files = shelve.open('saves')
page_files['saves'] = saves_list
page_files.close()
def get_random_url(web_object, web_object_number):
while True:
try:
random_number = random.randint(0, web_object_number)
url_variable = web_object[random_number].get('src')
if not '.jpg' in str(url_variable):
continue
elif str(url_variable) in open('saves.dat', encoding='Latin-1').read():
continue
print('Finish')
print(url_variable)
return url_variable
break
except IndexError:
print('Algo salió mal: reanundando búsqueda...')
continue
def look_images(pages_list, saves_list):
for i in pages_list:
res = requests.get(i)
res.raise_for_status()
soup = bs4.BeautifulSoup(res.text, "html.parser")
object = soup.select('img')
object_number = len(object)
print('\nSe encontraron {} cachorritos potenciales...'.format(int(object_number)))
print('\nEvaluando colmillitos y patitas...')
page_files = shelve.open('saves')
image_url = get_random_url(object, object_number)
print('\nSe encontraron vauitos...')
print('\nAdoptando cachorrito...')
if str(image_url).endswith('.jpg 2x'):
image_url = str(image_url.replace(' ', '')[:-2])
if not str(image_url).startswith('https://'):
image_url = 'https://' + str(image_url)
response = requests.get(image_url, stream=True)
response.raise_for_status()
saves_list.append(image_url)
download_image_shutil(image_url, response)
print('¡Cachorrito adoptado!')
page_files[image_url] = page_files
page_files.close()
def get_page():
page = ['https://pixabay.com/es/photos/puppy/',
'https://www.petsworld.in/blog/cute-pictures-of-puppies-and-kittens-together.html',
'https://pixabay.com/es/photos/bear%20cubs/',
'https://pixabay.com/es/photos/?q=cute+little+animals&hp=&image_type=photo&order=popular&cat=',
'https://pixabay.com/es/photos/?q=baby+cows&hp=&image_type=photo&order=popular&cat=',
'https://www.boredpanda.com/cute-baby-animals/']
alr_dow = []
create_folder()
create_saves(alr_dow)
look_images(page, alr_dow)
get_page()
我将一些变量转换为字符串(使用str(x)函数),因为如果没有错误说明:变量上出现NoneType不可迭代。我不明白它来自哪里;如果有人能告诉我,那就太好了。
我对bruno desthuilliers表示感谢,他非常劝我并且有耐心这样做。