Python 3 requests module: .iter_content is not downloading files correctly

Date: 2018-01-23 11:16:04

Tags: python download request python-requests

I am currently working on a project as a silly gift for my girlfriend: a program that automatically downloads pictures of puppies, kittens and other animals. (By simply changing a few things it will also be able to download .pdf files, books, movies or anything else, so there are serious goals behind it...)

The program mostly works: it searches for images on a series of websites stored in a list and downloads one of them at random. The problem is that some pictures are not downloaded completely: they are much smaller than the correctly downloaded ones, and they show no image when opened.

Why do some pictures download fine while others, although the file is there, are far too small and display nothing when opened? Where did I go wrong?
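
One quick way to narrow this kind of problem down is to compare the Content-Length header with the number of bytes that actually land on disk. A minimal diagnostic sketch, assuming the server sends a Content-Length header (the URL is just a placeholder):

# Diagnostic sketch: compare the advertised Content-Length with the
# number of bytes actually written. Assumes the server sends a
# Content-Length header; the URL below is a placeholder.
import os, requests

url = 'https://example.com/puppy.jpg' # hypothetical image URL
response = requests.get(url, stream=True)
response.raise_for_status()

with open('check.jpg', 'wb') as f:
    for chunk in response.iter_content(chunk_size=1024):
        f.write(chunk)

expected = response.headers.get('Content-Length')
actual = os.path.getsize('check.jpg')
if expected is not None and int(expected) != actual:
    print('Truncated: expected {} bytes, got {}'.format(expected, actual))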

I would really like to fix this. I am new to Python, so any other advice is very welcome (mainly anything that would reduce the mess of code I wrote in the look_images(x, z) function, with all those try/except blocks and while loops...). Thanks! Here is the script:

#! python3

import os, requests, bs4, random, shelve, wget

def create_folder(): #Creates the folder 'Puppies'
    print('Comprobando pelusitas y pulguitas...')
    os.makedirs('Puppies', exist_ok=True)

def download_image(url, request_response): #Saves the image in the folder 'Puppies'
    image_file = open(os.path.join('Puppies', os.path.basename(url)), 'wb')
    for chunk in request_response.iter_content(chunk_size=1024):
        image_file.write(chunk)
        image_file.flush()
    image_file.close()

def create_saves(saves_list): #Creates a data file, if not yet created, storing the pictures already downloaded.
    if os.path.isfile('C:\\Users\\usuario\\Documents\\santi\\saves.dat') == False:
        page_files = shelve.open('saves')
        page_files['saves'] = saves_list
        page_files.close()

def look_images(pages_list, saves_list):


    for i in pages_list: #For every item in the pages_list (for every page...)
        res = requests.get(i)
        res.raise_for_status()
        soup = bs4.BeautifulSoup(res.text, "html.parser")
        object = soup.select('img')
        object_number = len(object)
        random_number = random.randint(0, int(object_number)) #Creates a random number between 0 and the number of images the object variable found.

        print('\nSe encontraron {} cachorritos potenciales...'.format(int(object_number)))
        try:
            image_url = object[random_number].get('srcset')
        except IndexError:
            print('No se hallaron vauitos, se continuará la búsqueda...')
        print('\nEvaluando colmillitos y patitas...')
        page_files = shelve.open('saves') #Opens the saves data file.
        while str(image_url) in open('saves.dat', encoding='Latin-1').read(): #While the image found is stored in the saves.dat file, select another image.
            try:
                image_url = object[random.randint(0, int(object_number))].get('src')
            except IndexError:
                continue
        while not '.jpg' in str(image_url):
            try:
                image_url = object[random.randint(0, int(object_number))].get('src')
                if str(image_url) in open('saves.dat', encoding='Latin-1').read():
                    image_url = object[random.randint(0, int(object_number))].get('src')
                    continue
            except IndexError:
                    print('No se hallaron vauitos, se continuará la búsqueda...')
                    continue
        print('\nSe encontraron vauitos...')
        print('\nAdoptando cachorrito...')
        if str(image_url).endswith('.jpg 2x'): #Lot of images were downloaded as '.jpg 2x', so I made this if statement to erase the ' 2x' final part.
            image_url = str(image_url.replace(' ', '')[:-2])
        response = requests.get(image_url, stream=True)
        res.raise_for_status() # BUG: should be response.raise_for_status() (see the resolution below)
        saves_list.append(image_url) #Adds the image to the save_list, which is then saved on the .dat 'saves' file.

        download_image(image_url, response)
        print('¡Cachorrito adoptado!')

        page_files[image_url] = page_files #Saves the image url on the 'saves'.dat file
        page_files.close()


def get_page():

    page = ['https://pixabay.com/es/photos/puppy/',
    'https://www.petsworld.in/blog/cute-pictures-of-puppies-and-kittens-together.html',
    'https://pixabay.com/es/photos/bear%20cubs/',
    'https://pixabay.com/es/photos/?q=cute+little+animals&hp=&image_type=photo&order=popular&cat=',
    'https://pixabay.com/es/photos/?q=baby+cows&hp=&image_type=photo&order=popular&cat=',
    'https://www.boredpanda.com/cute-baby-animals/',
    'http://abduzeedo.com/node/74367']
    alr_dow = []

    create_folder()
    create_saves(alr_dow)
    look_images(page, alr_dow)

get_page()
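
As an aside on the '.jpg 2x' values handled in look_images: they come from the srcset attribute, whose value is a comma-separated list of URL/descriptor pairs (for example 'small.jpg 1x, large.jpg 2x'). A small sketch of parsing such a value directly instead of trimming trailing characters (the helper name is just illustrative, not part of the script above):

def first_srcset_url(srcset):
    # srcset looks like "small.jpg 1x, large.jpg 2x": split the
    # comma-separated candidates, then split off the descriptor.
    first_candidate = srcset.split(',')[0].strip()
    return first_candidate.split()[0]

print(first_srcset_url('puppy-small.jpg 1x, puppy-large.jpg 2x')) # puppy-small.jpg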

P.S. The printed messages are in Spanish and are irrelevant to the problem. Note that the goal is to eventually turn this into a serious program that downloads serious things.

The problem has been solved. Apparently something went wrong when requesting the URLs of certain pages' images. I had not noticed the error because of a mistake of my own at the end of the look_images function: 'res.raise_for_status()' should have been 'response.raise_for_status()', since 'response' is the variable that stores the request for the picture's URL. Once corrected, I noticed that a request error was raised on the second and last pages of the pages list: the files from those sites were not downloading correctly due to errors apparently related to some port and firewall issues. Here is the code as it stands now, working properly and also cleaner than the first version:

#! python3

import os, requests, bs4, random, shelve, shutil

def create_folder():
    print('Comprobando pelusitas y pulguitas...')
    os.makedirs('Puppies', exist_ok=True)

def download_image_shutil(url, request_response):

    local_filename = url.split('/')[-1]
    with open(os.path.join('Puppies', local_filename), 'wb') as f:
        shutil.copyfileobj(request_response.raw, f) # Copy the raw response stream straight to the file.

    return local_filename

def create_saves(saves_list):
    if not os.path.isfile('C:\\Users\\usuario\\Documents\\santi\\saves.dat'):
        page_files = shelve.open('saves')
        page_files['saves'] = saves_list
        page_files.close()

def get_random_url(web_object, web_object_number):

    while True:
        try:
            random_number = random.randint(0, web_object_number - 1) # randint is inclusive on both ends, so subtract 1 to get a valid index.
            url_variable = web_object[random_number].get('src')
            if '.jpg' not in str(url_variable):
                continue
            elif str(url_variable) in open('saves.dat', encoding='Latin-1').read():
                continue
            print(url_variable)
            return url_variable
        except IndexError:
            print('Algo salió mal: reanudando búsqueda...')
            continue

def look_images(pages_list, saves_list):


    for i in pages_list:
        res = requests.get(i)
        res.raise_for_status()
        soup = bs4.BeautifulSoup(res.text, "html.parser")
        object = soup.select('img')
        object_number = len(object)

        print('\nSe encontraron {} cachorritos potenciales...'.format(int(object_number)))
        print('\nEvaluando colmillitos y patitas...')
        page_files = shelve.open('saves')
        image_url = get_random_url(object, object_number)
        print('\nSe encontraron vauitos...')
        print('\nAdoptando cachorrito...')
        if str(image_url).endswith('.jpg 2x'): # srcset candidates carry a trailing descriptor such as ' 2x'; strip it.
            image_url = str(image_url.replace(' ', '')[:-2])
        if str(image_url).startswith('//'): # Protocol-relative URL: prepend only the scheme.
            image_url = 'https:' + str(image_url)
        elif not str(image_url).startswith('https://'):
            image_url = 'https://' + str(image_url)
        response = requests.get(image_url, stream=True)
        response.raise_for_status()
        saves_list.append(image_url)

        download_image_shutil(image_url, response)
        print('¡Cachorrito adoptado!')

        page_files[image_url] = image_url # Record the image URL in the 'saves' shelf; storing the shelf object itself as the value would fail to pickle.
        page_files.close()


def get_page():

    page = ['https://pixabay.com/es/photos/puppy/',
    'https://www.petsworld.in/blog/cute-pictures-of-puppies-and-kittens-together.html',
    'https://pixabay.com/es/photos/bear%20cubs/',
    'https://pixabay.com/es/photos/?q=cute+little+animals&hp=&image_type=photo&order=popular&cat=',
    'https://pixabay.com/es/photos/?q=baby+cows&hp=&image_type=photo&order=popular&cat=',
    'https://www.boredpanda.com/cute-baby-animals/']
    alr_dow = []

    create_folder()
    create_saves(alr_dow)
    look_images(page, alr_dow)

get_page()
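
One known gotcha with copying response.raw via shutil.copyfileobj: requests does not decode gzip or deflate transfer encodings on the raw stream, so a compressed response would be written to disk as-is. A variant sketch of download_image_shutil that forces decoding first (a sketch of the same idea, not a confirmed drop-in change):

import shutil, requests

def download_image_decoded(url, path): # Illustrative variant of download_image_shutil.
    response = requests.get(url, stream=True)
    response.raise_for_status()
    response.raw.decode_content = True # Decompress gzip/deflate bodies before copying.
    with open(path, 'wb') as f:
        shutil.copyfileobj(response.raw, f)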

I converted some variables to strings (with the str(x) function) because otherwise I got errors saying that NoneType is not iterable on those variables. I don't understand where that comes from; if anyone can tell me, that would be great.
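
For what it's worth, that error most likely comes from Tag.get(): it returns None when the attribute is missing (for example an <img> that only has srcset and no src), and the `in` operator cannot search None. A tiny sketch reproducing it:

import bs4

soup = bs4.BeautifulSoup('<img srcset="a.jpg 1x">', 'html.parser')
tag = soup.select('img')[0]
src = tag.get('src') # None: this <img> has no src attribute.
print('.jpg' in str(src)) # str(None) == 'None', so this is simply False.
# '.jpg' in src # would raise: argument of type 'NoneType' is not iterable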

My thanks go to bruno desthuilliers, who gave me a lot of advice and had the patience to do so.

0 Answers:

No answers