I built a web crawler in Python 2.7 and I'm trying to download some comics as JPEGs. Everything works until I look at the downloaded images: when I open one, I get a message saying the image is damaged or too large, even though it's only about 100 kB. All the links have been checked and are correct, and so are all the paths. I can see that the folders and files get created, but when I open a jpg there is nothing but the error message.
Here is my code:
import requests
from bs4 import BeautifulSoup
import os
import urllib


def manga_crawl(from_manga, to_manga):
    manga = from_manga
    url = 'https://www.mangareader.net/one-piece/'
    while manga <= to_manga:
        url = url + str(manga) + '/'
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, "html.parser")
        #print "URL-> " + url
        path_name = create_folder(manga)
        #print "FOLDER-> " + path_name
        pages = find_manga_pages(soup)
        #print "PAGES-> " + pages
        download_jpg(pages, url, path_name)
        manga = manga + 1
        url = 'https://www.mangareader.net/one-piece/'


def create_folder(manga):
    pathname = 'one-piece-' + str(manga)
    os.makedirs(pathname)
    return pathname


def find_manga_pages(soup):
    for opt in soup.find_all('option'):
        counter = opt.text
    return counter


def download_jpg(pages, url, path_name):
    page = 1
    while page <= int(pages):
        thisurl = url + str(page)
        #print "THIS URL->" + str(thisurl)
        source_code = requests.get(thisurl)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, "html.parser")
        urlsoup = soup.find('img', {'id': 'img'})
        iconurl = str(urlsoup['src'])
        this_path_name = path_name + '/' + str(page) + '.jpg'
        print "ICON URL->" + iconurl
        urllib.urlretrieve(iconurl, this_path_name)
        page = page + 1


def main():
    x = raw_input()
    y = raw_input()
    manga_crawl(int(x), int(y))


if __name__ == "__main__":
    main()
Any suggestions?
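In case it helps, one sanity check I can run is to peek at the first bytes of one of the saved files: a real JPEG starts with the marker bytes FF D8, while an HTML error page would start with text like <!DOCTYPE. This is just a quick sketch, assuming chapter 1, page 1 was crawled (the path is taken from the naming in my code above):

# Peek at the first bytes of a downloaded file to see whether it is
# really a JPEG (starts with '\xff\xd8') or an HTML error page.
with open('one-piece-1/1.jpg', 'rb') as f:
    print repr(f.read(16))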
Answer 0 (score: 1)
Well, I changed this line:
urllib.urlretrieve(iconurl, this_path_name)
to these (note that shutil has to be imported for copyfileobj):
import shutil

response = requests.get(iconurl, stream=True)
with open(this_path_name, 'wb') as out_file:
    shutil.copyfileobj(response.raw, out_file)
del response
Works great!
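For context, here is what the whole download function looks like with that change folded in. This is just a minimal sketch that assumes the rest of the crawler (the lookup of the img tag, the folder naming) stays exactly as in the question:

import shutil

import requests
from bs4 import BeautifulSoup


def download_jpg(pages, url, path_name):
    page = 1
    while page <= int(pages):
        thisurl = url + str(page)
        source_code = requests.get(thisurl)
        soup = BeautifulSoup(source_code.text, "html.parser")
        # Same scrape as in the question: the main image has id="img".
        iconurl = str(soup.find('img', {'id': 'img'})['src'])
        this_path_name = path_name + '/' + str(page) + '.jpg'
        # Stream the image bytes with requests and copy them straight
        # to disk, instead of going through urllib.urlretrieve.
        response = requests.get(iconurl, stream=True)
        with open(this_path_name, 'wb') as out_file:
            shutil.copyfileobj(response.raw, out_file)
        del response
        page = page + 1

With stream=True the body isn't read into memory up front; copyfileobj pulls it from response.raw in chunks and writes it to the file, and dropping the response afterwards lets the connection be released.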