# Question's script: fetch a page, find its <img> tags, download a file per
# tag and create a `Content` record for it.
# NOTE(review): indentation was lost in this listing; presumably every line
# after the `for` statement belongs to the loop body -- confirm.
# NOTE(review): `urllib.request`, `ContentType` and `Content` are used below
# but not imported in this listing; presumably imported elsewhere.
url = 'http://www.wired.com/category/science/'
# Send a browser-like User-Agent header with the page request.
req = urllib.request.Request(url, data=None,
headers={
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
}
)
f = urllib.request.urlopen(req)
html = f.read()
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')
# `title_tag` is not used again in the visible code.
title_tag = soup.title
for link in soup.find_all('img'):
alt_data =link.get('alt')
# Use a placeholder when the tag has no (or an empty) alt attribute.
if not alt_data:
alt_data= 'no alt text'
# `URL` (upper case) is the image's src attribute; `url` (lower case) is the page.
URL=link.get('src')
import uuid
# `random_value` is not used again in the visible code.
random_value = uuid.uuid1()
# Local file name: the text after the last '/' of the src attribute.
IMAGE = URL.rsplit('/', 1)[1]
# NOTE(review): this retrieves the page address `url`, not the image address
# `URL`, so the file written under the image name holds the page's HTML.
content =urllib.request.urlretrieve(url, IMAGE)
from django.core.files import File
cntent_typ=ContentType.objects.get(name='image')
# content[0] is the local file name returned by urlretrieve.
# NOTE(review): open() is called without a mode, i.e. text mode; an image
# presumably needs 'rb' -- confirm. `url=url` stores the page address.
obj = Content.objects.create(content_type=cntent_typ, url=url, title=alt_data, text=alt_data, image=File(open(content[0])))
# NOTE(review): objects.create() presumably already saves the row, which
# would make this save() redundant -- confirm.
obj.save()
当我尝试手动打开保存的图像文件时,显示:“无法加载图片 xcxxxcx.jpg:解释 JPEG 图像文件时出错(不是 JPEG 文件:以 0x3c 0x21 开头)”。我该如何解决这个问题?
答案 0 :(得分:0)
以下是您的代码的可运行示例(不含保存图像的部分)。我所做的改动只有我在评论中提到的那一处,即使用 urllib.parse.urljoin(url, URL);而您在评论中提到的错误,是因为您的网址中包含一些 Unicode 字符,所以我使用 django.utils.encoding 中的 iri_to_uri 对网址进行了正确编码。
# Download every <img> referenced by the page at `url` into the current
# working directory, one local file per image.
import posixpath
import urllib.error
import urllib.request
import uuid
from urllib.parse import urljoin, urlsplit

from bs4 import BeautifulSoup
from django.utils.encoding import iri_to_uri

url = 'http://www.wired.com/category/science/'
# Send a browser-like User-Agent header with the page request.
req = urllib.request.Request(
    url,
    data=None,
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
    },
)
# Close the HTTP response as soon as the body has been read.
with urllib.request.urlopen(req) as f:
    html = f.read()

soup = BeautifulSoup(html, 'html.parser')
for link in soup.find_all('img'):
    # Use a placeholder when the tag has no (or an empty) alt attribute.
    alt_data = link.get('alt') or 'no alt text'

    URL = link.get('src')
    if not URL:
        # An <img> without a src attribute has nothing to download.
        continue

    # Percent-encode non-ASCII characters, then resolve the (possibly
    # relative) src against the page address to get the image address.
    image_url = urljoin(url, iri_to_uri(URL))
    if urlsplit(image_url).scheme not in ('http', 'https'):
        # Skip inline "data:" images and any other non-HTTP source.
        continue

    # Name the local file after the last path segment only, so a query
    # string never ends up in the file name; fall back to a random name
    # when the path has no final segment (e.g. it ends with '/').
    IMAGE = posixpath.basename(urlsplit(URL).path) or uuid.uuid4().hex

    print(image_url)
    try:
        # `content` is the (local_filename, headers) pair from urlretrieve.
        content = urllib.request.urlretrieve(image_url, IMAGE)
    except urllib.error.URLError as exc:
        # One unreachable image should not abort the remaining downloads.
        print('skipping {}: {}'.format(image_url, exc))