我正在尝试使用以下代码下载图像,但出现错误
from bs4 import BeautifulSoup
import requests
import re
import urllib
import urllib.request as ur
import os
import http.cookiejar as cookielib
import json
def get_soup(url,header):
return BeautifulSoup(ur.urlopen(ur.Request(url,headers=header)),'html.parser')
query = 'apple' #you can change the query for the image here
image_type="ActiOn" query= query.split() query='+'.join(query)
url="https://www.google.co.in/search?q="+query+"&source=lnms&tbm=isch"
print (url)
#add the directory for your image here
DIR="/Users/jashuvadoma/Desktop/hacking/images"
header={'User-Agent':"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36"}
soup = get_soup(url,header)
ActualImages=[] # contains the link for Large original images, type of image
for a in soup.find_all("div",{"class":"rg_meta"}):
link , Type =json.loads(a.text)["ou"], json.loads(a.text)["ity"]
ActualImages.append((link,Type))
print ("there are total" , len(ActualImages),"images")
if not os.path.exists(DIR):
os.mkdir(DIR) DIR = os.path.join(DIR, query.split()[0])
if not os.path.exists(DIR):
os.mkdir(DIR)
###print images
for i , (img , Type) in enumerate( ActualImages):
try:
req = ur.Request(img, headers={'User-Agent' : header})
raw_img = ur.urlopen(req).read()
cntr = lea([i for i in os.listdir(DIR) if image_type in i]) + 1
print (cntr)
if len(Type)==0:
f = open(os.path.join(DIR , image_type + "_"+ str(cntr)+".jpg"), 'wb')
else :
f = open(os.path.join(DIR , image_type + "_"+ str(cntr)+"."+Type), 'wb')
f.write(raw_img)
f.close()
except Exception as e:
print ("could not load : "+img)
print (e)
错误如下: https://www.google.co.in/search?q=apple&source=lnms&tbm=isch 共有100张图片 无法加载:https://www.apple.com/ac/structured-data/images/knowledge_graph_logo.png?201606271147 预期的字符串或类似字节的对象
答案 0 :(得分:0)
错误清楚地表明某些参数需要一个字符串值,但传递了其他一些参数。
在发布问题之前,您应该尝试自己调试。您可以尝试的几件事:
/usr/lib/python3.6/http/client.py in _send_request(self, method, url, body, headers, encode_chunked) 1278 1279 for hdr, value in headers.items(): -> 1280 self.putheader(hdr, value) 1281 if isinstance(body, str): 1282 # RFC 2616 Section 3.7.1 says that text default has a /usr/lib/python3.6/http/client.py in putheader(self, header, *values) 1214 values[i] = str(one_value).encode('ascii') 1215 -> 1216 if _is_illegal_header_value(values[i]): 1217 raise ValueError('Invalid header value %r' % (values[i],)) 1218 TypeError: expected string or bytes-like object
现在查看踪迹,似乎标头值是错误的。
在“打印图像”部分中,更改标题如下:
req = ur.Request(img, headers=header)