以下是我用于从Google图片下载图片的代码:
def get_soup(url, header):
return BeautifulSoup(urllib.request.urlopen(urllib.request.Request(url, headers=header)), 'html.parser')
def get_images_for_term(text, width, height, number_of_images=3):
search_keywords = ''
collect_valid_urls = []
query = text
if not (height and width) == '1':
keyword_keys = ' high quality background imagesize:' + width + 'x' + height
else:
keyword_keys = ' high quality background'
query = query + keyword_keys
max_images = number_of_images
try:
if not ((height == width) and (height and width) == '1'):
search_keywords = text + '_' + height + 'by' + width
else:
search_keywords = text
os.makedirs(search_keywords)
except OSError as e:
if e.errno != 17:
raise
pass
query = query.split()
query = '+'.join(query)
url = "google Url"
header = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/43.0.2357.134 Safari/537.36"}
soup = get_soup(url, header)
ActualImages = []
for a in soup.find_all("div", {"class": "rg_meta"}):
link = json.loads(a.text)["ou"]
ActualImages.append(link)
Stop_Counter = 0
for i, img in enumerate(ActualImages):
try:
print(img)
req = urllib.request.Request(img, headers={'User-Agent': header})
raw_img = urllib.request.urlopen(req, None, 15).read()
print(raw_img[0])
if raw_img[0] != '<':
image = np.asarray(bytearray(raw_img), dtype="uint8")
image = cv2.imdecode(image, cv2.IMREAD_COLOR)
try:
blur_map, score, blurry = estimate_blur(image)
except IOError and AttributeError:
blurry = False
if not blurry:
collect_valid_urls.append(img)
Output_file = open(search_keywords + "/" + str(Stop_Counter + 1) + ".jpg", 'wb')
Output_file.write(raw_img)
Output_file.close()
Stop_Counter += 1
if Stop_Counter == max_images:
print("Done with downloading the images")
break
else:
print("Image is Blurry.")
except Exception as e:
print("could not load : " + img)
print(e)
get_images_for_term('cats', '200', '300')
我收到以下错误:
https://i.pinimg.com/736x/ec/65/86/ec658681dada104797b3f1f49026c7f1--cat-wallpaper-iphone-wallpaper.jpg
could not load : https://i.pinimg.com/736x/ec/65/86/ec658681dada104797b3f1f49026c7f1--cat-wallpaper-iphone-wallpaper.jpg
expected string or bytes-like object
请帮助我改进代码,以免再次给我带来任何错误。
答案 0 :(得分:1)
urlopen接受字符串网址作为参数,您应该直接将链接传递给:
raw_img = urllib.request.urlopen(img, timeout=15).read()
或修复请求对象,而不在headers
的新词典中声明前一个标头:
req = urllib.request.Request(img, headers=header) raw_img = urllib.request.urlopen(req, None, 15).read()