我想使用python下载所有谷歌图片搜索图片。我使用的代码似乎有些问题。我的代码是
import os
import sys
import time
from urllib import FancyURLopener
import urllib2
import simplejson
# Define search term
searchTerm = "parrot"
# Replace spaces ' ' in search term for '%20' in order to comply with request
searchTerm = searchTerm.replace(' ','%20')
# Start FancyURLopener with defined version
class MyOpener(FancyURLopener):
version = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11'
myopener = MyOpener()
# Set count to 0
count= 0
for i in range(0,10):
# Notice that the start changes for each iteration in order to request a new set of images for each loop
url = ('https://ajax.googleapis.com/ajax/services/search/images?' + 'v=1.0& q='+searchTerm+'&start='+str(i*10)+'&userip=MyIP')
print url
request = urllib2.Request(url, None, {'Referer': 'testing'})
response = urllib2.urlopen(request)
# Get results using JSON
results = simplejson.load(response)
data = results['responseData']
dataInfo = data['results']
# Iterate for each result and get unescaped url
for myUrl in dataInfo:
count = count + 1
my_url = myUrl['unescapedUrl']
myopener.retrieve(myUrl['unescapedUrl'],str(count)+'.jpg')
下载几页后,我收到如下错误:
追踪(最近一次呼叫最后一次):
File "C:\Python27\img_google3.py", line 37, in <module>
dataInfo = data['results']
TypeError: 'NoneType' object has no attribute '__getitem__'
怎么做??????
答案 0 :(得分:37)
我修改了我的代码。现在代码可以为给定的查询下载100个图像,图像是全高分辨率,即正在下载原始图像。
我正在使用urllib2&amp; amp;美丽的汤
from bs4 import BeautifulSoup
import requests
import re
import urllib2
import os
import cookielib
import json
def get_soup(url,header):
return BeautifulSoup(urllib2.urlopen(urllib2.Request(url,headers=header)),'html.parser')
query = raw_input("query image")# you can change the query for the image here
image_type="ActiOn"
query= query.split()
query='+'.join(query)
url="https://www.google.co.in/search?q="+query+"&source=lnms&tbm=isch"
print url
#add the directory for your image here
DIR="Pictures"
header={'User-Agent':"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36"
}
soup = get_soup(url,header)
ActualImages=[]# contains the link for Large original images, type of image
for a in soup.find_all("div",{"class":"rg_meta"}):
link , Type =json.loads(a.text)["ou"] ,json.loads(a.text)["ity"]
ActualImages.append((link,Type))
print "there are total" , len(ActualImages),"images"
if not os.path.exists(DIR):
os.mkdir(DIR)
DIR = os.path.join(DIR, query.split()[0])
if not os.path.exists(DIR):
os.mkdir(DIR)
###print images
for i , (img , Type) in enumerate( ActualImages):
try:
req = urllib2.Request(img, headers={'User-Agent' : header})
raw_img = urllib2.urlopen(req).read()
cntr = len([i for i in os.listdir(DIR) if image_type in i]) + 1
print cntr
if len(Type)==0:
f = open(os.path.join(DIR , image_type + "_"+ str(cntr)+".jpg"), 'wb')
else :
f = open(os.path.join(DIR , image_type + "_"+ str(cntr)+"."+Type), 'wb')
f.write(raw_img)
f.close()
except Exception as e:
print "could not load : "+img
print e
我希望这可以帮助你
答案 1 :(得分:21)
Google Image Search API is deprecated,您需要使用Google Custom Search来实现目标。要获取图像,您需要执行此操作:
import urllib2
import simplejson
import cStringIO
fetcher = urllib2.build_opener()
searchTerm = 'parrot'
startIndex = 0
searchUrl = "http://ajax.googleapis.com/ajax/services/search/images?v=1.0&q=" + searchTerm + "&start=" + startIndex
f = fetcher.open(searchUrl)
deserialized_output = simplejson.load(f)
这将为您提供4个结果,作为JSON,您需要通过递增API请求中的startIndex
来迭代地获得结果。
要获取图像,您需要使用像cStringIO这样的库。
例如,要访问第一张图片,您需要执行以下操作:
imageUrl = deserialized_output['responseData']['results'][0]['unescapedUrl']
file = cStringIO.StringIO(urllib.urlopen(imageUrl).read())
img = Image.open(file)
答案 2 :(得分:5)
Google弃用他们的API,抓取Google很复杂,所以我建议改用Bing API:
https://datamarket.azure.com/dataset/5BA839F1-12CE-4CCE-BF57-A49D98D29A44
谷歌并不是那么好,微软也不是那么邪恶答案 3 :(得分:5)
这是我最新的使用Selenium和无头Chrome用Python编写的Google图像咆哮者。
它需要python-selenium
,chromium-driver
和来自pip的名为retry
的模块。
链接:http://sam.aiki.info/b/google-images.py
示例用法:
google-images.py tiger 10 --opts isz:lt,islt:svga,itp:photo > urls.txt
parallel=5
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
(i=0; while read url; do wget -e robots=off -T10 --tries 10 -U"$user_agent" "$url" -O`printf %04d $i`.jpg & i=$(($i+1)) ; [ $(($i % $parallel)) = 0 ] && wait; done < urls.txt; wait)
帮助用法:
$ google-images.py --help usage: google-images.py [-h] [--safe SAFE] [--opts OPTS] query n Fetch image URLs from Google Image Search. positional arguments: query image search query n number of images (approx) optional arguments: -h, --help show this help message and exit --safe SAFE safe search [off|active|images] --opts OPTS search options, e.g. isz:lt,islt:svga,itp:photo,ic:color,ift:jpg
代码:
#!/usr/bin/env python3
# requires: selenium, chromium-driver, retry
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import selenium.common.exceptions as sel_ex
import sys
import time
import urllib.parse
from retry import retry
import argparse
import logging
logging.basicConfig(stream=sys.stderr, level=logging.INFO)
logger = logging.getLogger()
retry_logger = None
css_thumbnail = "img.Q4LuWd"
css_large = "img.n3VNCb"
css_load_more = ".mye4qd"
selenium_exceptions = (sel_ex.ElementClickInterceptedException, sel_ex.ElementNotInteractableException, sel_ex.StaleElementReferenceException)
def scroll_to_end(wd):
wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
@retry(exceptions=KeyError, tries=6, delay=0.1, backoff=2, logger=retry_logger)
def get_thumbnails(wd, want_more_than=0):
wd.execute_script("document.querySelector('{}').click();".format(css_load_more))
thumbnails = wd.find_elements_by_css_selector(css_thumbnail)
n_results = len(thumbnails)
if n_results <= want_more_than:
raise KeyError("no new thumbnails")
return thumbnails
@retry(exceptions=KeyError, tries=6, delay=0.1, backoff=2, logger=retry_logger)
def get_image_src(wd):
actual_images = wd.find_elements_by_css_selector(css_large)
sources = []
for img in actual_images:
src = img.get_attribute("src")
if src.startswith("http") and not src.startswith("https://encrypted-tbn0.gstatic.com/"):
sources.append(src)
if not len(sources):
raise KeyError("no large image")
return sources
@retry(exceptions=selenium_exceptions, tries=6, delay=0.1, backoff=2, logger=retry_logger)
def retry_click(el):
el.click()
def get_images(wd, start=0, n=20, out=None):
thumbnails = []
count = len(thumbnails)
while count < n:
scroll_to_end(wd)
try:
thumbnails = get_thumbnails(wd, want_more_than=count)
except KeyError as e:
logger.warning("cannot load enough thumbnails")
break
count = len(thumbnails)
sources = []
for tn in thumbnails:
try:
retry_click(tn)
except selenium_exceptions as e:
logger.warning("main image click failed")
continue
sources1 = []
try:
sources1 = get_image_src(wd)
except KeyError as e:
pass
# logger.warning("main image not found")
if not sources1:
tn_src = tn.get_attribute("src")
if not tn_src.startswith("data"):
logger.warning("no src found for main image, using thumbnail")
sources1 = [tn_src]
else:
logger.warning("no src found for main image, thumbnail is a data URL")
for src in sources1:
if not src in sources:
sources.append(src)
if out:
print(src, file=out)
out.flush()
if len(sources) >= n:
break
return sources
def google_image_search(wd, query, safe="off", n=20, opts='', out=None):
search_url_t = "https://www.google.com/search?safe={safe}&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img&tbs={opts}"
search_url = search_url_t.format(q=urllib.parse.quote(query), opts=urllib.parse.quote(opts), safe=safe)
wd.get(search_url)
sources = get_images(wd, n=n, out=out)
return sources
def main():
parser = argparse.ArgumentParser(description='Fetch image URLs from Google Image Search.')
parser.add_argument('--safe', type=str, default="off", help='safe search [off|active|images]')
parser.add_argument('--opts', type=str, default="", help='search options, e.g. isz:lt,islt:svga,itp:photo,ic:color,ift:jpg')
parser.add_argument('query', type=str, help='image search query')
parser.add_argument('n', type=int, default=20, help='number of images (approx)')
args = parser.parse_args()
opts = Options()
opts.add_argument("--headless")
# opts.add_argument("--blink-settings=imagesEnabled=false")
with webdriver.Chrome(options=opts) as wd:
sources = google_image_search(wd, args.query, safe=args.safe, n=args.n, opts=args.opts, out=sys.stdout)
main()
答案 4 :(得分:3)
Haven未查看您的代码,但这是使用selenium尝试从搜索字词中获取400张图片的示例解决方案
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import json
import os
import urllib2
searchterm = 'vannmelon' # will also be the name of the folder
url = "https://www.google.co.in/search?q="+searchterm+"&source=lnms&tbm=isch"
browser = webdriver.Firefox()
browser.get(url)
header={'User-Agent':"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36"}
counter = 0
succounter = 0
if not os.path.exists(searchterm):
os.mkdir(searchterm)
for _ in range(500):
browser.execute_script("window.scrollBy(0,10000)")
for x in browser.find_elements_by_xpath("//div[@class='rg_meta']"):
counter = counter + 1
print "Total Count:", counter
print "Succsessful Count:", succounter
print "URL:",json.loads(x.get_attribute('innerHTML'))["ou"]
img = json.loads(x.get_attribute('innerHTML'))["ou"]
imgtype = json.loads(x.get_attribute('innerHTML'))["ity"]
try:
req = urllib2.Request(img, headers={'User-Agent': header})
raw_img = urllib2.urlopen(req).read()
File = open(os.path.join(searchterm , searchterm + "_" + str(counter) + "." + imgtype), "wb")
File.write(raw_img)
File.close()
succounter = succounter + 1
except:
print "can't get img"
print succounter, "pictures succesfully downloaded"
browser.close()
答案 5 :(得分:2)
添加到Piees's answer,要从搜索结果中下载任意数量的图片,我们需要在加载前400个结果后模拟点击“显示更多结果”按钮。
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import os
import json
import urllib2
import sys
import time
# adding path to geckodriver to the OS environment variable
# assuming that it is stored at the same path as this script
os.environ["PATH"] += os.pathsep + os.getcwd()
download_path = "dataset/"
def main():
searchtext = sys.argv[1] # the search query
num_requested = int(sys.argv[2]) # number of images to download
number_of_scrolls = num_requested / 400 + 1
# number_of_scrolls * 400 images will be opened in the browser
if not os.path.exists(download_path + searchtext.replace(" ", "_")):
os.makedirs(download_path + searchtext.replace(" ", "_"))
url = "https://www.google.co.in/search?q="+searchtext+"&source=lnms&tbm=isch"
driver = webdriver.Firefox()
driver.get(url)
headers = {}
headers['User-Agent'] = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
extensions = {"jpg", "jpeg", "png", "gif"}
img_count = 0
downloaded_img_count = 0
for _ in xrange(number_of_scrolls):
for __ in xrange(10):
# multiple scrolls needed to show all 400 images
driver.execute_script("window.scrollBy(0, 1000000)")
time.sleep(0.2)
# to load next 400 images
time.sleep(0.5)
try:
driver.find_element_by_xpath("//input[@value='Show more results']").click()
except Exception as e:
print "Less images found:", e
break
# imges = driver.find_elements_by_xpath('//div[@class="rg_meta"]') # not working anymore
imges = driver.find_elements_by_xpath('//div[contains(@class,"rg_meta")]')
print "Total images:", len(imges), "\n"
for img in imges:
img_count += 1
img_url = json.loads(img.get_attribute('innerHTML'))["ou"]
img_type = json.loads(img.get_attribute('innerHTML'))["ity"]
print "Downloading image", img_count, ": ", img_url
try:
if img_type not in extensions:
img_type = "jpg"
req = urllib2.Request(img_url, headers=headers)
raw_img = urllib2.urlopen(req).read()
f = open(download_path+searchtext.replace(" ", "_")+"/"+str(downloaded_img_count)+"."+img_type, "wb")
f.write(raw_img)
f.close
downloaded_img_count += 1
except Exception as e:
print "Download failed:", e
finally:
print
if downloaded_img_count >= num_requested:
break
print "Total downloaded: ", downloaded_img_count, "/", img_count
driver.quit()
if __name__ == "__main__":
main()
完整代码为here。
答案 6 :(得分:1)
你也可以使用Selenium和Python。方法如下:
from selenium import webdriver
import urllib
from selenium.webdriver.common.keys import Keys
driver = webdriver.Chrome('C:/Python27/Scripts/chromedriver.exe')
word="apple"
url="http://images.google.com/search?q="+word+"&tbm=isch&sout=1"
driver.get(url)
imageXpathSelector='//*[@id="ires"]/table/tbody/tr[1]/td[1]/a/img'
img=driver.find_element_by_xpath(imageXpathSelector)
src=(img.get_attribute('src'))
urllib.urlretrieve(src, word+".jpg")
driver.close()
(此代码适用于Python 2.7) 请注意,您应该安装Selenium包装,并使用 pip install selenium &#39;你应该从here
下载chromedriver.exe与其他网络抓取技术相反,Selenium打开浏览器并下载项目,因为Selenium的任务是测试而不是抓取。
答案 7 :(得分:1)
与其他代码段一样,这个代码已经变老,不再对我有用。根据上述解决方案之一,每个关键字下载100张图片。
from bs4 import BeautifulSoup
import urllib2
import os
class GoogleeImageDownloader(object):
_URL = "https://www.google.co.in/search?q={}&source=lnms&tbm=isch"
_BASE_DIR = 'GoogleImages'
_HEADERS = {
'User-Agent':"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36"
}
def __init__(self):
query = raw_input("Enter keyword to search images\n")
self.dir_name = os.path.join(self._BASE_DIR, query.split()[0])
self.url = self._URL.format(urllib2.quote(query))
self.make_dir_for_downloads()
self.initiate_downloads()
def make_dir_for_downloads(self):
print "Creating necessary directories"
if not os.path.exists(self._BASE_DIR):
os.mkdir(self._BASE_DIR)
if not os.path.exists(self.dir_name):
os.mkdir(self.dir_name)
def initiate_downloads(self):
src_list = []
soup = BeautifulSoup(urllib2.urlopen(urllib2.Request(self.url,headers=self._HEADERS)),'html.parser')
for img in soup.find_all('img'):
if img.has_attr("data-src"):
src_list.append(img['data-src'])
print "{} of images collected for downloads".format(len(src_list))
self.save_images(src_list)
def save_images(self, src_list):
print "Saving Images..."
for i , src in enumerate(src_list):
try:
req = urllib2.Request(src, headers=self._HEADERS)
raw_img = urllib2.urlopen(req).read()
with open(os.path.join(self.dir_name , str(i)+".jpg"), 'wb') as f:
f.write(raw_img)
except Exception as e:
print ("could not save image")
raise e
if __name__ == "__main__":
GoogleeImageDownloader()
答案 8 :(得分:0)
我知道这个问题已经过时了,但我最近遇到了这个问题,之前的答案都没有了。所以我写了这个脚本来收集谷歌的图像。截至目前,它可以下载尽可能多的图像。
这里有一个github链接,https://github.com/CumminUp07/imengine/blob/master/get_google_images.py
免责声明:由于版权问题,所收集的图片仅供研究和教育用途
from bs4 import BeautifulSoup as Soup
import urllib2
import json
import urllib
#programtically go through google image ajax json return and save links to list#
#num_images is more of a suggestion #
#it will get the ceiling of the nearest 100 if available #
def get_links(query_string, num_images):
#initialize place for links
links = []
#step by 100 because each return gives up to 100 links
for i in range(0,num_images,100):
url = 'https://www.google.com/search?ei=1m7NWePfFYaGmQG51q7IBg&hl=en&q='+query_string+'\
&tbm=isch&ved=0ahUKEwjjovnD7sjWAhUGQyYKHTmrC2kQuT0I7gEoAQ&start='+str(i)+'\
&yv=2&vet=10ahUKEwjjovnD7sjWAhUGQyYKHTmrC2kQuT0I7gEoAQ.1m7NWePfFYaGmQG51q7IBg.i&ijn=1&asearch=ichunk&async=_id:rg_s,_pms:s'
#set user agent to avoid 403 error
request = urllib2.Request(url, None, {'User-Agent': 'Mozilla/5.0'})
#returns json formatted string of the html
json_string = urllib2.urlopen(request).read()
#parse as json
page = json.loads(json_string)
#html found here
html = page[1][1]
#use BeautifulSoup to parse as html
new_soup = Soup(html,'lxml')
#all img tags, only returns results of search
imgs = new_soup.find_all('img')
#loop through images and put src in links list
for j in range(len(imgs)):
links.append(imgs[j]["src"])
return links
#download images #
#takes list of links, directory to save to #
#and prefix for file names #
#saves images in directory as a one up number #
#with prefix added #
#all images will be .jpg #
def get_images(links,directory,pre):
for i in range(len(links)):
urllib.urlretrieve(links[i], "./"+directory+"/"+str(pre)+str(i)+".jpg")
#main function to search images #
#takes two lists, base term and secondary terms #
#also takes number of images to download per #
#combination #
#it runs every combination of search terms #
#with base term first then secondary #
def search_images(base,terms,num_images):
for y in range(len(base)):
for x in range(len(terms)):
all_links = get_links(base[y]+'+'+terms[x],num_images)
get_images(all_links,"images",x)
if __name__ == '__main__':
terms = ["cars","numbers","scenery","people","dogs","cats","animals"]
base = ["animated"]
search_images(base,terms,1000)
答案 9 :(得分:0)
尝试使用其他图片搜索(例如 ecosia
或 bing
)代替谷歌图片搜索。
这是从 ecosia
搜索引擎检索图像的示例代码。
from bs4 import BeautifulSoup
import requests
import urllib
user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
headers = {'User-Agent':user_agent}
urls = ["https://www.ecosia.org/images?q=india%20pan%20card%20example"]
#The url's from which the image is to be extracted.
index = 0
for url in urls:
request = urllib.request.Request(url,None,headers) #The assembled request
response = urllib.request.urlopen(request)
data = response.read() # Read the html result page
soup = BeautifulSoup(data, 'html.parser')
for link in soup.find_all('img'):
#The images are enclosed in 'img' tag and the 'src' contains the url of the image.
img_url = link.get('src')
dest = str(index) + ".jpg" #Destination to store the image.
try:
urllib.request.urlretrieve(img_url)
index += 1
except:
continue
该代码适用于谷歌图像搜索,但无法检索图像,因为谷歌以加密格式存储图像,很难从图像 URL 中检索。
解决方案于 2021 年 2 月 1 日生效。
答案 10 :(得分:0)
selenium
来实现这一点。import os, time, shutil, httpx, asyncio
from urllib.parse import urlparse
from serpapi import GoogleSearch
# https://stackoverflow.com/a/39217788/1291371
async def download_file(url):
print(f'Downloading {url}')
# https://stackoverflow.com/a/18727481/1291371
parsed_url = urlparse(url)
local_filename = os.path.basename(parsed_url.path)
os.makedirs('images', exist_ok=True)
async with httpx.AsyncClient() as client:
async with client.stream('GET', url) as response:
async with open(f'images/{local_filename}', 'wb') as f:
await asyncio.to_thread(shutil.copyfileobj, response.raw, f)
return local_filename
async def main():
start = time.perf_counter()
params = {
"engine": "google",
"ijn": "0",
"q": "lasagna",
"tbm": "isch",
"api_key": os.getenv("API_KEY"),
}
search = GoogleSearch(params)
results = search.get_dict()
download_files_tasks = [
download_file(image['original']) for image in results['images_results']
]
await asyncio.gather(*download_files_tasks, return_exceptions=True)
print(
f"Downloaded {len(download_files_tasks)} images in {time.perf_counter() - start:0.4f} seconds")
asyncio.run(main())
<块引用>
免责声明,我为 SerpApi 工作。
答案 11 :(得分:0)
好的,所以不是从你那里编码,我会告诉你你做错了什么,它可能会引导你走向正确的方向。通常,大多数现代网站通过 javascript 动态呈现 html,因此如果您只是发送 GET 请求(使用 urllib/CURL/fetch/axios),您将无法获得通常在浏览器中看到的相同 URL/网址的内容。您需要的是呈现 javascript 代码以创建与您在浏览器上看到的相同的 HTML/网页的东西,您可以使用类似 selenium gecko 驱动程序的 firefox 驱动程序来执行此操作,并且有 Python 模块可以让您执行此操作。>
我希望这会有所帮助,如果您仍然感到迷茫,这里有一个简单的脚本,我不久前写了一个脚本,用于从您的谷歌照片中提取类似的内容
from selenium import webdriver
import re
url="https://photos.app.goo.gl/xxxxxxx"
driver = webdriver.Firefox()
driver.get(url)
regPrms="^background-image\:url\(.*\)$"
regPrms="^The.*Spain$"
html = driver.page_source
urls=re.findall("(?P<url>https?://[^\s\"$]+)", html)
fin=[]
for url in urls:
if "video-downloads" in url:
fin.append(url)
print("The Following ZIP contains all your pictures")
for url in fin:
print("-------------------")
print(url)