I am writing a scraper to quickly download images from Google Images. To keep it fast enough to handle a large number of images, I use the multiprocessing Python package.
Each member of a list of URLs is passed to a function that downloads the image. All of this is done with a process Pool, using apply_async.
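For context, here is the Pool/apply_async pattern in isolation, with the download replaced by a short sleep so it runs standalone (all names here are placeholders, not the real script):

from multiprocessing import Pool
import time
import tqdm

def worker(url):
    time.sleep(0.1)  # stand-in for the actual image download
    return url

if __name__ == '__main__':
    urls = ["url_%d" % i for i in range(200)]
    pbar = tqdm.tqdm(total=len(urls))

    def update(*a):
        # the callback runs in the parent process, so it can touch the progress bar
        pbar.update()

    pool = Pool()
    for u in urls:
        pool.apply_async(worker, args=(u,), callback=update)
    pool.close()
    pool.join()

The callback only bumps the progress bar; the result of each task is discarded.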
However, when downloading e.g. 200 images, the pool hangs forever at ~197/200 and never joins.
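My current suspicion is that a few of the downloads simply never return: urllib.request.urlopen is called without a timeout, so a stalled connection would block that worker forever and join would never complete. A sketch of the workaround I am considering, with an arbitrary 10-second timeout (the value is my own guess, not part of the original script):

import socket
from six.moves import urllib

def write_img_file(img_item):
    req = urllib.request.Request(img_item)
    req.add_header('User-Agent', req_header)  # req_header as defined in the config below
    try:
        # give up on stalled connections instead of blocking the worker forever
        urllib.request.urlopen(req, timeout=10)
    except (urllib.error.URLError, socket.timeout) as e:
        print("download failed: " + str(e))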
Here is the code where the problem occurs. I tried to keep the example as small as possible. To get more than 100 images the scroll function has to be used; otherwise the script works fine.
from selenium import webdriver
from six.moves import urllib
from multiprocessing import Pool
import tqdm
import time
import json
import sys
import os
# Config
download_img_path = "test/"
req_header = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36"
def write_img_file(img_item):
    # request the image URL (this minimal example does not write it to disk)
    req = urllib.request.Request(img_item)
    req.add_header('User-Agent', req_header)
    urllib.request.urlopen(req)
    return


def get_images(driver, folder_path, num):
    # each rg_meta div holds JSON metadata; "ou" is the original image URL
    images = driver.find_elements_by_xpath('//div[contains(@class,"rg_meta")]')
    img_list = [json.loads(images[i].get_attribute('innerHTML'))["ou"] for i in range(min(len(images), num))]

    pbar = tqdm.tqdm(total=len(img_list))

    def update(*a):
        pbar.update()

    # one apply_async task per image URL; update() bumps the progress bar when a task finishes
    pool = Pool()
    for i in range(pbar.total):
        pool.apply_async(write_img_file, args=(img_list[i], ), callback=update)
    pool.close()
    pool.join()
    del pool


def scroll(driver, num_scrolls):
    for _ in range(num_scrolls):
        for __ in range(10):
            # scrolls to show all 400 images
            driver.execute_script("window.scrollBy(0, 1000000)")
            time.sleep(0.2)
        # click "show more results"
        time.sleep(0.5)
        try:
            driver.find_element_by_xpath(
                "//input[@value='Plus de résultats']").click()
            time.sleep(0.5)
        except Exception as e:
            print(" show more results failed -> exception: " + str(e))


def search(search_txt, num):
    if not os.path.exists(download_img_path):
        os.makedirs(download_img_path)
    url = "https://www.google.co.in/search?q=" + \
        search_txt + "&source=lnms&tbm=isch"
    driver = webdriver.Chrome(
        executable_path=r"/usr/lib/chromium/chromedriver")
    driver.get(url)
    num_scrolls = int(num / 400 + 1)
    scroll(driver, num_scrolls)
    get_images(driver, download_img_path, num)
search("hotdog", 200)
print('DONE')
sys.exit()