I am writing a scraper to quickly download images from Google Images. To keep it fast enough to handle a large number of images, I use the multiprocessing Python package.
Each member of a list of URLs is passed to a function that downloads the image. All of this is done with a process Pool, using apply_async.
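For context, here is the Pool/apply_async pattern in isolation, with the download replaced by a short sleep so it runs standalone (all names here are placeholders, not the real script):

from multiprocessing import Pool
import time
import tqdm

def worker(url):
    time.sleep(0.1)  # stand-in for the actual image download
    return url

if __name__ == '__main__':
    urls = ["url_%d" % i for i in range(200)]
    pbar = tqdm.tqdm(total=len(urls))

    def update(*a):
        # the callback runs in the parent process, so it can touch the progress bar
        pbar.update()

    pool = Pool()
    for u in urls:
        pool.apply_async(worker, args=(u,), callback=update)
    pool.close()
    pool.join()

The callback only bumps the progress bar; the result of each task is discarded.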
However, when downloading e.g. 200 images, the pool hangs forever at ~197/200 and never joins.
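My current suspicion is that a few of the downloads simply never return: urllib.request.urlopen is called without a timeout, so a stalled connection would block that worker forever and join would never complete. A sketch of the workaround I am considering, with an arbitrary 10-second timeout (the value is my own guess, not part of the original script):

import socket
from six.moves import urllib

def write_img_file(img_item):
    req = urllib.request.Request(img_item)
    req.add_header('User-Agent', req_header)  # req_header as defined in the config below
    try:
        # give up on stalled connections instead of blocking the worker forever
        urllib.request.urlopen(req, timeout=10)
    except (urllib.error.URLError, socket.timeout) as e:
        print("download failed: " + str(e))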
Here is the code where the problem occurs. I tried to keep the example as small as possible. To get more than 100 images the scroll function has to be used; otherwise the script works fine.
from selenium import webdriver
from six.moves import urllib
from multiprocessing import Pool
import tqdm
import time
import json
import sys
import os
# Config
download_img_path = "test/"
req_header = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36"
def write_img_file(img_item):
    # request the image URL (this minimal example does not write it to disk)
    req = urllib.request.Request(img_item)
    req.add_header('User-Agent', req_header)
    urllib.request.urlopen(req)
    return


def get_images(driver, folder_path, num):
    # each rg_meta div holds JSON metadata; "ou" is the original image URL
    images = driver.find_elements_by_xpath('//div[contains(@class,"rg_meta")]')
    img_list = [json.loads(images[i].get_attribute('innerHTML'))["ou"] for i in range(min(len(images), num))]

    pbar = tqdm.tqdm(total=len(img_list))

    def update(*a):
        pbar.update()

    # one apply_async task per image URL; update() bumps the progress bar when a task finishes
    pool = Pool()
    for i in range(pbar.total):
        pool.apply_async(write_img_file, args=(img_list[i], ), callback=update)
    pool.close()
    pool.join()
    del pool


def scroll(driver, num_scrolls):
    for _ in range(num_scrolls):
        for __ in range(10):
            # scrolls to show all 400 images
            driver.execute_script("window.scrollBy(0, 1000000)")
            time.sleep(0.2)
        # click "show more results"
        time.sleep(0.5)
        try:
            driver.find_element_by_xpath(
                "//input[@value='Plus de résultats']").click()
            time.sleep(0.5)
        except Exception as e:
            print(" show more results failed -> exception: " + str(e))


def search(search_txt, num):
    if not os.path.exists(download_img_path):
        os.makedirs(download_img_path)
    url = "https://www.google.co.in/search?q=" + \
        search_txt + "&source=lnms&tbm=isch"
    driver = webdriver.Chrome(
        executable_path=r"/usr/lib/chromium/chromedriver")
    driver.get(url)
    num_scrolls = int(num / 400 + 1)
    scroll(driver, num_scrolls)
    get_images(driver, download_img_path, num)
search("hotdog", 200)
print('DONE')
sys.exit()