Python Selenium WebDriver 多进程处理

时间:2017-05-14 15:13:29

标签: python-3.x selenium

我正在使用 Python 的 Selenium WebDriver(Chrome)抓取图像。

我可以使用多个驱动程序并让每个驱动程序抓取图像吗?

我希望通过多进程(multiprocessing)来完成下面的工作。

源代码

def crawl(searchText):
    """Collect JPEG image URLs from a Google image search for *searchText*.

    Starts a Chrome driver, opens the image-search results page, then for
    each of the first 20 thumbnails (addressed by their ``data-ri`` index):
    clicks it, pauses one second, obtains a soup from ``create_soup()`` and
    keeps every ``<img>`` ``src`` that starts with ``http`` and ends with
    ``jpg``.

    Returns:
        list of matching URLs in the order found. The soup is re-scanned
        after every click, so the same URL may appear more than once.

    The browser window is closed even when a lookup or click raises.
    """
    driver = webdriver.Chrome('C:\\Users\\HYOWON\\Desktop\\Desktop\\Graduation\\Code\\Crawling\\chromedriver.exe')
    imgs_urls = []  # collected image URLs

    try:
        searchUrl = "https://www.google.com/search?q={}&site=webhp&tbm=isch".format(searchText)
        driver.get(searchUrl)

        for j in range(20):
            element = driver.find_element_by_css_selector("div[data-ri = '" + str(j) + "'] img")
            element.click()
            sleep(1)  # fixed pause before parsing; presumably lets the preview load

            # NOTE(review): create_soup is defined elsewhere and is called
            # without the driver here; confirm it reads this driver's page.
            soup = create_soup()

            for img in soup.find_all('img'):
                try:
                    src = img['src']
                except KeyError:
                    continue  # <img> tag without a src attribute
                # keep only URLs that start with http and end with jpg
                if src.startswith('http') and src.endswith('jpg'):
                    imgs_urls.append(src)
    finally:
        # Runs on success and on failure so the window is not leaked.
        driver.close()

    return imgs_urls

修改后的代码

 def crawl():
     """Collect JPEG image URLs using three already-open drivers.

     Assumes ``driver1``, ``driver2`` and ``driver3`` are module-level
     drivers that already show the same image-search results page, and that
     ``create_soup(driver)`` returns a parsed soup for that driver's page
     (both are defined outside this function; confirm against the caller).

     Each of the 50 rounds hands three consecutive thumbnails (``data-ri``
     indices cnt, cnt+1, cnt+2) to the three drivers, clicks them, pauses
     one second and then scrapes each driver's page.

     Returns:
         list of ``src`` values that start with ``http`` and end with
         ``jpg``, in the order found (duplicates are possible).
     """
     from time import sleep

     drivers = (driver1, driver2, driver3)
     imgs_urls = []
     # Must be initialised locally: the ``cnt += ...`` below makes ``cnt`` a
     # local name, so reading it before assignment raises UnboundLocalError.
     cnt = 0

     for _ in range(50):
         # Give every driver its own thumbnail. Looking up the same index in
         # all three while advancing by three would skip two of every three.
         elements = [
             driver.find_element_by_css_selector("div[data-ri = '" + str(cnt + offset) + "'] img")
             for offset, driver in enumerate(drivers)
         ]

         for element in elements:
             element.click()

         # ``WebDriverWait(driver, 1)`` on its own only builds a wait object;
         # without ``.until(...)`` nothing is waited for. A one-second sleep
         # covers all three browsers at once.
         sleep(1)

         for driver in drivers:
             imgs_urls.extend(_collect_jpg_urls(create_soup(driver)))

         cnt += len(drivers)

     return imgs_urls


 def _collect_jpg_urls(soup):
     """Return the src of every <img> in *soup* that is an http...jpg URL."""
     urls = []
     for img in soup.find_all('img'):
         try:
             src = img['src']
         except KeyError:
             continue  # <img> tag without a src attribute
         # keep only URLs that start with http and end with jpg
         if src.startswith('http') and src.endswith('jpg'):
             urls.append(src)
     return urls

def download_img(url, filename):
    """Download *url* and store it as ``C:/Python/<filename>.jpg``.

    *filename* is converted with ``str()``, so numbers are accepted.
    """
    target_path = 'C:/Python/' + str(filename) + ".jpg"
    urllib.request.urlretrieve(url, target_path)

# Download every crawled image. ``filename`` was never defined in the original
# loop, so each file is now named after its position in the result list
# (0.jpg, 1.jpg, ...), which also keeps images from overwriting one another.
for filename, url in enumerate(crawl()):
    download_img(url, filename)

1 个答案:

答案 0 :(得分:0)

当然可以!我自己也一直在考虑在目前正在做的项目中使用多驱动程序的方案。

在这个例子中,我只是分别声明了各个驱动程序对象;不过我个人更倾向于把它们放进某种数组(列表)里,这样引用起来更方便,也可以直接遍历它们。当然,这样做会让代码结构有所不同,但你应该不会因此遇到太多问题。

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Start pages, one per browser: initialPage() loads baseURL_1 in driver 1
# and baseURL_2 in driver 2.
baseURL_1 = "http://www.stackoverflow.com/"
baseURL_2 = "http://www.google.com/"

def main():
    """Entry point: create the drivers, then load each one's start page."""
    init()
    initialPage()

def init():
    """Create two Chrome drivers and publish them as globals drv1 and drv2.

    Both drivers are built from the same options object, whose ``prefs``
    set ``credentials_enable_service`` and
    ``profile.password_manager_enabled`` to False.
    """
    global drv1, drv2

    driver_path = "C:\\path_to_chrome\\chromedriver.exe"
    prefs = {
        'credentials_enable_service': False,
        'profile': {
            'password_manager_enabled': False
        }
    }

    chrome_opts = webdriver.ChromeOptions()
    chrome_opts.add_experimental_option('prefs', prefs)

    # One shared options object, two independent browser instances.
    drv1 = webdriver.Chrome(driver_path, chrome_options=chrome_opts)
    drv2 = webdriver.Chrome(driver_path, chrome_options=chrome_opts)

def initialPage():
    """Load the start pages: baseURL_1 in driver 1, baseURL_2 in driver 2."""
    start_pages = (baseURL_1, baseURL_2)
    for driver_id, url in enumerate(start_pages, start=1):
        navigate(url, driver_id)

def navigate(URL, d):
    """Load *URL* in the driver selected by *d*.

    ``d == 1`` selects the global ``drv1`` and ``d == 2`` selects ``drv2``;
    any other value does nothing.
    """
    if d == 1:
        target = drv1
    elif d == 2:
        target = drv2
    else:
        return
    target.get(URL)

# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()