我使用Python的selenium web驱动程序(Chrome)抓取图像
我可以使用多个驱动程序并让每个驱动程序抓取图像吗?
我希望通过多进程(multiprocessing)来完成以下工作:
def crawl(searchText):
    """Collect JPEG image URLs from a Google Images search.

    Opens a Chrome session, loads the image-search results for
    ``searchText``, clicks the first 20 result thumbnails one at a time and,
    after each click, scans the page for ``<img>`` tags whose ``src`` starts
    with ``http`` and ends with ``jpg``.

    Args:
        searchText: Query text; it is inserted into the search URL as-is.

    Returns:
        list: Matching image URLs in the order they were found. The page is
        re-scanned after every click, so a URL may appear more than once.
    """
    driver = webdriver.Chrome('C:\\Users\\HYOWON\\Desktop\\Desktop\\Graduation\\Code\\Crawling\\chromedriver.exe')
    # The session is shut down in ``finally`` so that a missing thumbnail or
    # any other Selenium error does not leave Chrome/chromedriver running.
    try:
        searchUrl = "https://www.google.com/search?q={}&site=webhp&tbm=isch".format(searchText)
        driver.get(searchUrl)
        imgs_urls = []  # collected image URLs
        for j in range(20):
            element = driver.find_element_by_css_selector("div[data-ri = '" + str(j) + "'] img")
            element.click()
            sleep(1)  # fixed pause after the click, before the page is parsed
            # create_soup is defined elsewhere; presumably it parses the
            # current page into a BeautifulSoup tree -- confirm.
            soup = create_soup()
            for img in soup.find_all('img'):
                # <img> tags without a src attribute are simply skipped.
                src = img.get('src') or ''
                if src.startswith('http') and src.endswith('jpg'):
                    imgs_urls.append(src)
        return imgs_urls
    finally:
        # quit() ends the whole session; close() only closes the window.
        driver.quit()
def crawl():
    """Collect JPEG image URLs using three already-open browser sessions.

    Uses the module-level drivers ``driver1``, ``driver2`` and ``driver3``
    (presumably each is already showing the same image-search results page;
    confirm against the setup code). The work is done in 50 rounds: in each
    round the three drivers click three consecutive thumbnails (``cnt``,
    ``cnt + 1``, ``cnt + 2``), and after every click that driver's page is
    scanned for ``<img>`` tags whose ``src`` starts with ``http`` and ends
    with ``jpg``.

    Returns:
        list: Matching image URLs in the order they were found. Pages are
        re-scanned after every click, so a URL may appear more than once.
    """
    import time  # local import so the block does not rely on file-level imports

    drivers = (driver1, driver2, driver3)
    imgs_urls = []
    cnt = 0  # index of the first thumbnail handled in the current round
    for _ in range(50):
        for offset, driver in enumerate(drivers):
            # Each driver takes its own thumbnail; the counter advances by
            # the number of drivers, so no index is skipped or repeated.
            selector = "div[data-ri = '" + str(cnt + offset) + "'] img"
            driver.find_element_by_css_selector(selector).click()
            # A bare WebDriverWait(driver, 1) object does not pause anything
            # unless .until() is called, so wait explicitly instead.
            time.sleep(1)
            soup = create_soup(driver)
            for img in soup.find_all('img'):
                # Keep only sources that start with http and end with jpg;
                # <img> tags without a src attribute are skipped.
                src = img.get('src') or ''
                if src.startswith('http') and src.endswith('jpg'):
                    imgs_urls.append(src)
        cnt += len(drivers)
    return imgs_urls
def download_img(url, filename, dest_dir='C:/Python/'):
    """Download ``url`` and save it as ``<filename>.jpg`` inside ``dest_dir``.

    Args:
        url: Address of the resource to fetch.
        filename: Base name of the output file; converted with ``str()`` and
            suffixed with ``.jpg``.
        dest_dir: Directory the file is written to. Defaults to the
            previously hard-coded ``'C:/Python/'``. It is created if it does
            not exist yet.

    Returns:
        str: Path of the file that was written.
    """
    import os  # local import so the block does not rely on file-level imports

    full_name = str(filename) + ".jpg"
    os.makedirs(dest_dir, exist_ok=True)
    target = os.path.join(dest_dir, full_name)
    urllib.request.urlretrieve(url, target)
    return target
# Number the files by their position in the result list so that each image
# gets its own name instead of every download being written to the same file.
for filename, url in enumerate(crawl()):
    download_img(url, filename)
答案 0 :(得分:0)
在这个例子中,我只是分别声明了各个驱动程序对象;不过我个人更倾向于把它们放进某种数组(列表)中,这样引用起来更方便,也可以直接遍历它们。当然,这会让你的代码结构有所不同,但应该不会带来太大的问题。
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Start pages for the demo: initialPage() sends the first driver to
# baseURL_1 and the second driver to baseURL_2.
baseURL_1 = "http://www.stackoverflow.com/"
baseURL_2 = "http://www.google.com/"
def main():
    """Run the demo: start both browser sessions, then load their start pages."""
    init()
    initialPage()
def init():
    """Create two Chrome sessions and publish them as globals ``drv1``/``drv2``.

    Both sessions are built from the same options object, whose ``prefs``
    set ``credentials_enable_service`` and
    ``profile.password_manager_enabled`` to ``False``.
    """
    global drv1, drv2

    driver_path = "C:\\path_to_chrome\\chromedriver.exe"
    prefs = {
        'credentials_enable_service': False,
        'profile': {'password_manager_enabled': False},
    }
    options = webdriver.ChromeOptions()
    options.add_experimental_option('prefs', prefs)

    drv1 = webdriver.Chrome(driver_path, chrome_options=options)
    drv2 = webdriver.Chrome(driver_path, chrome_options=options)
def initialPage():
    """Send driver 1 to ``baseURL_1`` and driver 2 to ``baseURL_2``."""
    start_pages = (baseURL_1, baseURL_2)
    # Driver numbers are 1-based, matching what navigate() expects.
    for driver_number, url in enumerate(start_pages, start=1):
        navigate(url, driver_number)
def navigate(URL, d):
    """Load ``URL`` in the driver selected by ``d``.

    Args:
        URL: Address to open.
        d: Driver selector: ``1`` uses ``drv1`` and ``2`` uses ``drv2``.
            Any other value does nothing.
    """
    if d == 1:
        target = drv1
    elif d == 2:
        target = drv2
    else:
        # Unknown selector: nothing to load.
        return
    target.get(URL)
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()