I'm trying to start a new thread for each page, but this way it only starts a new thread after the previous thread/function has finished. Can anyone help me run them independently of each other? Example: Thread 1: opens page 1. Thread 2: opens page 2.
Then do this for X pages. I'm a beginner in Python, so please excuse my messy code.
import random
import string
import threading
from time import sleep
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
# driver.find_element_by_css_selector("a[onclick*='if (!window.__cfRLUnblockHandlers) return false; bail()']")
def randomStringDigits(stringLength=6):
    """Generate a random string of letters and digits."""
    lettersAndDigits = string.ascii_letters + string.digits
    return ''.join(random.choice(lettersAndDigits) for i in range(stringLength))
def startscrape(url):
    driver = webdriver.Chrome(executable_path='chromedriver.exe')
    driver.get("urlhere")
    cookies_list = driver.get_cookies()
    cookies_dict = {}  # create dictionary
    usrelem = driver.find_element_by_name("login")
    usrelem.send_keys("user")
    pwdelem = driver.find_element_by_name("password")
    pwdelem.send_keys("pass")
    pwdelem.send_keys(Keys.RETURN)
    sleep(1)
    driver.get(url)
    wait = WebDriverWait(driver, 10)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    xx = soup.find("input",
                   {"class": "input input--number js-numberBoxTextInput input input--numberNarrow js-pageJumpPage"})
    driver.get(page)  # note: `page` here is the global set in the loop below, not the `url` parameter
    wait = WebDriverWait(driver, 10)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    # find all thumbs
    xxx = soup.findAll("a", {"class": "js-lbImage"})
    for link in xxx:
        anchor = soup.find("a", {"href": link.get('href')})
        dlfullimg = driver.find_element_by_xpath("//a[@href='" + anchor.get('href') + "']")
        wait = WebDriverWait(driver, 10)
        dlfullimg.click()
        thumbs = soup.findAll("div", {"class": "lg-thumb-item"})
        driver.find_element_by_id('lg-download').click()
        driver.find_element_by_xpath("//span[@class='lg-close lg-icon']").click()
        sleep(1)
    assert "No results found." not in driver.page_source
url = input("Main URL: ")
driver = webdriver.Chrome(executable_path='chromedriver.exe')
driver.get("urlhere")
cookies_list = driver.get_cookies()
cookies_dict = {} # create dictionary
usrelem = driver.find_element_by_name("login")
usrelem.send_keys("user")
pwdelem = driver.find_element_by_name("password")
pwdelem.send_keys("pass")
pwdelem.send_keys(Keys.RETURN)
sleep(1)
driver.get(url)
wait = WebDriverWait(driver, 10)
soup = BeautifulSoup(driver.page_source, 'html.parser')
# Find page number with soup.find
xx = soup.find("input",
               {"class": "input input--number js-numberBoxTextInput input input--numberNarrow js-pageJumpPage"})
driver.close()
threads = []
for i in range(int(xx.get('max'))):
    page = url + "page-" + str(i + 1)
    t = threading.Thread(target=startscrape(url), args=[])
    threads.append(t)
for t in threads:
    t.start()
for t in threads:
    t.join()
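Note: `threading.Thread(target=startscrape(url), args=[])` calls `startscrape` right there in the main thread and hands its return value (None) to `Thread`, which is why each page only starts after the previous one finishes. A minimal sketch of the deferred-call pattern, assuming `startscrape` is meant to receive the per-page URL:

threads = []
for i in range(int(xx.get('max'))):
    page = url + "page-" + str(i + 1)
    # pass the function object plus its argument; the new thread makes the call
    t = threading.Thread(target=startscrape, args=(page,))
    threads.append(t)
for t in threads:
    t.start()
for t in threads:
    t.join()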
Answer 0: (score: 0)
You can use concurrent.futures to handle the heavy lifting for you.
Here is some pseudocode:
import concurrent.futures
from selenium import webdriver
def process_url(url):
    driver = webdriver.Chrome()
    driver.get(url)
    # process page
    driver.close()

# Find number of pages here
driver = webdriver.Chrome()
driver.get(url)
# urls = find list of urls
driver.close()

threads_count = 10
with concurrent.futures.ThreadPoolExecutor(threads_count) as executor:
    executor.map(process_url, urls)
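Applied to this question, the elided parts of the pseudocode might be filled in along these lines (a sketch, not a drop-in solution: `max_pages` stands in for the value read from the page-jump input, and the login/download logic from `startscrape` would go inside `process_url`):

import concurrent.futures

from selenium import webdriver

def process_url(page_url):
    # each worker gets its own browser instance; a WebDriver session is not thread-safe
    driver = webdriver.Chrome()
    try:
        driver.get(page_url)
        # log in and download the images here, as in startscrape()
    finally:
        driver.quit()

max_pages = 10  # e.g. int(xx.get('max')) from the page-jump input above
urls = [url + "page-" + str(i + 1) for i in range(max_pages)]

with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    executor.map(process_url, urls)

executor.map blocks until every page has been processed, and the with block shuts the pool down cleanly afterwards.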