I have written a Python script to open around 1k URLs and process them to get the desired result, but even though multithreading has been introduced it runs slowly, and after processing a number of URLs the process seems to hang and I cannot tell whether it is still running or has stopped. How can I create multiple threads to process them faster? Any help would be highly appreciated. Thank you. Below is my script.
import threading
from multiprocessing.pool import ThreadPool
from selenium import webdriver
from selenium.webdriver.phantomjs.service import Service
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.remote.webdriver import WebDriver as RemoteWebDriver
from multiprocessing.dummy import Pool  # This is a thread-based Pool
from multiprocessing import cpu_count
import csv

def fetch_url(url):
    driver = webdriver.PhantomJS()
    driver.get(url)
    html = driver.page_source
    print(html)
    print("'%s\' fetched in %ss" % (url[0], (time.time() - start)))

def thread_task(lock, data_set):
    lock.acquire()
    fetch_url(url)
    lock.release()

if __name__ == "__main__":
    data_set = []
    with open('file.csv', 'r') as csvfile:
        spamreader = csv.reader(csvfile, delimiter=' ', quotechar='|')
        for row in spamreader:
            data_set.append(row)
    lock = threading.Lock()
    # data set will contain a list of 1k urls
    for url in data_set:
        t1 = threading.Thread(target=thread_task, args=(lock, url,))
        # start threads
        t1.start()
        # wait until threads finish their job
        t1.join()
    print("Elapsed Time: %s" % (time.time() - start))
Answer 0 (score: 1)
You are defeating the multithreading twice: first by waiting for each thread to finish inside the for url in data_set: loop before starting the next one, and second by using a lock so that only one instance of fetch_url can run at a time. You already import ThreadPool, which is a reasonable tool for this job. Here is how to use it:
import time
import csv
from multiprocessing.pool import ThreadPool
from selenium import webdriver

def fetch_url(url):
    driver = webdriver.PhantomJS()
    try:
        driver.get(url)
        html = driver.page_source
        print(html)
        print("'%s' fetched in %ss" % (url, (time.time() - start)))
    finally:
        # shut the PhantomJS process down, otherwise each call leaks one
        driver.quit()

if __name__ == "__main__":
    start = time.time()
    with open('file.csv', 'r') as csvfile:
        # csv.reader yields one list per row; the URL is in the first column
        dataset = [row[0] for row in csv.reader(csvfile, delimiter=' ', quotechar='|')]
    # guess a thread pool size which is a tradeoff of number of cpu cores,
    # expected wait time for i/o and memory size.
    with ThreadPool(20) as pool:
        pool.map(fetch_url, dataset, chunksize=1)
    print("Elapsed Time: %s" % (time.time() - start))