Python Webdriver多线程

时间:2016-06-03 13:12:11

标签: python selenium-webdriver python-multithreading

我尝试参照以下文章中的代码来生成多个 webdriver 实例:http://www.ibm.com/developerworks/aix/library/au-threadingpython/

import time
import Queue
import urllib2
import threading
from selenium import webdriver
from BeautifulSoup import BeautifulSoup
# Sites to fetch; main() puts each one on the work queue.
hosts = ["http://yahoo.com", "http://google.com", "http://amazon.com",
    "http://ibm.com", "http://apple.com"]
# NOTE(review): there are no call parentheses, so both names are bound to the
# Queue class itself rather than to queue instances. Calling .get() through
# them is what produces the "unbound method get()" TypeError quoted below
# this listing.
queue = Queue.Queue
out_queue = Queue.Queue

class Login_Driver(threading.Thread):
    """Thread that takes hosts from ``queue`` and puts page content on ``out_queue``."""
    def __init__(self, queue, out_queue, driver):
        # queue: source of hosts to load; out_queue: destination for page chunks;
        # driver: the browser object created in main() (webdriver.Firefox()).
        threading.Thread.__init__(self)
        self.queue = queue
        self.out_queue = out_queue
        self.driver = driver
        # Prints the driver's title once, at construction time.
        print driver.title
    def run(self):
        """Loop forever, handling one queued host per iteration."""
        while True:
            # take the next host from the work queue
            host = self.queue.get()
            # NOTE(review): this uses a bare `driver`, not `self.driver`, and no
            # module-level `driver` is defined in this listing -- presumably
            # `self.driver` was intended.
            driver.get(host)
            # NOTE(review): page_source is invoked as a call here, whereas the
            # working listing further down reads it as a plain attribute -- confirm.
            chunk = driver.page_source()
            # hand the page content to the consumer threads
            self.out_queue.put(chunk)
            # mark this host as processed so queue.join() can return
            self.queue.task_done()
class Poster(threading.Thread):
    """Thread that takes page chunks from ``out_queue`` and prints their <title> tags."""
    # NOTE(review): unlike Login_Driver, this __init__ never calls
    # threading.Thread.__init__(self); Thread subclasses are normally expected
    # to do so before start() -- confirm. The parameter order here is
    # (driver, out_queue), but main() calls Poster(out_queue) with one argument.
    def __init__(self, driver, out_queue):
        self.out_queue = out_queue
        self.driver = driver
        # Prints the driver's name once, at construction time.
        print driver.name
    def run(self):
        """Loop forever, parsing one queued page chunk per iteration."""
        while True:
            # take the next page chunk from the output queue
            chunk = self.out_queue.get()
            # parse the chunk
            soup = BeautifulSoup(chunk)
            print soup.findAll(['title'])
            # mark this chunk as processed so out_queue.join() can return
            self.out_queue.task_done()
# Wall-clock start time, used for the elapsed-time report on the last line.
start = time.time()
def main():
    """Start the worker threads, queue every host, then wait on both queues."""
    # start five Login_Driver threads, each with its own Firefox instance
    for i in range(5):
        driver = webdriver.Firefox()
        t = Login_Driver(queue, out_queue, driver)
        t.setDaemon(True)
        t.start()
        # pause 20 seconds after starting each thread
        time.sleep(20)
    # populate the work queue with the hosts to load
    for host in hosts:
        queue.put(host)
    for i in range(5):
        # NOTE(review): Poster.__init__ is declared as (driver, out_queue), but
        # only a single argument is passed here.
        dt = Poster(out_queue)
        dt.setDaemon(True)
        dt.start()
    # wait until every item on both queues has been marked done
    queue.join()
    out_queue.join()
# Run the script body and report how long it took.
main()
print "Elapsed Time: %s" % (time.time() - start)

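运行时报错:TypeError: unbound method get() must be called with Queue instance as first argument (got nothing instead)(即:未绑定方法 get() 必须以 Queue 实例作为第一个参数来调用,但实际上什么都没有传入)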

我对线程、类和进程都还是新手。你们能告诉我线程和进程哪一个更适合这种场景吗?如果能给我一个例子就更好了。谢谢大家。

更新

工作代码:

import time
import Queue
import urllib2
import threading
from selenium import webdriver
from BeautifulSoup import BeautifulSoup

# Sites the browser workers are asked to open, one queue item per site.
hosts = [
    "http://yahoo.com",
    "http://google.com",
    "http://amazon.com",
    "http://ibm.com",
    "http://apple.com",
]
# `queue` carries hosts to the Login_Driver threads; `out_queue` carries the
# fetched page source on to the Poster threads.
queue, out_queue = Queue.Queue(), Queue.Queue()

class Login_Driver(threading.Thread):
    """Worker thread that loads queued hosts in its own browser.

    Each instance repeatedly takes a host URL from ``queue``, opens it with
    ``driver`` and puts the resulting page source on ``out_queue``.

    Parameters:
        queue: queue of host URLs to load (filled by ``main``).
        out_queue: queue that receives the page source of each loaded host.
        driver: browser object used by this thread; it must provide
            ``get(url)``, ``page_source`` and ``title``.
    """

    def __init__(self, queue, out_queue, driver):
        threading.Thread.__init__(self)
        self.queue = queue
        self.out_queue = out_queue
        self.driver = driver
        print("In init first class..")

    def run(self):
        """Process hosts forever; the thread is expected to run as a daemon."""
        while True:
            # Blocks until a host is available.
            host = self.queue.get()
            try:
                self.driver.get(host)
                chunk = self.driver.page_source
                # Hand the page over to the consumer threads.
                self.out_queue.put(chunk)
                print(self.driver.title)
            except Exception as exc:
                # A failed page load must not kill the worker: report it and
                # move on to the next host.
                print("Failed to load %s: %s" % (host, exc))
            finally:
                # Always acknowledge the item. Without this, a single failure
                # would leave queue.join() in main() blocked forever.
                self.queue.task_done()
class Poster(threading.Thread):
    """Consumer thread that parses fetched pages and prints their titles.

    Each instance repeatedly takes a chunk of page source from ``out_queue``,
    parses it with BeautifulSoup, prints every ``<title>`` tag found and then
    prints the name of its ``driver``.

    Parameters:
        out_queue: queue of page-source chunks produced by the Login_Driver
            threads.
        driver: browser object; only its ``name`` attribute is read here.
    """

    def __init__(self, out_queue, driver):
        threading.Thread.__init__(self)
        self.out_queue = out_queue
        self.driver = driver
        print("In init a second class..")

    def run(self):
        """Process chunks forever; the thread is expected to run as a daemon."""
        while True:
            # Blocks until a page chunk is available.
            chunk = self.out_queue.get()
            try:
                soup = BeautifulSoup(chunk)
                print(soup.findAll(['title']))
                print(self.driver.name)
            except Exception as exc:
                # A page that cannot be parsed must not kill the consumer.
                print("Failed to parse page: %s" % exc)
            finally:
                # Always acknowledge the item. Without this, a single failure
                # would leave out_queue.join() in main() blocked forever.
                self.out_queue.task_done()
# Wall-clock start time, recorded before main() runs; used for the
# "Elapsed Time" report printed at the end of the script.
start = time.time()
def main():
    """Load every entry of ``hosts`` in Firefox and print the page titles.

    Starts five Login_Driver threads (one Firefox instance each), puts all
    hosts on ``queue``, starts five Poster threads on ``out_queue`` and
    blocks until both queues have been fully processed.

    Every browser that was started is closed again before returning, even if
    start-up or processing fails part-way. No sleeps are needed between the
    phases: the worker threads simply block on their queue until work
    arrives.
    """
    drivers = []
    try:
        # Spawn the pool of browser-backed workers.
        for i in range(5):
            driver = webdriver.Firefox()
            # Track it straight away so it is closed even on a later failure.
            drivers.append(driver)
            t = Login_Driver(queue, out_queue, driver)
            t.setDaemon(True)
            t.start()
            print("Started webdriver: --- " + str(i) + " --- from main")
        print("All started")

        # Populate the work queue.
        for host in hosts:
            queue.put(host)
            print("Opening website: " + host)
        print("All sites passed for opening..")

        # Pair each consumer with its own browser rather than handing all of
        # them whichever driver the start-up loop happened to create last.
        for i, driver in enumerate(drivers):
            dt = Poster(out_queue, driver)
            dt.setDaemon(True)
            dt.start()
            print("Starting second class/title and name beautifull soup and webdriver: --- " + str(i) + " --- from main")
        print("Started secound class..")

        # Wait until every host has been loaded and every page parsed.
        queue.join()
        out_queue.join()
        print("out_queue.join()")
    finally:
        # Shut the browsers down so no Firefox processes are left behind.
        for driver in drivers:
            try:
                driver.quit()
            except Exception as exc:
                print("Failed to quit browser: %s" % exc)
# Run the crawl, then report the wall-clock time elapsed since `start`.
main()
print("Elapsed Time: %s" % (time.time() - start,))

2 个答案:

答案 0 :(得分:2)

您没有正确实例化队列。下面这种写法是错误的:

# Binds both names to the Queue class itself; no queue object is created.
queue = Queue.Queue
out_queue = Queue.Queue

应该是

# Calling Queue.Queue() creates a separate queue instance for each name.
queue = Queue.Queue()
out_queue = Queue.Queue()

答案 1 :(得分:2)

您需要使用 Queue.Queue()(带括号,才会真正创建队列实例)来代替 Queue.Queue(不带括号时只是引用了类本身)。