Question

所以我创建了一个带有selenium的web scraper，它可以无限地抓取一个网页。我正在尝试创建这个刮刀的两个实例并将它们并行运行，以便同时刮取网站的两个不同部分（或完全两个不同的网站）。使用我当前的代码，两个进程都启动，两个chrome实例启动，但只有一个实际开始抓取。另一个只是坐在着陆页上，永远不会移动。我目前的刮刀类看起来像这样

class clBot(Scraper):

def __init__(self, light_or_dark):
    light_side_xpaths = ['//*[@id="hhh"]/h4/a', '//*[@id="sss"]/h4/a/', '//*[@id="jjj"]/h4/a',
                              '//*[@id="bbb"]/h4/a', '//*[@id="ggg"]/h4/a']
    dark_side_xpaths = ['//*[@id="ccc"]/h4/a', '//*[@id="ppp"]/h4', '//*[@id="forums"]/h4/a']
    if light_or_dark == "light":
        self.xpaths_to_scrape = light_side_xpaths
        self.csv_file = "lightside.csv"
    elif light_or_dark == "dark":
        self.xpaths_to_scrape = dark_side_xpaths
        self.csv_file = "darkside.csv"
    else:
        print('Incorrect variable entered. Please enter "light" or "dark" when initializing this class')
        quit()
    self.user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36'
    self.options = webdriver.ChromeOptions()
    #self.options.add_argument('--headless')
    self.options.add_argument('user-agent={self.user_agent}')
    self.current_region = ''
    self.driver = webdriver.Chrome(chrome_options=self.options)
    self.driver.get('https://craigslist.org')

    def run(self):
        self.navigate_pages()


def identify_phone_number(self, string, phone_number_list):
    reg = re.findall(".*?(\(?\d{3}\D{0,3}\d{3}\D{0,3}\d{4}).*?", string)
    if len(reg) > 0:
        for r in reg:
            if r.strip() not in phone_number_list:
                with open(self.csv_file, 'a') as csv:
                    csv.write("{}\n".format(r.strip()))
                print("Extracted {} from listing".format(r.strip()))
            else:
                print('Phone number already in list.')


def extract_phone_number(self):
    try:
        with open(self.csv_file, 'r') as csv:
            current_phone_numbers = csv.read()
        posting_body = self.driver.find_element_by_id('postingbody')
        self.scraper_wait_class_until_all(self.driver, 'showcontact', seconds_to_wait=5)
        contact_info = self.driver.find_element_by_class_name('showcontact')
        contact_info.click()
        time.sleep(1)
        self.identify_phone_number(posting_body.text, current_phone_numbers)
    except TimeoutException:
        self.identify_phone_number(posting_body.text, current_phone_numbers)
        print('There is no phone number in this listing.')



def scrape_pages(self):
    i=1
    while True:
        try:
            self.scraper_wait_class_until_all(self.driver, 'result-row')
            results = self.driver.find_elements_by_class_name('result-row')
            print("clicking result {}".format(i))
            results[i].find_element_by_class_name('result-title').click()
            self.extract_phone_number()
            self.driver.back()
            i+=1
        except IndexError:
            self.scraper_wait_xpath_until_any(self.driver, '//*[@id="searchform"]/div[5]/div[3]/span[2]/a[3]')
            next_button = self.driver.find_element_by_xpath('//*[@id="searchform"]/div[5]/div[3]/span[2]/a[3]')
            print('Navigating to next page.')
            next_button.click()
            i=1

def choose_xpath_to_scrape(self, list_of_xpaths):
    xpath_index = randint(0, len(list_of_xpaths)-1)
    xpath = list_of_xpaths[xpath_index]
    return xpath
def navigate_pages(self):
    try:
        while True:
            try:
                self.scraper_wait_xpath_until_any(self.driver, '//*[@id="rightbar"]')
                rightbar = self.driver.find_element_by_xpath('//*[@id="rightbar"]')
                nearby_cl = rightbar.find_element_by_xpath('//*[@id="rightbar"]/ul/li[1]')
                child_items = nearby_cl.find_elements_by_class_name('s')
                random = randint(1, len(child_items)-1)
                time.sleep(3)
                print("Clicking {}".format(child_items[random].text))
                child_items[random].click()
                for xpath in self.xpaths_to_scrape:
                    area_to_scrape = self.driver.find_element_by_xpath(self.choose_xpath_to_scrape(self.xpaths_to_scrape))
                    area_to_scrape.click()
                    self.scrape_pages()
                    self.driver.back()
                    time.sleep(1)
            except WebDriverException:
                continue
    except Exception as e:
        print(e)
        return
    finally:
        self.driver.quit()

和打开两个进程并初始化它们的main.py文件如下：

import scraper

from multiprocessing import Process, Manager



if __name__ == "__main__":
    manager = Manager()
    d = manager.dict()
    l = manager.list(range(10))
    darksideScraper = scraper.clBot('light')
    lightsideScraper = scraper.clBot('dark')

    darkside = Process(target=darksideScraper.navigate_pages())
    lightside = Process(target=lightsideScraper.navigate_pages())
    darkside.start()
    lightside.start()
    darkside.join()
    lightside.join()

任何帮助将不胜感激！

Answer 1

尝试将目标作为对函数的引用传递，而不是像Process(target=darksideScraper.navigate_pages)那样调用它。另请参阅this，了解如何使用多处理的另一个示例。

创建Selenium Scraper类的多个实例并运行并行

1 个答案: