所以我创建了一个带有selenium的web scraper,它可以无限地抓取一个网页。我正在尝试创建这个刮刀的两个实例并将它们并行运行,以便同时刮取网站的两个不同部分(或完全两个不同的网站)。使用我当前的代码,两个进程都启动,两个chrome实例启动,但只有一个实际开始抓取。另一个只是坐在着陆页上,永远不会移动。我目前的刮刀类看起来像这样
class clBot(Scraper):
def __init__(self, light_or_dark):
light_side_xpaths = ['//*[@id="hhh"]/h4/a', '//*[@id="sss"]/h4/a/', '//*[@id="jjj"]/h4/a',
'//*[@id="bbb"]/h4/a', '//*[@id="ggg"]/h4/a']
dark_side_xpaths = ['//*[@id="ccc"]/h4/a', '//*[@id="ppp"]/h4', '//*[@id="forums"]/h4/a']
if light_or_dark == "light":
self.xpaths_to_scrape = light_side_xpaths
self.csv_file = "lightside.csv"
elif light_or_dark == "dark":
self.xpaths_to_scrape = dark_side_xpaths
self.csv_file = "darkside.csv"
else:
print('Incorrect variable entered. Please enter "light" or "dark" when initializing this class')
quit()
self.user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36'
self.options = webdriver.ChromeOptions()
#self.options.add_argument('--headless')
self.options.add_argument('user-agent={self.user_agent}')
self.current_region = ''
self.driver = webdriver.Chrome(chrome_options=self.options)
self.driver.get('https://craigslist.org')
def run(self):
self.navigate_pages()
def identify_phone_number(self, string, phone_number_list):
reg = re.findall(".*?(\(?\d{3}\D{0,3}\d{3}\D{0,3}\d{4}).*?", string)
if len(reg) > 0:
for r in reg:
if r.strip() not in phone_number_list:
with open(self.csv_file, 'a') as csv:
csv.write("{}\n".format(r.strip()))
print("Extracted {} from listing".format(r.strip()))
else:
print('Phone number already in list.')
def extract_phone_number(self):
try:
with open(self.csv_file, 'r') as csv:
current_phone_numbers = csv.read()
posting_body = self.driver.find_element_by_id('postingbody')
self.scraper_wait_class_until_all(self.driver, 'showcontact', seconds_to_wait=5)
contact_info = self.driver.find_element_by_class_name('showcontact')
contact_info.click()
time.sleep(1)
self.identify_phone_number(posting_body.text, current_phone_numbers)
except TimeoutException:
self.identify_phone_number(posting_body.text, current_phone_numbers)
print('There is no phone number in this listing.')
def scrape_pages(self):
i=1
while True:
try:
self.scraper_wait_class_until_all(self.driver, 'result-row')
results = self.driver.find_elements_by_class_name('result-row')
print("clicking result {}".format(i))
results[i].find_element_by_class_name('result-title').click()
self.extract_phone_number()
self.driver.back()
i+=1
except IndexError:
self.scraper_wait_xpath_until_any(self.driver, '//*[@id="searchform"]/div[5]/div[3]/span[2]/a[3]')
next_button = self.driver.find_element_by_xpath('//*[@id="searchform"]/div[5]/div[3]/span[2]/a[3]')
print('Navigating to next page.')
next_button.click()
i=1
def choose_xpath_to_scrape(self, list_of_xpaths):
xpath_index = randint(0, len(list_of_xpaths)-1)
xpath = list_of_xpaths[xpath_index]
return xpath
def navigate_pages(self):
try:
while True:
try:
self.scraper_wait_xpath_until_any(self.driver, '//*[@id="rightbar"]')
rightbar = self.driver.find_element_by_xpath('//*[@id="rightbar"]')
nearby_cl = rightbar.find_element_by_xpath('//*[@id="rightbar"]/ul/li[1]')
child_items = nearby_cl.find_elements_by_class_name('s')
random = randint(1, len(child_items)-1)
time.sleep(3)
print("Clicking {}".format(child_items[random].text))
child_items[random].click()
for xpath in self.xpaths_to_scrape:
area_to_scrape = self.driver.find_element_by_xpath(self.choose_xpath_to_scrape(self.xpaths_to_scrape))
area_to_scrape.click()
self.scrape_pages()
self.driver.back()
time.sleep(1)
except WebDriverException:
continue
except Exception as e:
print(e)
return
finally:
self.driver.quit()
和打开两个进程并初始化它们的main.py文件如下:
import scraper
from multiprocessing import Process, Manager
if __name__ == "__main__":
manager = Manager()
d = manager.dict()
l = manager.list(range(10))
darksideScraper = scraper.clBot('light')
lightsideScraper = scraper.clBot('dark')
darkside = Process(target=darksideScraper.navigate_pages())
lightside = Process(target=lightsideScraper.navigate_pages())
darkside.start()
lightside.start()
darkside.join()
lightside.join()
任何帮助将不胜感激!
答案 0 :(得分:1)
尝试将目标作为对函数的引用传递,而不是像Process(target=darksideScraper.navigate_pages)
那样调用它。另请参阅this,了解如何使用多处理的另一个示例。