Iterating a for loop in Selenium over lists of different sizes

Time: 2018-07-18 08:52:42

Tags: python-3.x selenium-webdriver web-scraping


from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException
from selenium.common.exceptions import NoSuchElementException
import itertools
import time
from time import gmtime, strftime

print("*" * 60)
print("MIPYTHON .com PYTHON PROXY SCRAPE and GO SCRIPT in SELENIUM")
print("*" * 60)

browser = webdriver.Chrome(executable_path='chromedriver.exe')
proxy_website = "https://free-proxy-list.net/"
proxy1 = []
proxy_port1 = []
sachin = []
newlist = []

browser.get(proxy_website)
browser.find_element_by_xpath('/html/body/header/div[1]/div/div[1]/div/div/a[2]').click()
proxy = browser.find_elements_by_xpath("//tr[@role='row']/td[1]")
proxy_port = browser.find_elements_by_xpath("//tr[@role='row']/td[2]")

for element in proxy:
    proxy1.append(element.text)
for element in proxy_port:
    proxy_port1.append(element.text)
for i in range(len(proxy1)):
    safale = proxy1[i] + ":" + proxy_port1[i]
    sachin.append(safale)

content = ['https://www.facebook.com/pg/virsaamusical/events/',
           'https://www.facebook.com/pg/TheInnerUniverse.withVirsaaIndia/events/',
           'https://www.facebook.com/pg/sadhguru/events/',
           'https://www.facebook.com/pg/after8events/events/',
           'https://www.facebook.com/pg/dovlinent/events/']

for ip, link in zip(itertools.cycle(sachin), content):
    try:
        path_to_chromedriver = 'C:/Users/Admin/AppData/Local/Programs/Python/Python37-32/chromedriver.exe'
        chrome_options = webdriver.ChromeOptions()
        prefs = {"profile.default_content_setting_values.notifications": 2}
        chrome_options.add_experimental_option("prefs", prefs)
        PROXY = ip
        chrome_options.add_argument("start-maximized")
        chrome_options.add_argument('--proxy-server=%s' % PROXY)
        browser = webdriver.Chrome(chrome_options=chrome_options, executable_path=path_to_chromedriver)
        browser.get('https://www.facebook.com/')
        browser.find_element_by_id('email').send_keys('USERID')
        browser.find_element_by_id('pass').send_keys('PASSWORD')
        browser.find_element_by_id('loginbutton').click()
        browser.get(link)
        mainclass = browser.find_elements_by_css_selector('._4dmd._4eok.uiGrid._51mz')

        Month1 = []
        Date1 = []
        Event_link1 = []
        Event_Name1 = []
        Event_Time1 = []
        No_of_Guests1 = []
        Organizer_HomepageLink1 = []
        Location1 = []

        SCROLL_PAUSE_TIME = 2

        # Get scroll height
        last_height = browser.execute_script("return document.body.scrollHeight")
        while True:
            # Scroll down to bottom
            browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            # Wait to load page
            time.sleep(SCROLL_PAUSE_TIME)
            # Calculate new scroll height and compare with last scroll height
            new_height = browser.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        browser.implicitly_wait(20)
        Month = browser.find_elements_by_xpath("//tr[@class='_51mx']/td[1]/span[1]/span[1]")
        Date = browser.find_elements_by_xpath("//tr[@class='_51mx']/td[1]/span[1]/span[2]")
        Event_link = browser.find_elements_by_xpath("//tr[@class='_51mx']/td[2]/div[1]/div[1]/a[1]")
        Event_Name = browser.find_elements_by_xpath("//tr[@class='_51mx']/td[2]/div[1]/div[1]/a[1]/span[1]")
        Event_Time = browser.find_elements_by_xpath("//tr[@class='_51mx']/td[2]/div[1]/div[2]/span[1]")
        No_of_Guests = browser.find_elements_by_xpath("//tr[@class='_51mx']/td[2]/div[1]/div[2]")
        Organizer_HomepageLink = browser.find_elements_by_xpath("//tr[@class='_51mx']/td[3]/div[1]/div[1]/a[1]")
        Location = browser.find_elements_by_xpath("//tr[@class='_51mx']/td[3]/div[1]/div[2]")

        try:
            for element in Month:
                Month1.append(element.text)
        except StaleElementReferenceException:
            pass
        except NoSuchElementException:
            pass
        try:
            for element in Date:
                Date1.append(element.text)
        except StaleElementReferenceException:
            pass
        except NoSuchElementException:
            pass
        try:
            for element in Event_link:
                sep = '?ref_page_id'
                text = element.get_attribute('href')
                rest = text.split(sep, 1)[0]
                Event_link1.append(rest)
        except StaleElementReferenceException:
            pass
        except NoSuchElementException:
            pass
        try:
            for element in Event_Name:
                Lco = element.text
                lcoa = Lco.replace(",", "|")
                Event_Name1.append(lcoa)
        except StaleElementReferenceException:
            pass
        except NoSuchElementException:
            pass
        try:
            for element in Event_Time:
                Event_Time1.append(element.text)
        except StaleElementReferenceException:
            pass
        except NoSuchElementException:
            pass
        try:
            for element in No_of_Guests:
                Lco = element.text
                lcoa = Lco.replace(",", "")
                Lcoas = lcoa.replace(" . ", ",")
                No_of_Guests1.append(Lcoas)
        except StaleElementReferenceException:
            pass
        except NoSuchElementException:
            pass
        try:
            for element in Organizer_HomepageLink:
                sep = '?ref_page_id'
                text = element.get_attribute('href')
                rest = text.split(sep, 1)[0]
                Organizer_HomepageLink1.append(rest)
        except StaleElementReferenceException:
            pass
        except NoSuchElementException:
            pass
        try:
            for element in Location:
                Lco = element.text
                lcoa = Lco.replace(",", "|")
                Location1.append(lcoa)
        except StaleElementReferenceException:
            pass
        except NoSuchElementException:
            pass

        with open('E:\Events.csv', 'a') as f:
            headers = "Source Link,Month_Of_Event,Date-Of-Event,Event_Link,Event-Name,Time & No-of-Guests,Organizer-Link,Location, IP Used, Time Scraped"
            f.write(headers)
            f.write("\n")
            try:
                current = strftime("%Y-%m-%d %H:%M:%S", gmtime())
                seperator = ':'
                for i in range(len(Month1)):
                    f.write(link + "," + Month1[i] + "," + Date1[i] + "," + Event_link1[i] + "," +
                            Event_Name1[i] + "," + No_of_Guests1[i] + "," +
                            Organizer_HomepageLink1[i] + "," + Location1[i] + "," +
                            PROXY.split(seperator, 1)[0] + "," + current + "\n")
            except IndexError:
                pass
    except NoSuchElementException:
        pass
    browser.close()

The above code does what I want; it only needs a change in one place. In this loop

for ip, link in zip(itertools.cycle(sachin), content):

it takes an IP from the sachin list and a link from content. If the proxy IP is good, everything works fine, but if the proxy does not respond, the loop body throws an exception, which I am currently handling with pass. What I actually want is this: if an IP does not respond, the script should move on to the next IP in the sachin list and retry the same link it stopped at.
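For clarity, this is roughly the behaviour I am after. It is only a minimal sketch, assuming that a dead proxy surfaces as a WebDriverException (or a page-load timeout) and that the whole scraping body above is collapsed into a hypothetical scrape_events() helper; sachin and content are the same lists built by the script above.

import itertools

from selenium import webdriver
from selenium.common.exceptions import WebDriverException

proxy_cycle = itertools.cycle(sachin)

for link in content:
    # Keep pulling proxies for the SAME link until one of them works.
    # Note: this loops forever if every proxy in sachin is dead.
    while True:
        ip = next(proxy_cycle)
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--proxy-server=%s' % ip)
        browser = webdriver.Chrome(chrome_options=chrome_options,
                                   executable_path='chromedriver.exe')
        browser.set_page_load_timeout(30)  # treat a non-responding proxy as a timeout
        try:
            browser.get(link)
            scrape_events(browser, link, ip)  # hypothetical helper holding the scraping code above
            browser.quit()
            break  # this link is done, move to the next link
        except WebDriverException:
            browser.quit()  # proxy did not respond, retry the same link with the next ip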

0 answers:

There are no answers yet.