The code below does what I want to do; only one part of it needs modifying, which I explain after the code.
from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException
from selenium.common.exceptions import NoSuchElementException
import itertools
import time
from time import gmtime, strftime
print("*" * 60)
print("MIPYTHON .com PYTHON PROXY SCRAPE and GO SCRIPT in SELENIUM")
print("*" * 60)
browser = webdriver.Chrome(executable_path='chromedriver.exe')
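# Scrape the free proxy list: collect the IP and port columns, then join them as "ip:port" strings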
proxy_website = "https://free-proxy-list.net/"
proxy1 = []
proxy_port1= []
sachin = []
newlist = []
browser.get(proxy_website)
browser.find_element_by_xpath('/html/body/header/div[1]/div/div[1]/div/div/a[2]').click()
proxy = browser.find_elements_by_xpath("//tr[@role='row']/td[1]")
proxy_port = browser.find_elements_by_xpath("//tr[@role='row']/td[2]")
for element in proxy:
    proxy1.append(element.text)
for element in proxy_port:
    proxy_port1.append(element.text)
for i in range(len(proxy1)):
    safale = proxy1[i]+":"+proxy_port1[i]
    sachin.append(safale)
content = ['https://www.facebook.com/pg/virsaamusical/events/','https://www.facebook.com/pg/TheInnerUniverse.withVirsaaIndia/events/','https://www.facebook.com/pg/sadhguru/events/','https://www.facebook.com/pg/after8events/events/','https://www.facebook.com/pg/dovlinent/events/']
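# Visit each target page, taking the next proxy from the scraped list for every page
# (itertools.cycle repeats the proxies if there are fewer proxies than pages)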
for ip,link in zip(itertools.cycle(sachin), content):
    try:
        path_to_chromedriver = 'C:/Users/Admin/AppData/Local/Programs/Python/Python37-32/chromedriver.exe'
        chrome_options = webdriver.ChromeOptions()
        prefs = {"profile.default_content_setting_values.notifications": 2}
        chrome_options.add_experimental_option("prefs", prefs)
        PROXY = ip
        chrome_options.add_argument("start-maximized")
        chrome_options.add_argument('--proxy-server=%s' % PROXY)
        browser = webdriver.Chrome(chrome_options=chrome_options, executable_path=path_to_chromedriver)
        browser.get('https://www.facebook.com/')
        browser.find_element_by_id('email').send_keys('USERID')
        browser.find_element_by_id('pass').send_keys('PASSWORD')
        browser.find_element_by_id('loginbutton').click()
        browser.get(link)
        mainclass = browser.find_elements_by_css_selector('._4dmd._4eok.uiGrid._51mz')
        Month1 = []
        Date1 = []
        Event_link1 = []
        Event_Name1 = []
        Event_Time1 = []
        No_of_Guests1 = []
        Organizer_HomepageLink1 = []
        Location1 = []
        SCROLL_PAUSE_TIME = 2
        # Get scroll height
        last_height = browser.execute_script("return document.body.scrollHeight")
        while True:
            # Scroll down to bottom
            browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            # Wait to load page
            time.sleep(SCROLL_PAUSE_TIME)
            # Calculate new scroll height and compare with last scroll height
            new_height = browser.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height
        browser.implicitly_wait(20)
        Month = browser.find_elements_by_xpath("//tr[@class='_51mx']/td[1]/span[1]/span[1]")
        Date = browser.find_elements_by_xpath("//tr[@class='_51mx']/td[1]/span[1]/span[2]")
        Event_link = browser.find_elements_by_xpath("//tr[@class='_51mx']/td[2]/div[1]/div[1]/a[1]")
        Event_Name = browser.find_elements_by_xpath("//tr[@class='_51mx']/td[2]/div[1]/div[1]/a[1]/span[1]")
        Event_Time = browser.find_elements_by_xpath("//tr[@class='_51mx']/td[2]/div[1]/div[2]/span[1]")
        No_of_Guests = browser.find_elements_by_xpath("//tr[@class='_51mx']/td[2]/div[1]/div[2]")
        Organizer_HomepageLink = browser.find_elements_by_xpath("//tr[@class='_51mx']/td[3]/div[1]/div[1]/a[1]")
        Location = browser.find_elements_by_xpath("//tr[@class='_51mx']/td[3]/div[1]/div[2]")
        try:
            for element in Month:
                Month1.append(element.text)
        except StaleElementReferenceException:
            pass
        except NoSuchElementException:
            pass
        try:
            for element in Date:
                Date1.append(element.text)
        except StaleElementReferenceException:
            pass
        except NoSuchElementException:
            pass
        try:
            for element in Event_link:
                sep = '?ref_page_id'
                text = element.get_attribute('href')
                rest = text.split(sep, 1)[0]
                Event_link1.append(rest)
        except StaleElementReferenceException:
            pass
        except NoSuchElementException:
            pass
        try:
            for element in Event_Name:
                Lco = element.text
                lcoa = Lco.replace(",", "|")
                Event_Name1.append(lcoa)
        except StaleElementReferenceException:
            pass
        except NoSuchElementException:
            pass
        try:
            for element in Event_Time:
                Event_Time1.append(element.text)
        except StaleElementReferenceException:
            pass
        except NoSuchElementException:
            pass
        try:
            for element in No_of_Guests:
                Lco = element.text
                lcoa = Lco.replace(",", "")
                Lcoas = lcoa.replace(" . ", ",")
                No_of_Guests1.append(Lcoas)
        except StaleElementReferenceException:
            pass
        except NoSuchElementException:
            pass
        try:
            for element in Organizer_HomepageLink:
                sep = '?ref_page_id'
                text = element.get_attribute('href')
                rest = text.split(sep, 1)[0]
                Organizer_HomepageLink1.append(rest)
        except StaleElementReferenceException:
            pass
        except NoSuchElementException:
            pass
        try:
            for element in Location:
                Lco = element.text
                lcoa = Lco.replace(",", "|")
                Location1.append(lcoa)
        except StaleElementReferenceException:
            pass
        except NoSuchElementException:
            pass
        with open('E:\Events.csv', 'a') as f:
            headers = ("Source Link,Month_Of_Event,Date-Of-Event,Event_Link,Event-Name,Time & No-of-Guests,Organizer-Link,Location, IP Used, Time Scraped")
            f.write(headers)
            f.write("\n")
            try:
                current = strftime("%Y-%m-%d %H:%M:%S", gmtime())
                seperator = ':'
                for i in range(len(Month1)):
                    f.write(link + "," + Month1[i] + "," + Date1[i] + "," + Event_link1[i] + "," + Event_Name1[i] + "," + No_of_Guests1[i] + "," + Organizer_HomepageLink1[i] + "," + Location1[i] + "," + PROXY.split(seperator, 1)[0] + "," + current + "\n")
            except IndexError:
                pass
    except NoSuchElementException:
        pass
    browser.close()
In that loop, for ip, link in zip(itertools.cycle(sachin), content):, the script takes an IP from the sachin list and a link from content. If the proxy IP is good, everything runs fine, but if the proxy does not respond, an exception is thrown, which I currently handle with pass. What I actually want is this: if an IP does not respond, the script should move on to the next IP in the sachin list and retry the same link it stopped at.
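Roughly, the structure I have in mind is the sketch below. start_browser and scrape_page are just placeholder names for the setup and scraping code already shown above, and I am assuming a dead proxy surfaces as a WebDriverException or as the NoSuchElementException I already catch; I know I would also want to cap the retries so it cannot loop forever if every proxy is dead:

from itertools import cycle
from selenium.common.exceptions import WebDriverException, NoSuchElementException

proxy_iter = cycle(sachin)              # endless stream of "ip:port" strings

for link in content:                    # one pass per Facebook page
    while True:                         # retry the SAME link until one proxy works
        ip = next(proxy_iter)           # take the next proxy for this attempt
        browser = None
        try:
            browser = start_browser(ip)     # placeholder: ChromeOptions + --proxy-server=ip setup from above
            scrape_page(browser, link)      # placeholder: the login, scroll, scrape and CSV code from above
            break                           # this link succeeded, move on to the next link
        except (WebDriverException, NoSuchElementException):
            continue                        # proxy failed or page never loaded: same link, next proxy
        finally:
            if browser is not None:
                browser.close()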