我是 Python 的新手，正在尝试从一个有多个分页的网站中抓取一个表格。我应该如何在代码中使用 .click()？这段翻页代码应该放在哪里，才能动态抓取到每一页的表格？
我正在尝试的网站是 https://free-proxy-list.net/ ，并且已经能够从第一页抓到表格。我想抓取所有页面并把数据放进一个 pandas 数据框（DataFrame）。我已经把表中的信息放进了字典，并尝试用这个字典构造数据框。但是只有第一页的数据进入了数据框，我还需要其他页面上的全部数据。
答案 0（得分：1）
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import pandas as pd
# Scrape the proxy table at https://free-proxy-list.net/ across multiple
# paginated views by clicking the "Next" link, then collect every row into
# a single pandas DataFrame and write it to CSV.
driver = webdriver.Chrome()
driver.get('https://free-proxy-list.net/')

page = 1
max_page = 15  # number of table pages to scrape

# One list per table column; rows from every page are appended here so the
# final DataFrame contains all pages, not just the first one.
IP = []
Port = []
Code = []
Country = []
Anonymity = []
Google = []
Https = []
LastCheck = []

try:
    while page <= max_page:
        # Wait until the current page of table rows has been rendered.
        rows = WebDriverWait(driver, 20).until(
            EC.visibility_of_all_elements_located(
                (By.XPATH, "//table[@id='proxylisttable']/tbody//tr")))
        for row in rows:
            # Selenium 4 removed find_element_by_xpath(); use
            # find_element(By.XPATH, ...) instead.
            IP.append(row.find_element(By.XPATH, './td[1]').text)
            Port.append(row.find_element(By.XPATH, './td[2]').text)
            Code.append(row.find_element(By.XPATH, './td[3]').text)
            # textContent is read via get_attribute so the value is returned
            # even when the cell is not visibly rendered.
            Country.append(row.find_element(By.XPATH, './td[4]').get_attribute('textContent'))
            Anonymity.append(row.find_element(By.XPATH, './td[5]').text)
            Google.append(row.find_element(By.XPATH, './td[6]').get_attribute('textContent'))
            Https.append(row.find_element(By.XPATH, './td[7]').text)
            LastCheck.append(row.find_element(By.XPATH, './td[8]').get_attribute('textContent'))

        page += 1
        # Only click "Next" when another page is still needed; on the last
        # page the pagination link is disabled and the click would fail.
        if page <= max_page:
            WebDriverWait(driver, 20).until(EC.element_to_be_clickable(
                (By.XPATH, "//a[@aria-controls='proxylisttable' and text()='Next']"))).click()
            print('navigate to page: ' + str(page))
finally:
    # quit() ends the whole browser session (close() would only close the
    # current window and leave the driver process running).
    driver.quit()

df = pd.DataFrame({"IP": IP, "Port": Port, "Code": Code, "Country": Country,
                   "Anonymity": Anonymity, "Google": Google, "Https": Https,
                   "Last_Checked": LastCheck})
print(df)
df.to_csv('output_IP.csv', index=False)