How to use pandas read_html to scrape a table that contains nested tables from a website

Time: 2019-07-16 10:52:05

Tags: python-3.x selenium-webdriver beautifulsoup

When I parse table [3], it contains sub-tables, and the sub-tables end up printed repeatedly. I tried the pandas read_html function.
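For context, pd.read_html returns one DataFrame per <table> element it finds, including nested ones, so a sub-table tends to show up twice: once as text flattened into the outer table's cell and once as its own DataFrame. A minimal, self-contained illustration of that behavior:

from io import StringIO
import pandas as pd

html = """
<table>
  <tr><td>outer cell</td></tr>
  <tr><td><table><tr><td>inner cell</td></tr></table></td></tr>
</table>
"""
tables = pd.read_html(StringIO(html))
print(len(tables))  # 2: the outer table and the nested one
print(tables[0])    # outer table; one cell holds the inner table's text
print(tables[1])    # the nested table again, as its own DataFrame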

Here is the code I am using:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd

url = 'https://maharerait.mahaonline.gov.in'
chrome_path = r'C:/Users/User/AppData/Local/Programs/Python/Python36/Scripts/chromedriver.exe'

driver = webdriver.Chrome(executable_path=chrome_path)
driver.get(url)

# Open the "Search Project Details" page
WebDriverWait(driver, 20).until(EC.element_to_be_clickable(
    (By.XPATH, "//div[@class='search-pro-details']//a[contains(.,'Search Project Details')]"))).click()

# Pick the "Registered Projects" radio button via JavaScript
Registered_Project_radio = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.ID, "Promoter")))
driver.execute_script("arguments[0].click();", Registered_Project_radio)

# Fill in the certificate number and run the search
Application = driver.find_element_by_id("CertiNo")
Application.send_keys("P50500000005")
Search = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "btnSearch")))
driver.execute_script("arguments[0].click();", Search)

# Collect the result links and open the first one
View = [item.get_attribute('href') for item in driver.find_elements_by_tag_name("a")
        if item.get_attribute('href') is not None]
View = View[0]
driver.get(View)

# Parse every <table> on the page; index 3 is the one I want,
# but it contains sub-tables
table = pd.read_html(driver.page_source)[3]
print(table)

# Append the table to a CSV file; the with block closes the file itself
with open("MyFile_table.csv", "a") as f:
    table.to_csv(f, sep=',', index=False)
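One way to stop the repetition (a sketch, not tested against this site) is to cut the nested tables out of the outer one with BeautifulSoup before handing the HTML to pandas. This reuses the driver from the code above, and index 3 is the same table index used there:

from io import StringIO
from bs4 import BeautifulSoup
import pandas as pd

soup = BeautifulSoup(driver.page_source, 'html.parser')
outer = soup.find_all('table')[3]      # same table index as above
for inner in outer.find_all('table'):  # drop every nested <table>
    inner.decompose()
table = pd.read_html(StringIO(str(outer)))[0]
print(table)

Alternatively, if the outer table carries an id or class, pd.read_html's attrs parameter (e.g. attrs={'id': 'some-id'}, where 'some-id' is a placeholder) narrows the parse to that one table, though any sub-tables inside it would still be flattened into its cells.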

0 Answers:

There are no answers yet.