Question

我正在使用Selenium和Python抓取网站。我无法使用Beautiful Soup抓取特定表。这是代码

from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup

import pandas as pd
from bs4 import BeautifulSoup

from selenium.webdriver.support import expected_conditions
link='http://omms.nic.in/#'
browser=webdriver.Firefox() 
browser.get(link)
time.sleep(15)
WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH, '/html/body/div[1]/div/div/ul/li[3]/a'))).click()
time.sleep(10)
WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH, '/html/body/div[1]/div/div/ul/li[3]/ul/li[1]/ul/li[5]/a'))).click()
select_state=Select(browser.find_element_by_xpath('/html/body/div[2]/div[1]/form/table/tbody/tr[1]/td[3]/select'))
select_state.select_by_index(35)
WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH, '/html/body/div[2]/div[1]/form/table/tbody/tr[3]/td[7]/input[1]'))).click()
select_district=Select(browser.find_element_by_xpath('/html/body/div[2]/div[1]/form/table/tbody/tr[1]/td[5]/select'))
options = [x.text for x in select_district.options]
select_district.select_by_index(3)
select_year=Select(browser.find_element_by_xpath('/html/body/div[2]/div[1]/form/table/tbody/tr[2]/td[3]/select'))
select_year.select_by_index(8)
time.sleep(10)
WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH, '/html/body/div[2]/div[1]/form/table/tbody/tr[4]/td[3]/input'))).click()
time.sleep(5)



soup=BeautifulSoup(browser.page_source,"html.parser")
table=soup.find_all('table',attrs={'class':"A35402edea1d24691942da96210fa88a3382"})
data_name = pd.read_html(str(table))[0]

我收到错误消息，因为没有找到表

Answer 1

这可能是因为您要查找的表不在browser.page_source下。它是从单独的iframe加载的。我们可以切换到框架，然后获取源。

browser.switch_to.frame(driver.find_element_by_xpath("//*[@id='loadReport']//iframe"))
print(browser.page_source)
soup = BeautifulSoup(browser.page_source,"html.parser")

如果您需要再次与页面互动，可以使用

切换回

browser.switch_to.default_content()

还考虑增加等待表加载的时间。

无法刮擦特定的桌子

1 个答案: