我正在尝试抓取隐藏框架(iframe)内的数据;该框架的HTML如下所示:
<!-- Content of the details tabs here -->
<div id="tabDetail_0" class="tab_content tab_detail" style="display:
block;"><iframe id="iframe_0" src="https://www.tmdn.org/tmview/get-
detail?st13=GB500000003342197" width="100%" height="600px;"
frameborder="0"></iframe></div></div></div> <!-- resultTabs -->
如您所见,HTML中有一个链接。我尝试打开一个新的webdriver实例并直接访问该链接来获取数据,这种方法起初可以正常工作,但后来网站阻止了我的访问,因为不允许直接访问这些链接。
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
import traceback
import time
# Scrape TMview advanced-search results (United Kingdom, UKIPO office,
# status Filed/Registered, application dates 01-10-2018 .. 31-10-2018),
# enrich every row with the owner address and representative name read from
# the row's detail iframe, and export the collected rows to an Excel file.

COLUMNS = [
    'Designated_territory', 'Applicant_name', 'Trade_mark_name',
    'Application_date', 'Application_number', 'Trade_mark_type',
    'Nice_class', 'Owner_Address', 'Trade_mark_office',
    'Registration_number', 'Trade_mark_status', 'Registration_date',
    'Representative_Name',
]
NO_ENTRY = 'No Entry'
PAGE_COUNT = 72  # number of result pages walked (same count as range(1, 73))


def _pick_multiselect(driver, dropdown_id, label_selector, wanted_texts):
    """Open a multi-select dropdown, tick the wanted labels, and close it.

    driver         -- the Selenium WebDriver in use
    dropdown_id    -- id of the element that toggles the dropdown
    label_selector -- CSS selector matching the option labels
    wanted_texts   -- label texts that should be clicked
    """
    driver.find_element_by_id(dropdown_id).click()
    time.sleep(5)
    for elem in driver.find_elements_by_css_selector(label_selector):
        if elem.text in wanted_texts:
            elem.click()
    time.sleep(5)
    # Clicking the toggle a second time closes the dropdown again.
    driver.find_element_by_id(dropdown_id).click()


def _fill_field(driver, field_id, value):
    """Clear the input with id *field_id* and type *value* into it."""
    field = driver.find_element_by_id(field_id)
    field.clear()
    field.send_keys(value)


def _apply_search_filters(driver):
    """Fill in the advanced-search form, run the search, show 100 rows/page."""
    driver.find_element_by_name("lnkAdvancedSearch").click()
    time.sleep(5)

    # Designated territories
    _pick_multiselect(driver, 'DesignatedTerritories',
                      'div.optEUGroupContainer label', ('United Kingdom',))
    time.sleep(5)
    # Trade mark offices
    _pick_multiselect(driver, 'SelectedOffices',
                      'div.multiSelectOptions label',
                      ('GB United Kingdom ( UKIPO )',))
    # Trade mark status
    _pick_multiselect(driver, 'TradeMarkStatus',
                      'div.multiSelectOptions label', ('Filed', 'Registered'))

    # Application date range
    _fill_field(driver, "ApplicationDateFrom", '01-10-2018')
    _fill_field(driver, "ApplicationDateTo", '31-10-2018')

    time.sleep(5)
    driver.find_element_by_id("SearchCopy").click()
    time.sleep(5)
    driver.find_element_by_link_text('100').click()
    time.sleep(5)


def _labelled_cell_text(table, label):
    """Return the stripped text of the <td> following the <td> reading *label*.

    Returns NO_ENTRY when *table* is None or the labelled cell is missing
    (both surface as AttributeError on the chained lookups).
    """
    try:
        return table.find("td", text=label).find_next('td').text.strip()
    except AttributeError:
        return NO_ENTRY


def _read_detail(driver, row_index):
    """Open the detail tab of result row *row_index* and parse its iframe.

    Returns a (owner_address, representative_name) tuple. The detail tab is
    left open; the caller closes it.
    """
    name_cell = driver.find_elements_by_class_name('cell_tmName_column')[row_index]
    action = webdriver.common.action_chains.ActionChains(driver)
    action.move_to_element_with_offset(name_cell, 0, 0)
    action.click()
    action.perform()
    time.sleep(3)

    # The detail data lives inside the tab's iframe, so its markup is only
    # reachable through page_source while the driver is switched into it.
    iframe = driver.find_elements_by_tag_name('iframe')[0]
    driver.switch_to.frame(iframe)
    try:
        detail_soup = BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        # Always return to the top-level document, even if parsing fails.
        driver.switch_to.default_content()

    owner_table = detail_soup.find("div", id="anchorOwner").find_next('table')
    rep_table = detail_soup.find("div", id="anchorRepresentative").find_next('table')
    return (_labelled_cell_text(owner_table, "Address"),
            _labelled_cell_text(rep_table, "Name"))


def _collect_results(driver, records):
    """Walk PAGE_COUNT result pages and append one tuple per row to *records*."""
    for _ in range(PAGE_COUNT):
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        tbl = soup.find("table", id="grid")
        # tr_rows[0] is skipped (treated as the header row).
        for row_index, tr_row in enumerate(tbl.find_all('tr')[1:]):
            td_cells = tr_row.find_all('td')
            Trade_mark_name = td_cells[4].text
            Trade_mark_office = td_cells[5].text
            Designated_territory = td_cells[6].text
            Application_number = td_cells[7].text
            Registration_number = td_cells[8].text
            Trade_mark_status = td_cells[9].text
            Nice_class = td_cells[10].text
            Applicant_name = td_cells[11].text
            Application_date = td_cells[12].text
            Trade_mark_type = td_cells[13].text
            Registration_date = td_cells[14].text

            Owner_Address, Representative_Name = _read_detail(driver, row_index)

            records.append((
                Designated_territory, Applicant_name, Trade_mark_name,
                Application_date, Application_number, Trade_mark_type,
                Nice_class, Owner_Address, Trade_mark_office,
                Registration_number, Trade_mark_status, Registration_date,
                Representative_Name,
            ))
            time.sleep(1)
            driver.find_elements_by_css_selector('a.close_tab')[0].click()

        # Move on to the next page of results.
        driver.find_element_by_id('next_t_grid_toppager').click()
        time.sleep(2)


def _save_records(records, path):
    """Write *records* to the Excel file at *path* (sheet 'sheet1')."""
    df = pd.DataFrame(records, columns=COLUMNS)
    # No `encoding` argument: pandas deprecated it in 1.5 and removed it in 2.0.
    df.to_excel(path, sheet_name='sheet1', index=False)


option = webdriver.ChromeOptions()
# A value of 2 for "images" disables image loading in Chrome.
chrome_prefs = {
    "profile.default_content_settings": {"images": 2},
    "profile.managed_default_content_settings": {"images": 2},
}
option.experimental_options["prefs"] = chrome_prefs

url = "https://www.tmdn.org/tmview/welcome#"
xlsName = 'D:\\test.xlsx'
records = []
start_time = time.time()

# Raw string: the backslashes in the Windows path are literal.
driver = webdriver.Chrome(executable_path=r"D:\Python\chromedriver.exe", chrome_options=option)
try:
    driver.get(url)
    time.sleep(10)
    driver.find_element_by_xpath('//*[@id="buttonBox"]/a').click()
    time.sleep(10)
    try:
        _apply_search_filters(driver)
        _collect_results(driver, records)
    except Exception:
        # Top-level boundary: report the failure but keep what was scraped.
        traceback.print_exc()
    finally:
        # Export on success and on failure alike, so partial results survive.
        _save_records(records, xlsName)
    time.sleep(5)
finally:
    # Always release the browser, whatever happened above.
    driver.quit()
答案 0 :(得分:1)
您需要做的是使用 switch_to.frame:
# Enter the detail iframe, located by its exact id, so that subsequent
# driver calls operate on the framed document.
driver.switch_to.frame(
    driver.find_element_by_xpath('//iframe[@id="iframe_0"]')
)
# Then return to the top-level document:
driver.switch_to.default_content()
编辑:
您问到如果id会变化该怎么办。一个思路是在xpath中使用contains,像这样:
# Match an iframe whose id contains "iframe_" instead of one exact id.
# Make sure only one element matches; to check, run
# `iframes = driver.find_elements_by_xpath('//iframe[contains(@id,"iframe_")]')`
# and then `print(len(iframes))` to see how many iframes were found.
detail_frame = driver.find_element_by_xpath(
    '//iframe[contains(@id,"iframe_")]'
)
driver.switch_to.frame(detail_frame)
# Then return to the top-level document:
driver.switch_to.default_content()
在您的代码中使用:
import time
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
# Demo: run a TMview advanced search, open the first result's detail tab,
# switch into its iframe and print the framed document's HTML.
url = "https://www.tmdn.org/tmview/welcome#"
driver = webdriver.Chrome(executable_path=r"D:\New Proj\chromedriver.exe")
driver.get(url)
time.sleep(3)
driver.find_element_by_xpath('//*[@id="buttonBox"]/a').click()
time.sleep(3)

# Click advanced search
driver.find_element_by_name("lnkAdvancedSearch").click()
time.sleep(5)

# Select designated territories
driver.find_element_by_id('DesignatedTerritories').click()
time.sleep(5)
TerritoryLabelElements = driver.find_elements_by_css_selector('div.optEUGroupContainer label')
for elem in TerritoryLabelElements:
    if elem.text == 'United Kingdom':
        elem.click()
time.sleep(5)
driver.find_element_by_id('DesignatedTerritories').click()
time.sleep(5)

# Select trade mark offices
driver.find_element_by_id('SelectedOffices').click()
time.sleep(5)
TerritoryLabelElements = driver.find_elements_by_css_selector('div.multiSelectOptions label')
for elem in TerritoryLabelElements:
    if elem.text == 'GB United Kingdom ( UKIPO )':
        elem.click()
time.sleep(5)
driver.find_element_by_id('SelectedOffices').click()

# Select trade mark status
driver.find_element_by_id('TradeMarkStatus').click()
time.sleep(5)
TerritoryLabelElements = driver.find_elements_by_css_selector('div.multiSelectOptions label')
for elem in TerritoryLabelElements:
    if elem.text in ('Filed', 'Registered'):
        elem.click()
time.sleep(5)
driver.find_element_by_id('TradeMarkStatus').click()

# Application date range
startdate = driver.find_element_by_id("ApplicationDateFrom")
startdate.clear()
startdate.send_keys('10-01-2018')
enddate = driver.find_element_by_id("ApplicationDateTo")
enddate.clear()
enddate.send_keys('10-01-2018')

# Click search
time.sleep(5)
driver.find_element_by_id("SearchCopy").click()
time.sleep(30)

# Click the first result link
el = driver.find_elements_by_class_name('cell_tmName_column')[0]
action = ActionChains(driver)
action.move_to_element_with_offset(el, 0, 0)
action.click()
action.perform()
time.sleep(10)

iframe = driver.find_element_by_xpath('//iframe[@id="iframe_0"]')
driver.switch_to.frame(iframe)
try:
    # Do something here; this demo prints the HTML of the framed document.
    # page_source is read from the driver because the `iframe` handle belongs
    # to the parent document, and an <iframe> element's innerHTML is not the
    # framed page's markup.
    print(driver.page_source)
finally:
    # Then switch back to the top-level document, even if the work failed.
    driver.switch_to.default_content()
希望这对您有帮助!