Scrape隐藏框架JavaScript

时间:2019-02-19 11:00:52

标签: python selenium google-chrome

我正在尝试抓取隐藏框架内的数据;该框架的HTML如下所示:

<!-- Content of the details tabs here  -->
    <div id="tabDetail_0" class="tab_content tab_detail" style="display: 
    block;"><iframe id="iframe_0" src="https://www.tmdn.org/tmview/get- 
    detail?st13=GB500000003342197" width="100%" height="600px;" 
    frameborder="0"></iframe></div></div></div> <!-- resultTabs -->

如您所见,HTML中有一个链接。我尝试打开一个新的webdriver实例来访问该链接并获取数据,这种方法起初可以工作,但后来网站停止了响应,因为不允许直接访问这些链接。

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
import traceback
import time

# Chrome preferences attached to the browser options. Both keys set
# "images" to 2 (presumably this blocks image loading - confirm against
# the Chrome preference documentation).
chrome_prefs = {
    "profile.default_content_settings": {"images": 2},
    "profile.managed_default_content_settings": {"images": 2},
}
option = webdriver.ChromeOptions()
option.experimental_options["prefs"] = chrome_prefs

# Start page of the site, output workbook path, and the list that
# accumulates one tuple per scraped row.
url = "https://www.tmdn.org/tmview/welcome#"
xlsName = 'D:\\test.xlsx'
records = []

start_time = time.time()

# Raw string: the Windows path contains "\P" and "\c", which are invalid
# escape sequences in a normal string literal and trigger a warning.
driver = webdriver.Chrome(
    executable_path=r"D:\Python\chromedriver.exe",
    chrome_options=option,
)
driver.get(url)
time.sleep(10)
# Click the link inside the element with id "buttonBox" on the start page.
driver.find_element_by_xpath('//*[@id="buttonBox"]/a').click()
time.sleep(10)

# Column headers of the exported sheet; the order matches the tuples
# appended to `records` below.
columns = [
    'Designated_territory', 'Applicant_name', 'Trade_mark_name',
    'Application_date', 'Application_number', 'Trade_mark_type',
    'Nice_class', 'Owner_Address', 'Trade_mark_office',
    'Registration_number', 'Trade_mark_status', 'Registration_date',
    'Representative_Name',
]

# Number of result pages walked by the loop below (hard-coded).
page_count = 72

try:
    # Open the advanced search form.
    driver.find_element_by_name("lnkAdvancedSearch").click()
    time.sleep(5)

    # Designated territories: click the control, tick "United Kingdom",
    # then click the control again (presumably to close the dropdown).
    driver.find_element_by_id('DesignatedTerritories').click()
    time.sleep(5)
    for elem in driver.find_elements_by_css_selector('div.optEUGroupContainer label'):
        if elem.text == 'United Kingdom':
            elem.click()
    time.sleep(5)
    driver.find_element_by_id('DesignatedTerritories').click()
    time.sleep(5)

    # Trade mark offices: same click / tick / click sequence.
    driver.find_element_by_id('SelectedOffices').click()
    time.sleep(5)
    for elem in driver.find_elements_by_css_selector('div.multiSelectOptions label'):
        if elem.text == 'GB United Kingdom ( UKIPO )':
            elem.click()
    time.sleep(5)
    driver.find_element_by_id('SelectedOffices').click()

    # Trade mark status: tick both "Filed" and "Registered".
    driver.find_element_by_id('TradeMarkStatus').click()
    time.sleep(5)
    for elem in driver.find_elements_by_css_selector('div.multiSelectOptions label'):
        if elem.text in ('Filed', 'Registered'):
            elem.click()
    time.sleep(5)
    driver.find_element_by_id('TradeMarkStatus').click()

    # Application date range.
    startdate = driver.find_element_by_id("ApplicationDateFrom")
    startdate.clear()
    startdate.send_keys('01-10-2018')
    enddate = driver.find_element_by_id("ApplicationDateTo")
    enddate.clear()
    enddate.send_keys('31-10-2018')

    # Run the search.
    time.sleep(5)
    driver.find_element_by_id("SearchCopy").click()
    time.sleep(5)

    # Click the link labelled "100" (presumably the rows-per-page choice).
    driver.find_element_by_link_text('100').click()
    time.sleep(5)

    for _ in range(page_count):
        # Parse the current result page and walk the rows of table#grid.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        tbl = soup.find("table", id="grid")

        # The first <tr> is skipped; `row_index` restarts at 0 on every
        # page and selects the matching clickable name cell.
        for row_index, tr_row in enumerate(tbl.find_all('tr')[1:]):
            td_cells = tr_row.find_all('td')

            Trade_mark_name = td_cells[4].text
            Trade_mark_office = td_cells[5].text
            Designated_territory = td_cells[6].text
            Application_number = td_cells[7].text
            Registration_number = td_cells[8].text
            Trade_mark_status = td_cells[9].text
            Nice_class = td_cells[10].text
            Applicant_name = td_cells[11].text
            Application_date = td_cells[12].text
            Trade_mark_type = td_cells[13].text
            Registration_date = td_cells[14].text

            # Click the trade mark name cell of this row to open its
            # details tab.
            el = driver.find_elements_by_class_name('cell_tmName_column')[row_index]
            action = webdriver.common.action_chains.ActionChains(driver)
            action.move_to_element_with_offset(el, 0, 0)
            action.click()
            action.perform()
            time.sleep(3)

            # Enter the first iframe on the page, grab its HTML, and
            # return to the top-level document before parsing.
            iframe = driver.find_elements_by_tag_name('iframe')[0]
            driver.switch_to.frame(iframe)
            detail_html = driver.page_source
            driver.switch_to.default_content()

            soup2 = BeautifulSoup(detail_html, 'html.parser')
            tblOwner = soup2.find("div", id="anchorOwner").find_next('table')
            tblRep = soup2.find("div", id="anchorRepresentative").find_next('table')

            # A missing table or cell makes one of the lookups return
            # None, so the chained call raises AttributeError.
            try:
                # Store the cell text (not the Tag object), like
                # Representative_Name below.
                Owner_Address = tblOwner.find("td", text="Address").find_next('td').text.strip()
            except AttributeError:
                Owner_Address = 'No Entry'

            try:
                Representative_Name = tblRep.find("td", text="Name").find_next('td').text.strip()
            except AttributeError:
                Representative_Name = 'No Entry'

            records.append((
                Designated_territory, Applicant_name, Trade_mark_name,
                Application_date, Application_number, Trade_mark_type,
                Nice_class, Owner_Address, Trade_mark_office,
                Registration_number, Trade_mark_status, Registration_date,
                Representative_Name,
            ))
            time.sleep(1)

            # Close the details tab before moving on to the next row.
            driver.find_elements_by_css_selector('a.close_tab')[0].click()

        # Go to the next result page.
        driver.find_element_by_id('next_t_grid_toppager').click()
        time.sleep(2)

except Exception:
    traceback.print_exc()

finally:
    # Always export whatever was collected, even after a failure, and
    # always shut the browser down afterwards.
    try:
        df = pd.DataFrame(records, columns=columns)
        # `encoding` is not passed: pandas 2.0 removed that parameter
        # from to_excel.
        df.to_excel(xlsName, sheet_name='sheet1', index=False)
    finally:
        time.sleep(5)
        driver.quit()

1 个答案:

答案 0 :(得分:1)

您需要做的是switch_to.frame

# Locate the details iframe by its exact id and make it the active frame.
frame_xpath = '//iframe[@id="iframe_0"]'
iframe = driver.find_element_by_xpath(frame_xpath)
driver.switch_to.frame(iframe)
# Then switch back to the top-level document:
driver.switch_to.default_content()

编辑:

您问到如果id发生变化该怎么办。这里有一个思路:您可以像下面这样在xpath中使用contains

# The XPath below matches an iframe whose id contains "iframe_".
# Check that only one such iframe exists by counting the matches first:
#   iframes = driver.find_elements_by_xpath('//iframe[contains(@id,"iframe_")]')
#   print(len(iframes))
frame_xpath = '//iframe[contains(@id,"iframe_")]'
iframe = driver.find_element_by_xpath(frame_xpath)
driver.switch_to.frame(iframe)
# Then switch back to the top-level document:
driver.switch_to.default_content()

在您的代码中使用:

import time
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select

url = "https://www.tmdn.org/tmview/welcome#"

driver = webdriver.Chrome(executable_path=r"D:\New Proj\chromedriver.exe")
try:
    driver.get(url)
    time.sleep(3)
    # Click the link inside the element with id "buttonBox" on the start page.
    driver.find_element_by_xpath('//*[@id="buttonBox"]/a').click()
    time.sleep(3)

    # Open the advanced search form.
    driver.find_element_by_name("lnkAdvancedSearch").click()
    time.sleep(5)

    # Designated territories: click the control, tick "United Kingdom",
    # then click the control again (presumably to close the dropdown).
    driver.find_element_by_id('DesignatedTerritories').click()
    time.sleep(5)
    for elem in driver.find_elements_by_css_selector('div.optEUGroupContainer label'):
        if elem.text == 'United Kingdom':
            elem.click()
    time.sleep(5)
    driver.find_element_by_id('DesignatedTerritories').click()
    time.sleep(5)

    # Trade mark offices: same click / tick / click sequence.
    driver.find_element_by_id('SelectedOffices').click()
    time.sleep(5)
    for elem in driver.find_elements_by_css_selector('div.multiSelectOptions label'):
        if elem.text == 'GB United Kingdom ( UKIPO )':
            elem.click()
    time.sleep(5)
    driver.find_element_by_id('SelectedOffices').click()

    # Trade mark status: tick both "Filed" and "Registered".
    driver.find_element_by_id('TradeMarkStatus').click()
    time.sleep(5)
    for elem in driver.find_elements_by_css_selector('div.multiSelectOptions label'):
        if elem.text in ('Filed', 'Registered'):
            elem.click()
    time.sleep(5)
    driver.find_element_by_id('TradeMarkStatus').click()

    # Application date range.
    startdate = driver.find_element_by_id("ApplicationDateFrom")
    startdate.clear()
    startdate.send_keys('10-01-2018')
    enddate = driver.find_element_by_id("ApplicationDateTo")
    enddate.clear()
    enddate.send_keys('10-01-2018')

    # Run the search and give the result grid time to load.
    time.sleep(5)
    driver.find_element_by_id("SearchCopy").click()
    time.sleep(30)

    # Click the first trade mark name cell to open its details tab.
    el = driver.find_elements_by_class_name('cell_tmName_column')[0]
    action = ActionChains(driver)
    action.move_to_element_with_offset(el, 0, 0)
    action.click()
    action.perform()
    time.sleep(10)

    iframe = driver.find_element_by_xpath('//iframe[@id="iframe_0"]')
    driver.switch_to.frame(iframe)
    # Work inside the frame here; as an example, print the frame's HTML.
    # The `iframe` element belongs to the parent document, so it is not
    # queried after the switch; the driver's page source is read instead.
    print(driver.page_source)
    # Then switch back to the top-level document:
    driver.switch_to.default_content()
finally:
    # Always close the browser and stop the chromedriver process.
    driver.quit()

希望这对您有帮助!