How to print the link of the opened PDF using Selenium in Python?

Time: 2019-06-03 06:22:53

Tags: python selenium web-scraping beautifulsoup selenium-chromedriver

I am unable to print the link of the final PDF that opens after running the code below:

from selenium import webdriver
from selenium.webdriver.support import ui
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException 

def page_is_loaded(driver):
    return driver.find_element_by_tag_name("body") is not None


def check_exists_by_text(text):
    try:
        driver.find_element_by_link_text(text)
    except NoSuchElementException:
        return False
    return True

driver = webdriver.Chrome("C:/Users/Roshan/Desktop/sbi/chromedriver")
driver.maximize_window()
driver.get("http://www.careratings.com/brief-rationale.aspx")

wait = ui.WebDriverWait(driver,10)
wait.until(page_is_loaded)

location_field = driver.find_element_by_name("txtfromdate")
location_field.send_keys("2019-05-06")

last_date = driver.find_element_by_name("txttodate")
last_date.send_keys("2019-05-21")

driver.find_element_by_xpath("//input[@name='btn_submit']").click()

if check_exists_by_text('Reliance Capital Limited'):
    elm = driver.find_element_by_link_text('Reliance Capital Limited')
    driver.implicitly_wait(5)
    elm.click()
    driver.implicitly_wait(50)
    #time.sleep(5)
    #driver.quit()
else:
    print("Company is not rated in the given Date range")

I expect the actual output to be the link to this PDF:

http://www.careratings.com/upload/CompanyFiles/PR/Reliance%20Capital%20Ltd.-05-18-2019.pdf

But I don't know how to print this link.

3 Answers:

Answer 0 (score: 0):

You need to find all the link elements in the results table and then extract the data from them.

from selenium import webdriver
import os

# setup path to chrome driver
chrome_driver = os.getcwd() + '/chromedriver'
# initialise chrome driver
browser = webdriver.Chrome(chrome_driver)
# load url
browser.get('http://www.careratings.com/brief-rationale.aspx')

# setup date range
location_field = browser.find_element_by_name("txtfromdate")
location_field.send_keys("2019-05-06")
last_date = browser.find_element_by_name("txttodate")
last_date.send_keys("2019-05-21")
browser.find_element_by_xpath("//input[@name='btn_submit']").click()

# get all data rows
content = browser.find_elements_by_xpath('//*[@id="divManagementSpeak"]/table/tbody/tr/td/a')

# get text and href link from each element
collected_data = []
for item in content:
    url = item.get_attribute("href")
    description = item.get_attribute("innerText")
    collected_data.append((url, description))

Output:

('http://www.careratings.com/upload/CompanyFiles/PR/Ashwini%20Frozen%20Foods-05-21-2019.pdf', 'Ashwini Frozen Foods')
('http://www.careratings.com/upload/CompanyFiles/PR/Vanita%20Cold%20Storage-05-21-2019.pdf', 'Vanita Cold Storage') 

...and so on.
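
If you only need the link for one specific company, such as the one in the question, you can filter the collected pairs. This is a minimal sketch, assuming the link text matches the company name exactly:

    # filter the collected (url, description) pairs for one company
    target = 'Reliance Capital Limited'
    matches = [url for url, description in collected_data
               if description.strip() == target]

    if matches:
        print(matches[0])  # link to the company's PDF
    else:
        print("Company is not rated in the given date range")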

Answer 1 (score: 0):

I would say you just need to add this line:

pdf_link = elm.get_attribute("href")
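
In the context of the question's code, this goes right after `elm` is located. A sketch based on the snippet above, assuming the anchor element exposes its `href` before it is clicked:

    if check_exists_by_text('Reliance Capital Limited'):
        elm = driver.find_element_by_link_text('Reliance Capital Limited')
        pdf_link = elm.get_attribute("href")
        print(pdf_link)
        elm.click()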

Answer 2 (score: 0):

You missed an important click. When you type some text into the company search box, a dropdown appears below it showing matching search results to choose from; you have to click the relevant suggestion. After that click, the rest stays the same.


Try the following script:

import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

url = "http://www.careratings.com/brief-rationale.aspx"

with webdriver.Chrome() as driver:
    driver.get(url)
    wait = WebDriverWait(driver,10)

    location_field = wait.until(EC.presence_of_element_located((By.NAME, "txtfromdate")))
    location_field.send_keys("2019-05-06")

    last_date = wait.until(EC.presence_of_element_located((By.NAME, "txttodate")))
    last_date.send_keys("2019-05-21")

    input_search = wait.until(EC.presence_of_element_located((By.NAME, "txtSearchCompany_brief")))
    input_search.send_keys('Reliance Capital Limited')

    time.sleep(3) #could not get rid of this hardcoded delay to make the script work

    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,"[onclick*='Reliance Capital Limited']"))).click()

    # time.sleep(2) #activate this line in case the script behaves otherwise

    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,"input[name='btn_submit']"))).click()
    for item in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR,"table tr td > a[href$='.pdf']"))):
        print(item.get_attribute("href"))