我正在尝试从PDF页面中获取文本:我使用 XPath、Selenium IDE 和 Python 逐个打开PDF页面的链接,但得到的是空数据;有时能得到PDF其中一页的内容,但格式不符合预期。
如何从pdf链接的所有页面获取文字?
以下是我的代码:
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Walk from the site's home page to the article list, open every linked
# article (PDF) and print the text Selenium reports for each page container.
#
# Notes on form:
# * print(...) is always called with a single argument, so the script produces
#   the same output under the Python 2 print statement semantics and under
#   the Python 3 print function.
# * find_elements(By.XPATH, ...) is used instead of find_elements_by_xpath(),
#   which was removed in Selenium 4; the By-based call works on old and new
#   Selenium releases alike.
url = "http://www.incredibleindia.org"
driver = webdriver.Firefox()
driver.get(url)

# Wait for the menu to be loaded before querying its links.
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, "div.menu li > a")))

# "Articles" entry under the media tab (absolute XPath, so it depends on the
# exact page layout).
article_link = [
    a.get_attribute('href')
    for a in driver.find_elements(
        By.XPATH, "html/body/div[3]/div/div[1]/div[2]/ul/li[3]/ul/li[6]/a")
]

for link in article_link:
    print(link)
    driver.get(link)

    # Check that the article list container is present on the article page.
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, "div.article-full-div")))
    except TimeoutException:
        # NOTE: execution falls through after a timeout, so the lookups below
        # still run against whatever page was loaded.
        print("%s %s" % (driver.title, "No news links under media tab"))

    # Links to the individual articles listed on the article page.
    article_sub_links = [
        a.get_attribute('href')
        for a in driver.find_elements(
            By.XPATH, ".//*[@id='article-content']/div/div[2]/ul/li/a")
    ]
    print("article sub links")

    for sub_link in article_sub_links:
        print(sub_link)
        driver.get(sub_link)

        # Wait until at least one text layer exists in the DOM.
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "div.textLayer")))
        except TimeoutException:
            # NOTE: no `continue` here, so an error page (e.g. a 404) is
            # still scanned below and yields an empty list.
            print("%s %s" % (driver.title, "No news links under media tab"))

        # Text of every element whose id contains "pageContainer"
        # (presumably the viewer's per-page wrappers -- confirm against the
        # rendered page).
        content = [
            a.text
            for a in driver.find_elements(
                By.XPATH, ".//*[contains(@id,'pageContainer')]")
        ]
        print(content)
        for data in content:
            print(data)
输出:
http://www.incredibleindia.org/en/media-black-2/articles
article sub links
http://www.incredibleindia.org/images/articles/Ajanta.pdf
[u'', u'', u'']
http://www.incredibleindia.org/images/articles/Bedhaghat.pdf
404 - Error: 404 No news links under media tab
[]
http://www.incredibleindia.org/images/articles/Bellur.pdf
[u'', u'', u'']
http://www.incredibleindia.org/images/articles/Bidar.pdf
[u'', u'', u'']
http://www.incredibleindia.org/images/articles/Braj.pdf
[u'', u'', u'', u'']
http://www.incredibleindia.org/images/articles/Carnival.pdf
[u'', u'', u'']
答案 0 :(得分:1)
我认为你需要再深入一层,定位到 "textLayer"(每个页面容器内 class="textLayer" 的 div 元素)。另外,你还需要在异常处理块中使用 continue:
# Visit every article link and print the text of each page's text layer.
# Expects `driver` and `article_sub_links` to have been set up beforehand.
for link in article_sub_links:
    driver.get(link)

    # Wait for a text layer to become visible; if none shows up within
    # 10 seconds, report the page title and skip to the next link instead of
    # scanning a page that has no content.
    try:
        WebDriverWait(driver, 10).until(
            EC.visibility_of_element_located(
                (By.CSS_SELECTOR, "div.textLayer")))
    except TimeoutException:
        # Single-argument print(...) gives the same output on Python 2 and 3.
        print("%s %s" % (driver.title, "Empty content"))
        continue

    # Go one level below each page container (id starting with
    # "pageContainer") to the div.textLayer element and read its text.
    # find_elements(By.CSS_SELECTOR, ...) replaces
    # find_elements_by_css_selector(), which was removed in Selenium 4.
    content = [
        a.text
        for a in driver.find_elements(
            By.CSS_SELECTOR, "div[id^=pageContainer] div.textLayer")
    ]
    for data in content:
        print(data)