如何使用python从在线pdf文件中获取数据?

时间:2019-05-15 03:25:04

标签: python pdf

我需要从一个在线 pdf 文件中提取金额数值。我要的是 20.707,00 雷亚尔,也就是“serviço”(服务)一项的总额。

我正在使用 Selenium 来获取此 PDF:

import os
import time
import xlrd  # importando a biblioteca
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import zipfile

# Script: log in to the Directa portal, open the NFe query screen, filter by
# company / period, run the query and request the printable report.

# Wall-clock start, used to report the total run time at the end.
inicio = time.time()

# Query period typed into the date inputs (digits only; looks like DDMMYYYY).
datainicial = "01042019"
datafinal = '30042019'

# NOTE(review): credentials are hard-coded in the source; consider loading
# them from environment variables or a secrets store instead.
cnpj = '13177807000146'
senha = 'qualita@2018'

# Number of companies whose report was requested; printed in the summary.
# (The original summary formatted an undefined name, raising NameError.)
empresas_baixadas = 0

driver = webdriver.Chrome()
# One reusable explicit wait; the 10 s timeout applies to each .until() call.
wait = WebDriverWait(driver, 10)

driver.get("https://directa.natal.rn.gov.br/")

# Logging in: the form is reached through the "mainsystem" frame.
# find_element(By.NAME, ...) works on Selenium 3 and 4, unlike the removed
# find_element_by_name helper.
driver.switch_to.frame(driver.find_element(By.NAME, "mainsystem"))
wait.until(EC.element_to_be_clickable((By.ID, "usuario"))).send_keys(cnpj)
wait.until(EC.element_to_be_clickable((By.ID, "senha"))).send_keys(str(senha))
time.sleep(2)
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "button.btn#acessar"))).click()

# "Nota natalense" menu
wait.until(EC.frame_to_be_available_and_switch_to_it((By.NAME, "mainsystem")))
wait.until(EC.element_to_be_clickable((By.ID, 'limenu9'))).click()
time.sleep(1)
# Query submenu
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#limenu9 > div > a:nth-child(1)"))).click()
# NFe query
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#formsmenu12 > li:nth-child(4) > a'))).click()

# Switching frames: descend two levels, taking the first frame each time.
time.sleep(1)
driver.switch_to.frame(0)
driver.switch_to.frame(0)

# The XPath expressions below use plain [ ] predicates; the backslash-escaped
# brackets in the original were not valid XPath.

# Selecting the company
wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="lay"]/div[2]/div[2]/div[7]/div[2]/div/div/table/tbody/tr/td'))).click()
wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="lookupInput"]/option[2]'))).click()
# Ticking "services rendered"
wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="HTMLGroupBox628126"]/table/tbody/tr[2]/td/table/tbody/tr/td[1]/a/img'))).click()
# Ticking "ISS withholding"
wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="HTMLGroupBox628123"]/table/tbody/tr[1]/td/table/tbody/tr/td[1]/a/img'))).click()
# Filling in the start date
driver.find_element(By.NAME, 'WFRInput628128').send_keys(datainicial)
# Filling in the end date
driver.find_element(By.NAME, 'WFRInput628127').send_keys(datafinal)
# Run the query
wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="lay"]/div[2]/div[2]/div[21]/div/table/tbody/tr/td'))).click()
# Print the report
wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="lay"]/div[2]/div[3]/div[5]/div/table/tbody/tr/td'))).click()
# One company's report has been requested at this point.
empresas_baixadas += 1

fim = time.time()
duracao = fim - inicio
print('O programa rodou em: {} e foram baixadas {} empresas'.format(duracao, empresas_baixadas))

enter image description here

我可以用什么方法从此 PDF 文件中只提取这一项数据?

1 个答案:

答案 0 :(得分:0)

我会走更简单的路线:

使用 requests 之类的库下载 pdf 文件,然后用 tika 读取 pdf 文件中的文本,再用正则表达式或其他方法搜索所需的内容。

# user.py

class Users:
    """Holds the application window and opens the user login screen on it."""

    def __init__(self, window):
        """Store *window* on ``self.wind`` and set its title.

        Args:
            window: Object exposing a ``title(str)`` method (presumably a
                Tk root/toplevel -- TODO confirm against the caller).
        """
        # Creating the system
        self.wind = window
        window.title("System F2T")

    def createUser(self):
        """Build and return ``loginUser`` bound to the stored window."""
        # Imported at call time rather than module load
        # (presumably to avoid a circular import with ``app`` -- verify).
        from app import loginUser

        return loginUser(self.wind)