我正在编写代码来处理一个由 JavaScript 渲染的网页，并想用 BeautifulSoup 从网站上抓取数据。问题是我发现页面源码中没有任何表格（table 标签），所以想知道该如何从这个网站上抓取数据。
有什么提示吗?
这是我的课程作业。我原以为页面上有表格，于是尝试抓取数据，但失败了。
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import re
import pandas as pd
from tabulate import tabulate
import os
# Target page: STJ (Brazilian Superior Court of Justice) case-law search for
# Lei 8.429/1992 (administrative improbity).
url = "https://scon.stj.jus.br/SCON/legaplic/toc.jsp?materia=%27Lei+8.429%2F1992+%28Lei+DE+IMPROBIDADE+ADMINISTRATIVA%29%27.mat.&b=TEMA&p=true&t=&l=1&i=18&ordem=MAT,@NUM"

driver = webdriver.Firefox()
driver.implicitly_wait(30)  # page content is JS-rendered; wait up to 30s for elements
driver.get(url)

# Click the topic link, then the results link, switching to the newest
# browser window after each click.
python_button = driver.find_element_by_xpath('/html/body/div[2]/div[6]/div/div/div[3]/div[2]/div/div/div/div[16]/a')
python_button.click()
driver.switch_to.window(driver.window_handles[-1])
python_button = driver.find_element_by_xpath('/html/body/div[2]/div[6]/div[1]/div/div[3]/div[2]/div/div/div/div[3]/div[2]/span[2]/a')
python_button.click()
driver.switch_to.window(driver.window_handles[-1])

# Parse the rendered HTML and collect every <table> into a DataFrame.
# NOTE(review): the results page apparently contains no <table> elements,
# which is why pd.read_html raises "no tables found" — the data lives in
# <div class="paragrafoBRS"> blocks instead; see the tag-based approach below.
pagina_de_resultados = BeautifulSoup(driver.page_source, 'lxml')
table = pagina_de_resultados.find_all('table')

# BUG FIX: `datalist` was never initialized before .append() (NameError),
# and a stray `x += 1` referenced an undefined counter — both fixed here.
datalist = []
df = pd.read_html(str(table), header=0)
datalist.append(df[0])
driver.quit()

# Elements of datalist are already DataFrames; concat them directly.
result = pd.concat(datalist, ignore_index=True)
json_records = result.to_json(orient='records')
print(tabulate(result, headers=["Processo", "Relator(a)", "Órgão Julgador", "Data do Julgamento", "Data da Publicação/Fonte", "Ementa", "Notas", "Informações Complementares à Ementa (ICE)", "Referência Legislativa", "Jurisprudência Citada", "Acórdãos Similares"], tablefmt='psql'))

# Write the JSON next to the working directory; os.path.join is portable
# (the original hard-coded Windows "\\" separators) and `with` guarantees
# the file handle is closed even on error.
path = os.getcwd()
with open(os.path.join(path, "fhsu_payroll_data.json"), "w") as f:
    f.write(json_records)
我希望输出的是包含判决信息的表格，但却得到了一个错误：在页面中找不到任何表格。
答案 0（得分：0）
您必须定位与所需数据相关联的 HTML 标签，再把解析结果存入 DataFrame。以下代码处理第一页/链接：
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import re
import pandas as pd
from tabulate import tabulate
import os
# Target page: STJ case-law search for Lei 8.429/1992 (administrative improbity).
url = "https://scon.stj.jus.br/SCON/legaplic/toc.jsp?materia=%27Lei+8.429%2F1992+%28Lei+DE+IMPROBIDADE+ADMINISTRATIVA%29%27.mat.&b=TEMA&p=true&t=&l=1&i=18&ordem=MAT,@NUM"

driver = webdriver.Chrome()
driver.implicitly_wait(30)  # page content is JS-rendered; wait up to 30s for elements
driver.get(url)

# Click the topic link, then the results link, switching to the newest
# browser window after each click.
python_button = driver.find_element_by_xpath('/html/body/div[2]/div[6]/div/div/div[3]/div[2]/div/div/div/div[16]/a')
python_button.click()
driver.switch_to.window(driver.window_handles[-1])
python_button = driver.find_element_by_xpath('/html/body/div[2]/div[6]/div[1]/div/div[3]/div[2]/div/div/div/div[3]/div[2]/span[2]/a')
python_button.click()
driver.switch_to.window(driver.window_handles[-1])

# The data is not in <table> tags: each record is a sequence of
# <div class="paragrafoBRS"> blocks inside <div id="listadocumentos">,
# each holding an <h4 class="docTitulo"> label and a docTexto body.
pagina_de_resultados = BeautifulSoup(driver.page_source, 'lxml')
parse = pagina_de_resultados.find('div', {'id': 'listadocumentos'})
paragrafoBRS = parse.find_all('div', {'class': 'paragrafoBRS'})

results = pd.DataFrame()
for each in paragrafoBRS:
    # Hoisted the h4 lookup (was done twice) and guard against blocks
    # without a title header, which would raise AttributeError on .text.
    titulo = each.find('h4', {'class': 'docTitulo'})
    if titulo is None:
        continue
    # A 'Processo' label marks the start of a new record: reset the
    # accumulated column names and values.
    if titulo.text.strip() == 'Processo':
        header = []
        content = []
    header.append(titulo.text.strip())
    content.append(each.find(['div', 'pre'], {'class': 'docTexto'}).text.strip())
    df = pd.DataFrame([content], columns=header)
    # BUG FIX: DataFrame.append was deprecated in pandas 1.4 and removed
    # in 2.0; pd.concat with ignore_index reproduces append+reset_index.
    results = pd.concat([results, df], ignore_index=True)