我在尝试使用 puppeteer 和 tabletojson 将html表转换为json时遇到了一些问题。
我正在做的事情非常简单:puppeteer 启动 Chrome,创建一个新页面,然后输入一些单词并点击搜索按钮。puppeteer 完成这些操作后,屏幕上会出现一个表格。而 tabletojson 之所以一直向我返回空数组,是因为它并没有与 puppeteer 一起运行,而是自己重新访问同一个 URL,没有经过之前 puppeteer 的任何交互,因此那时页面上没有任何表格可读取。
我要问的是:有没有办法在 puppeteer 运行时将此表作为 JSON 获取?还是有某种方法可以在抓取时异步运行 tabletojson?
const puppteer = require('puppeteer')
const tabletojson = require('tabletojson')
// Launches a browser with `headless: false`, opens a new page, and then calls
// tabletojson.convertUrl on a URL, logging whatever tables it reports.
// NOTE: the lines "some puppeteer ...." and "some tabletojson ...." are
// placeholders left by the author; they are not runnable JavaScript.
async function letscrap()
{
some puppeteer ....
// `puppteer` (sic) is the binding name used for the puppeteer module in this snippet.
const browser = await puppteer.launch(
{
headless: false,
defaultViewport: null
}
)
const page = await browser.newPage()
// NOTE(review): `url` is declared here but never used in the visible code.
const url = "https://someurl.com/..."
some tabletojson ....
// The idea here is to get JSON from the tables...
// ...but at this point nothing is returned.
// convertUrl receives the URL string literal, not `page`, so nothing in this
// call refers to the page opened above (presumably tabletojson fetches the
// URL on its own — confirm against its documentation).
await tabletojson.convertUrl(
'https://someurl.com/...',
// `stripHtmlFromCells` is listed twice; in an object literal the last
// occurrence wins, so the effective value is `true`.
{stripHtmlFromCells: false, stripHtmlFromCells: true },
function(tablesAsJson) {
console.log(tablesAsJson);
}
);
}
letscrap()
HTML来源:
<form method="post" action="/ConsultaPlanosConsumidor/pages/home.xhtml;jsessionid=bSbNLIiZ7pWWf7mgDl_MIw-F9QPRpVbNlo8johjO.ansprjboss01a:consulta-planos-consumidor-01a" enctype="application/x-www-form-urlencoded">
<!--
Some inputs right here
and...
-->
<button id="formHome:tabOperadora:j_idt99" name="formHome:tabOperadora:j_idt99" class="ui-button ui-widget ui-state-default ui-corner-all ui-button-text-only" onclick="" type="submit" role="button" aria-disabled="false">
<span class="ui-button-text ui-c">Search</span>
</button>
<!--
When you hit "Search", a table is "appended" to its div.
-->
</form>
谢谢您的关注!
答案 0 :(得分:0)
我认为您抓取的表格不对。
我发现结果页中至少有5个表。
因此,您必须为包含数据的那张表找到正确的选择器。
正确的选择器是 table[role="grid"]。
而且,您可以不使用 tabletojson 就抓取搜索结果中的所有页面。我编写了此脚本,将搜索结果表数据写入 CSV 文件。
const puppeteer = require ('puppeteer')
const fs = require ('fs-extra')
// Element ids of the two select widgets the script drives; they are
// interpolated into `div[id="..."]` and `div[id="..._panel"]` selectors.
const selectElementID_first = 'formHome:tabOperadora:solbSituacoesPrincipais'
const selectElementID_second = 'formHome:tabOperadora:solbAbrangenciasGeograficas'
// Option labels to choose in the first and second select widget; each is
// compared against the innerText of the currently highlighted list item.
const selectedOption_first = 'Liberada'
const selectedOption_second = 'Nacional'
// Path of the CSV file that receives the header line and the scraped rows.
const saveFileCSV = 'porpiano.csv'
// Page label of the most recently scraped result page. Starts as the number 0
// and is later overwritten with a string taken from the paginator text.
let lastPage = 0
;(async () => {
  /**
   * Resolves after roughly `ms` milliseconds.
   * Used instead of `page.waitFor(ms)`, which newer Puppeteer releases no
   * longer provide, so the script does not depend on the Puppeteer version.
   */
  const sleep = ms => new Promise(resolve => setTimeout(resolve, ms))

  /**
   * Formats one value as a CSV field: quotes it when it contains a comma,
   * a double quote or a line break, doubling any embedded double quotes.
   * `null`/`undefined` (a cell missing from a row) become an empty field.
   */
  const toCsvCell = value => {
    const text = value == null ? '' : String(value)
    return /[",\r\n]/.test(text) ? `"${text.replace(/"/g, '""')}"` : text
  }

  /**
   * Picks `optionLabel` in the select widget whose container div has id
   * `elementId`, using the keyboard only: Tab until the widget is focused,
   * Space to open its panel, ArrowDown until the wanted item is highlighted,
   * then Enter.
   *
   * NOTE: none of the polling loops has an upper bound; if the widget never
   * gets focus or the label does not exist, this keeps polling.
   */
  const chooseSelectOption = async (page, elementId, optionLabel) => {
    const isFocused = () => page.evaluate(
      id => document.querySelector(`div[id="${id}"]`).classList.contains('ui-state-focus'),
      elementId
    )
    while (!await isFocused()) {
      await page.keyboard.press('Tab')
      await sleep(300)
    }
    await page.keyboard.press('Space')

    const isPanelHidden = () => page.evaluate(
      id => document.querySelector(`div[id="${id}_panel"]`).style.display === 'none',
      elementId
    )
    while (await isPanelHidden()) {
      await sleep(300)
    }

    // A panel without any highlighted item counts as "not matched yet"
    // instead of throwing on a null element.
    const isWantedOptionHighlighted = () => page.evaluate(
      (id, label) => {
        const highlighted = document.querySelector(`div[id="${id}_panel"] ul > li.ui-state-highlight`)
        return highlighted !== null && highlighted.innerText === label
      },
      elementId,
      optionLabel
    )
    while (!await isWantedOptionHighlighted()) {
      await page.keyboard.press('ArrowDown')
      await sleep(300)
    }
    await page.keyboard.press('Enter')
    // Fixed pause after confirming the option, as in the original script.
    await sleep(1000)
  }

  /** Returns the paginator text split on ' de ' as [current, total]. */
  const readPaginator = page => page.evaluate(
    () => document.querySelector('span.ui-paginator-current').innerText.split(' de ')
  )

  /**
   * Reads the grid cells of the current page and returns them as an array of
   * rows, each row holding the text of columns 1..7. Columns are collected
   * document-wide per column position and then zipped by row index.
   */
  const readGridRows = page => page.evaluate(columnCount => {
    const columns = []
    for (let position = 1; position <= columnCount; position += 1) {
      const cells = document.querySelectorAll(`td[role="gridcell"]:nth-of-type(${position})`)
      columns.push(Array.from(cells, cell => cell.innerText))
    }
    return columns[0].map((_, rowIndex) => columns.map(column => column[rowIndex]))
  }, 7)

  /**
   * Scrapes every result page in turn and appends its rows to `saveFileCSV`.
   * Iterates instead of recursing once per page. Updates the outer `lastPage`
   * so the next iteration can tell when the paginator has moved on.
   */
  const loadTableGrid = async page => {
    for (;;) {
      // WAIT FOR PAGE LOADING
      await page.waitForSelector('span.ui-paginator-current', { timeout: 0 })
      while (lastPage === (await readPaginator(page))[0]) {
        await sleep(250)
      }
      await page.waitForSelector('table[role="grid"]', { timeout: 0 })

      // SCRAPE DATA AND WRITE IT IN CSV FORMAT (one append per page)
      const rows = await readGridRows(page)
      if (rows.length > 0) {
        const csvChunk = rows.map(row => `${row.map(toCsvCell).join(',')}\n`).join('')
        await fs.appendFile(saveFileCSV, csvChunk)
      }

      const [currentPage, totalPages] = await readPaginator(page)
      lastPage = currentPage
      if (currentPage === totalPages) {
        return
      }
      await page.evaluate(() => document.querySelector('a[aria-label="Next Page"]').click())
    }
  }

  const browser = await puppeteer.launch({
    headless: false,
    devtools: false
  })
  try {
    const [page] = await browser.pages()
    await page.goto('http://www.ans.gov.br/ConsultaPlanosConsumidor/', { waitUntil: 'networkidle0', timeout: 0 })
    await page.evaluate(() => document.querySelector('a[href="#formHome:tabOperadora:panelPorPlano"]').click())

    await chooseSelectOption(page, selectElementID_first, selectedOption_first)
    await chooseSelectOption(page, selectElementID_second, selectedOption_second)

    await page.evaluate(() => document.querySelector('div[id="formHome:tabOperadora:panelPorPlano"] button[type="submit"]').click())
    await fs.writeFile(saveFileCSV, 'Número do Registro / Código do Plano, Nome Comercial do Plano, Segmentação Assistencial, Tipo de Contratação, Abrangência Geográfica, Tipo de Plano, Comercialização\n')

    await loadTableGrid(page)
    console.log('SCRAPE ALL TABLE DATA FINISHED\nCLOSING PUPPETEER BROWSER!')
  } finally {
    // Close the browser on failure as well, so no browser process is left behind.
    await browser.close()
  }
})().catch(error => {
  // Report the failure instead of leaving the promise rejection unhandled.
  console.error(error)
  process.exitCode = 1
})