运行puppeteer时是否有办法将表数据转换为json?

时间:2019-12-17 19:17:54

标签: javascript screen-scraping puppeteer

我在尝试使用 puppeteer tabletojson 将html表转换为json时遇到了一些问题。

我正在做的事情非常简单:puppeteer 打开 Chrome,创建一个新页面,然后输入一些关键词并点击搜索按钮。只有在 puppeteer 完成这些操作之后,屏幕上才会出现表格,这就是 tabletojson 一直返回空数组的原因:它并不是与 puppeteer 一起运行,而是自己重新访问同一个 URL,没有经过之前的任何 puppeteer 交互,因此页面上没有任何表格可读取。

我要问的是:有没有办法在 puppeteer 运行时将此表格作为 JSON 获取?或者有没有办法在抓取过程中异步运行 tabletojson?

const puppteer = require('puppeteer')
const tabletojson = require('tabletojson')

/**
 * Question snippet (intentionally incomplete): launches a browser with
 * puppeteer, opens a new page, and then asks tabletojson to convert the tables
 * found at a URL into JSON.
 *
 * The `some puppeteer ....` / `some tabletojson ....` lines are placeholders
 * left by the author for omitted code, so this snippet is not runnable as-is.
 */
async function letscrap()
{
some puppeteer ....
  // `puppteer` (sic) is the binding created by the require() at the top of the snippet.
  const browser = await puppteer.launch(
        {
            headless: false,
            defaultViewport: null
        }
    )
    
  const page = await browser.newPage()
  // NOTE(review): `url` is never used below; the same address is repeated as a literal.
  const url = "https://someurl.com/..."
  
 
some tabletojson ....

// The idea here is to get JSON from the tables...
  // ...but at this point nothing is returned: per the question text, the URL is
  // converted without any of the earlier puppeteer interaction, so the table
  // that only appears after the search is not there.
  
        // NOTE(review): `page` is not passed to this call, only the URL string.
        await tabletojson.convertUrl(
            'https://someurl.com/...',
            // NOTE(review): `stripHtmlFromCells` is given twice; in an object
            // literal the last occurrence wins, so the effective value is `true`.
            {stripHtmlFromCells: false, stripHtmlFromCells: true },
            function(tablesAsJson) {
              console.log(tablesAsJson);
            }
        );
        
}

letscrap()

HTML来源:

<form method="post" action="/ConsultaPlanosConsumidor/pages/home.xhtml;jsessionid=bSbNLIiZ7pWWf7mgDl_MIw-F9QPRpVbNlo8johjO.ansprjboss01a:consulta-planos-consumidor-01a" enctype="application/x-www-form-urlencoded">

<!-- 

Some inputs right here
and... 

-->
<button id="formHome:tabOperadora:j_idt99" name="formHome:tabOperadora:j_idt99" class="ui-button ui-widget ui-state-default ui-corner-all ui-button-text-only" onclick="" type="submit" role="button" aria-disabled="false">
<span class="ui-button-text ui-c">Search</span>
</button>


<!-- 

When you hit "Search" it "appends" to it's div with a table.

-->

</form>

谢谢您的关注!

1 个答案:

答案 0 :(得分:0)

我认为您抓取的表格不对。我发现结果页中至少有 5 个表格,因此您必须为包含数据的那个表格找到正确的选择器。正确的选择器是 table[role="grid"]。而且,您可以不使用 tabletojson 就抓取搜索结果中的所有页面。我编写了此脚本,将搜索结果表格数据写入 CSV 文件。

const puppeteer = require ('puppeteer')
const fs = require ('fs-extra')

// Element ids of the two drop-down widgets on the "panelPorPlano" tab and the
// option label that has to be chosen in each of them.
const selectElementID_first = 'formHome:tabOperadora:solbSituacoesPrincipais'
const selectElementID_second = 'formHome:tabOperadora:solbAbrangenciasGeograficas'

const selectedOption_first = 'Liberada'
const selectedOption_second = 'Nacional'

// Output file; it is (re)created at the start of every run.
const saveFileCSV = 'porpiano.csv'

// Number of columns read from every result row.
const columnCount = 7

// Pacing and upper bound for the keyboard-driven polling loops, so that a
// missing widget or option label fails with an error instead of looping forever.
const keyboardPollIntervalMs = 300
const maxKeyboardAttempts = 100

/**
 * Resolve after `ms` milliseconds.
 * Used instead of `page.waitFor`, which is not available in every puppeteer release.
 * @param {number} ms
 * @returns {Promise<void>}
 */
const sleep = ms => new Promise(resolve => setTimeout(resolve, ms))

/**
 * Render one value as a CSV field: quoted (with embedded quotes doubled) only
 * when it contains a comma, a double quote, or a line break.
 * @param {*} value
 * @returns {string}
 */
const toCsvField = value => {
    const text = value == null ? '' : String(value)
    return /[",\r\n]/.test(text) ? `"${text.replace(/"/g, '""')}"` : text
}

/**
 * Render an array of cell values as one CSV line (without the line terminator).
 * @param {Array<*>} cells
 * @returns {string}
 */
const toCsvLine = cells => cells.map(toCsvField).join(',')

/**
 * Poll `isDone` until it resolves to a truthy value, running `step` (if given)
 * and sleeping between attempts.
 * @param {() => Promise<boolean>} isDone - condition to wait for
 * @param {(() => Promise<*>)|null} step - action performed after each failed check
 * @param {string} description - used in the error message
 * @throws {Error} when the condition is still false after `maxKeyboardAttempts` checks
 */
const repeatUntil = async (isDone, step, description) => {
    for (let attempt = 0; attempt < maxKeyboardAttempts; attempt += 1) {
        if (await isDone()) {
            return
        }
        if (step) {
            await step()
        }
        await sleep(keyboardPollIntervalMs)
    }
    throw new Error(`Gave up after ${maxKeyboardAttempts} attempts waiting for ${description}`)
}

/**
 * Pick `optionLabel` in the drop-down widget `elementId` using the keyboard only:
 * Tab until the widget has focus, Space to open its panel, ArrowDown until the
 * highlighted item carries the wanted label, Enter to confirm.
 * @param {object} page - puppeteer page
 * @param {string} elementId - id of the widget's outer <div>
 * @param {string} optionLabel - exact innerText of the option to select
 */
const chooseDropdownOption = async (page, elementId, optionLabel) => {
    await repeatUntil(
        () => page.evaluate(id => {
            const widget = document.querySelector(`div[id="${id}"]`)
            return widget !== null && widget.classList.contains('ui-state-focus')
        }, elementId),
        () => page.keyboard.press('Tab'),
        `focus on "${elementId}"`
    )

    await page.keyboard.press('Space')

    await repeatUntil(
        () => page.evaluate(id => {
            const panel = document.querySelector(`div[id="${id}_panel"]`)
            return panel !== null && panel.style.display !== 'none'
        }, elementId),
        null,
        `the option panel of "${elementId}" to open`
    )

    await repeatUntil(
        () => page.evaluate((id, label) => {
            const highlighted = document.querySelector(`div[id="${id}_panel"] ul > li.ui-state-highlight`)
            return highlighted !== null && highlighted.innerText === label
        }, elementId, optionLabel),
        () => page.keyboard.press('ArrowDown'),
        `option "${optionLabel}" in "${elementId}"`
    )

    await page.keyboard.press('Enter')

    // Give the page a moment to react to the selection before moving on.
    await sleep(1000)
}

/**
 * Read the paginator text and split it on ' de '.
 * @param {object} page - puppeteer page
 * @returns {Promise<string[]>} [currentPageLabel, lastPageLabel]
 */
const readPaginator = page =>
    page.evaluate(() => document.querySelector('span.ui-paginator-current').innerText.split(' de '))

/**
 * Collect the text of the grid cells currently in the document, one array per
 * table row, keeping at most `columns` cells per row.
 * @param {object} page - puppeteer page
 * @param {number} columns
 * @returns {Promise<string[][]>}
 */
const scrapeGridRows = (page, columns) =>
    page.evaluate(maxCells => {
        const rows = []
        document.querySelectorAll('tr').forEach(tableRow => {
            // Only direct children, so cells of nested tables are not mixed into this row.
            const cells = Array.from(tableRow.children)
                .filter(child => child.matches('td[role="gridcell"]'))
            if (cells.length > 0) {
                rows.push(cells.slice(0, maxCells).map(cell => cell.innerText))
            }
        })
        return rows
    }, columns)

/**
 * Drive the search form, then walk through every result page and append its
 * rows to `saveFileCSV`. The browser is closed even when a step fails.
 */
const main = async () => {
    const browser = await puppeteer.launch({
        headless: false,
        devtools: false
    })

    try {
        const [page] = await browser.pages()

        // timeout: 0 disables the navigation timeout.
        await page.goto('http://www.ans.gov.br/ConsultaPlanosConsumidor/', { waitUntil: 'networkidle0', timeout: 0 })

        await page.evaluate(() => document.querySelector('a[href="#formHome:tabOperadora:panelPorPlano"]').click())

        await chooseDropdownOption(page, selectElementID_first, selectedOption_first)
        await chooseDropdownOption(page, selectElementID_second, selectedOption_second)

        await page.evaluate(() => document.querySelector('div[id="formHome:tabOperadora:panelPorPlano"] button[type="submit"]').click())

        await fs.writeFile(saveFileCSV, 'Número do Registro / Código do Plano, Nome Comercial do Plano, Segmentação Assistencial, Tipo de Contratação, Abrangência Geográfica, Tipo de Plano, Comercialização\n')

        // Label of the page scraped last; null until the first page has been read.
        let previousPageLabel = null

        for (;;) {
            // WAIT FOR PAGE LOADING
            await page.waitForSelector('span.ui-paginator-current', { timeout: 0 })

            // After "Next Page" was clicked the paginator keeps the old label
            // until the new page has arrived.
            while ((await readPaginator(page))[0] === previousPageLabel) {
                await sleep(250)
            }

            await page.waitForSelector('table[role="grid"]', { timeout: 0 })

            // SCRAPE DATA AND WRITE IT IN CSV FORMAT (one append per result page)
            const rows = await scrapeGridRows(page, columnCount)
            if (rows.length > 0) {
                await fs.appendFile(saveFileCSV, `${rows.map(toCsvLine).join('\n')}\n`)
            }

            const [currentPageLabel, lastPageLabel] = await readPaginator(page)
            previousPageLabel = currentPageLabel

            if (currentPageLabel === lastPageLabel) {
                break
            }

            await page.evaluate(() => document.querySelector('a[aria-label="Next Page"]').click())
        }

        console.log('SCRAPE ALL TABLE DATA FINISHED\nCLOSING PUPPETEER BROWSER!')
    } finally {
        await browser.close()
    }
}

main().catch(error => {
    console.error(error)
    process.exitCode = 1
})