Node.js:无法抓取公开的 Tableau 仪表板?

时间:2020-07-11 11:39:28

标签: node.js web-scraping cheerio tableau-api

我按照该问题(question)中的步骤操作,并将其改写成了 Node.js 代码:

...
app.use('/', async (request, response) => {
    const fetchedSite = await fetch('https://public.tableau.com/views/COVID-19CasesandDeathsinthePhilippines_15866705872710/Home?%3Aembed=y&%3AshowVizHome=no&%3Adisplay_count=y&%3Adisplay_static_image=y&%3AbootstrapWhenNotified=true&%3Alanguage=en&:embed=y&:showVizHome=n&:apiID=host0#navType=0&navSrc=Parse')
    const siteText = await fetchedSite.text()
    const $ = cheerio.load(siteText)
    const tsConfigJson = JSON.parse($('#tsConfigContainer').text())

    const body = {
        sheet_id: tsConfigJson.sheetId
    }

    const getTableauData = await fetch(`https://public.tableau.com${tsConfigJson.vizql_root}/bootstrapSession/sessions/${tsConfigJson.sessionid}`, {
        method: 'POST',
        body: JSON.stringify(body)
    })

    return response.status(200).send(getTableauData)
...

我得到的唯一响应是:

{"size":0,"timeout":0}

状态:500

statusText:内部服务器错误

我是不是漏掉了什么?

1 个答案:

答案 0 :(得分:1)

问题在于您发送的是 JSON,而请求体必须是表单数据(form data):

// The request body has to be form data rather than a JSON string, so the
// sheet id is carried in a URLSearchParams instance.
const body = new URLSearchParams();
body.append('sheet_id', tsConfigJson.sheetId);

// Session endpoint assembled from the values found in the page's tsConfig JSON.
const sessionUrl = `https://public.tableau.com${tsConfigJson.vizql_root}/bootstrapSession/sessions/${tsConfigJson.sessionid}`;
const tableauData = await fetch(sessionUrl, { method: 'POST', body });

获取数据的完整代码:

const fetch = require('node-fetch');
const cheerio = require('cheerio');

// Address of the public Tableau view. The trailing "?" is intentional: the
// serialized query parameters are appended directly to this string.
const url = 'https://public.tableau.com/views/COVID-19CasesandDeathsinthePhilippines_15866705872710/Home?';

// Query parameters sent with the page request.
// Each key is listed exactly once. The earlier version repeated ":embed" and
// ":showVizHome"; in an object literal the last duplicate silently wins, so
// only ":showVizHome" = "n" was ever sent. That effective value is kept here.
// All values are strings because URLSearchParams stringifies them anyway.
const params = new URLSearchParams({
    ":embed": "y",
    ":showVizHome": "n",
    ":display_count": "y",
    ":display_static_image": "y",
    ":bootstrapWhenNotified": "true",
    ":language": "en",
    ":apiID": "host0",
});

/**
 * Splits the text of a bootstrapSession response into its two JSON documents.
 *
 * The text is matched against `<digits>;{...}<digits>;{...}`; the first
 * brace-delimited chunk is returned as `info`, the second as `data`.
 *
 * @param {string} payload - Raw response text.
 * @returns {{info: object, data: object}} The two parsed JSON documents.
 * @throws {Error} If the text does not contain two such chunks.
 * @throws {SyntaxError} If a chunk is not valid JSON.
 */
function parseBootstrapResponse(payload) {
    // No `g` flag: the pattern is applied exactly once, so there is no
    // lastIndex state to carry around.
    const match = /\d+;({.*})\d+;({.*})/.exec(payload);
    if (match === null) {
        throw new Error('Unexpected bootstrapSession response format');
    }
    return { info: JSON.parse(match[1]), data: JSON.parse(match[2]) };
}

/**
 * Loads the dashboard page, reads the session settings from the
 * `#tsConfigContainer` element, posts the sheet id as form data to the
 * bootstrapSession endpoint and logs the data columns of segment "0".
 *
 * Any failure is logged and turns into a non-zero exit code.
 */
(async () => {
    const site = await fetch(url + params);
    if (!site.ok) {
        throw new Error(`Page request failed: ${site.status} ${site.statusText}`);
    }

    const $ = cheerio.load(await site.text());
    const tsConfigText = $('#tsConfigContainer').text();
    if (tsConfigText.trim() === '') {
        // Fail with a clear message instead of a bare JSON.parse SyntaxError.
        throw new Error('#tsConfigContainer is missing or empty in the fetched page');
    }
    const tsConfigJson = JSON.parse(tsConfigText);

    // The endpoint must receive form data, not JSON.
    const body = new URLSearchParams();
    body.append('sheet_id', tsConfigJson.sheetId);

    const tableauData = await fetch(`https://public.tableau.com${tsConfigJson.vizql_root}/bootstrapSession/sessions/${tsConfigJson.sessionid}`, {
        method: 'POST',
        body,
    });
    if (!tableauData.ok) {
        throw new Error(`bootstrapSession request failed: ${tableauData.status} ${tableauData.statusText}`);
    }

    const { data } = parseBootstrapResponse(await tableauData.text());
    console.log(data.secondaryInfo.presModelMap.dataDictionary.presModelHolder.genDataDictionaryPresModel.dataSegments["0"].dataColumns);
})().catch((err) => {
    // Surface the failure explicitly rather than leaving a floating promise.
    console.error(err);
    process.exitCode = 1;
});

Try this on repl.it