我正在使用 Node.js 和 Puppeteer 开发一个爬虫,目标是获取表格中两列的数据(日期和描述)。代码在执行到从表格中提取数据的那一段之前都工作正常。
下面是完整代码,其中包含我要抓取的页面的网址:
const fs = require('fs');
const puppeteer = require('puppeteer');
// Page the crawler opens first (passed to page.goto).
const urlConsulta = "http://www.tre-pr.jus.br/";
// Value written into the 'numUnicoSequencial' form field.
const numeroProcessoSeq = "000000889";
// Value written into the 'numUnicoAno' form field.
const numeroProcessoAno = "2014";
// Value written into the 'numUnicoOrigem' form field.
const numeroProcessoDigito = "6160047";
/**
 * Pauses for a given time.
 * @param {number} ms - Delay in milliseconds, handed straight to setTimeout.
 * @returns {Promise<undefined>} Promise that resolves once the timer fires.
 */
var wait = function (ms) {
  return new Promise(function (resolve) {
    setTimeout(resolve, ms);
  });
};
// Entry point of the crawler: opens urlConsulta, selects the
// 'pesquisarNumUnico' search mode, fills the three process-number fields,
// submits the form, then works inside the iframe named 'ifr_servicos',
// extracts the rows with buscaFases() and writes the JSON string to teste.txt.
// Errors are logged; the browser is always closed, even when a step fails.
void (async () => {
  let browser;
  try {
    browser = await puppeteer.launch({
      headless: false
    });
    const page = await browser.newPage();
    await page.goto(urlConsulta);
    await page.select('#acao', 'pesquisarNumUnico');
    await page.evaluate((seq, ano, origem) => {
      document.getElementById('numUnicoSequencial').value = seq;
      document.getElementById('numUnicoAno').value = ano;
      document.getElementById('numUnicoOrigem').value = origem;
    }, numeroProcessoSeq, numeroProcessoAno, numeroProcessoDigito);

    // Register the navigation wait BEFORE triggering the submit. Waiting only
    // after the submit has resolved can miss a navigation that already finished
    // and then hang until the navigation timeout.
    await Promise.all([
      page.waitForNavigation(),
      page.$eval('form[action*="http://www.tre-pr.jus.br/@@processrequest"]', form => form.submit()),
    ]);

    const frame = page.frames().find(f => f.name() === 'ifr_servicos');
    if (!frame) {
      // Fail with a clear message instead of a TypeError on `frame.click`.
      throw new Error("Frame 'ifr_servicos' was not found after submitting the search form");
    }

    await frame.click('a[href*="ExibirDadosProcesso"]');
    // Fixed pause, presumably to let the frame content load before the next step.
    await wait(10000);
    await frame.click('[name*="todos"]');
    await frame.$eval('[name*="ExibirPartesProcessoZona"]', form => form.submit());
    await wait(10000);

    const string = await buscaFases(frame);
    fs.writeFile("teste.txt", string, function(err) {
      if(err) {
        return console.log(err);
      }
      console.log("The file was saved!");
    });
    console.log(string);
    await wait(10000);
  } catch (error) {
    console.log(error);
  } finally {
    // Release the browser process on both the success and the failure path.
    if (browser) {
      try {
        await browser.close();
      } catch (closeError) {
        console.log(closeError);
      }
    }
  }
})();
/**
 * Extracts the text of the first two cells of every row found under the
 * 'div[id*="conteudo"]' element of the given frame.
 *
 * Rows that lack either a first <td> or a <td> immediately after it (for
 * example a row made of a single cell) are skipped, so they no longer cause
 * a TypeError on `null.textContent`.
 *
 * @param {object} frame - Object exposing `evaluate(fn)`, e.g. a Puppeteer frame.
 * @returns {Promise<string>} JSON string of an array of `{ data, descricao }`.
 * @throws {Error} When the expected rows container is not present.
 */
async function buscaFases(frame) {
  // The callback runs through frame.evaluate, so it only uses names it
  // declares itself plus the `document` global.
  return frame.evaluate(() => {
    const container = document.querySelector('div[id*="conteudo"]');
    // Hard-coded position of the rows' parent: fifth child, then its first
    // child (presumably a table and its tbody; verify against the page).
    const fifthChild = container && container.children[4];
    const rowsParent = fifthChild && fifthChild.children[0];
    if (!rowsParent) {
      throw new Error('buscaFases: rows container not found under div[id*="conteudo"]');
    }

    const movimentosInfo = [];
    for (const row of Array.from(rowsParent.children)) {
      const dataCell = row.querySelector("tr td:first-child");
      const descricaoCell = row.querySelector("tr td:first-child + td");
      // querySelector yields null when the cell is missing; skip such rows.
      if (dataCell && descricaoCell) {
        movimentosInfo.push({
          data: dataCell.textContent,
          descricao: descricaoCell.textContent,
        });
      }
    }
    return JSON.stringify(movimentosInfo);
  });
}
用于提取数据的具体代码行如下:
// Reads the text of the row's first <td> and of the <td> right after it.
// querySelector returns null when the row has no such cell, in which case
// reading .textContent throws a TypeError.
let data = row.querySelector("tr td:first-child").textContent;
let descricao = row.querySelector("tr td:first-child + td").textContent;
答案 0 :(得分:0)
问题在于,并非所有 tr 都具有您期望的子元素。
这可能是因为某些 td 标签带有 colspan 属性。
因此,您应该先对数组进行过滤,把这些不符合条件的元素排除掉。
将从 let movimentosInfo = ... 开始、包含 map 调用在内的那几行
改成下面这样:
// Build { data, descricao } entries only from rows that contain both a first
// <td> and a <td> immediately after it; every other row is left out.
let movimentosInfo = rowns.reduce((collected, row) => {
  const firstCell = row.querySelector("tr td:first-child");
  const secondCell = row.querySelector("tr td:first-child + td");
  if (firstCell && secondCell) {
    collected.push({ data: firstCell.textContent, descricao: secondCell.textContent });
  }
  return collected;
}, []);
这样会先加入一个 filter 调用,在用 map 提取内容之前检查所需的元素是否存在。