我遇到了“找不到该节点”的错误，于是改用 jQuery，但仍然没有成功。下面是我目前的代码。page.text 不起作用，我仍然收到节点错误。我正在尝试通过提供案件编号，从 https://web6.seattle.gov/courts/ECFPortal/Default.aspx 抓取案件信息和文件。
const Apify = require('apify');
const {
puppeteer
} = Apify.utils;
/**
 * Takes a full-page screenshot of the given page and stores it through
 * Apify.setValue as a PNG.
 *
 * @param {object} page - Puppeteer page to capture.
 * @param {string} [key='debug-screen'] - Key under which the image is stored.
 * @returns {Promise<void>} Resolves once the screenshot has been stored.
 */
const saveScreen = async (page, key = 'debug-screen') => {
    const screenshotBuffer = await page.screenshot({
        fullPage: true,
    });
    await Apify.setValue(key, screenshotBuffer, {
        contentType: 'image/png',
    });
};
// Actor entry point: opens the ECF portal, clicks the "Case Information" tab,
// enters the case number from the input, clicks search and stores the page
// HTML before and after the search.
Apify.main(async () => {
    const CASE_NUMBER_INPUT = '#ContentPlaceHolder1_CaseDocuments1_CaseSearch1_txtCaseNumber';
    const SEARCH_BUTTON = '#ContentPlaceHolder1_CaseDocuments1_CaseSearch1_btnSearch';
    // NOTE(review): confirm these element IDs belong to the tab opened below
    // ("Case Information" vs. "Case Documents"); jQuery silently does nothing
    // when a selector matches no element.

    const input = await Apify.getInput();
    console.log(`json stringify input: ${JSON.stringify(input)}`);
    // Guard against a missing input and fall back to a known case number so the
    // script can still be run for testing.
    const caseNumber = String((input && input.court_case) || '585344');
    console.log(`CASE NUMBER: ${caseNumber}`);

    // Launch Puppeteer
    const browser = await Apify.launchPuppeteer();
    try {
        const page = await browser.newPage();
        await page.goto('https://web6.seattle.gov/courts/ECFPortal/Default.aspx');

        // Inject jQuery so the :contains() selector used below is available.
        await page.addScriptTag({
            url: 'https://code.jquery.com/jquery-3.2.1.min.js',
        });
        await page.waitForFunction(() => window.jQuery);

        // Every evaluate call is awaited so that an error thrown inside the page
        // surfaces here instead of becoming an unhandled rejection. The callbacks
        // use a block body so no jQuery object is returned to Node.
        await page.evaluate(() => {
            $('span:contains("Case Information")').click();
        });
        // Waiting for navigation is not used here; a fixed delay gives the tab
        // content time to load.
        await page.waitFor(4000);

        // Snapshot of the page before the search, for debugging.
        const htmlBeforeSearch = await page.$eval('body', (e) => e.outerHTML);
        await Apify.setValue('HTMltestOUTPUT', {
            html: htmlBeforeSearch,
            crawledAt: new Date(),
        });
        console.log('html to test.');

        // Use the case number from the input rather than a hard-coded literal.
        await page.evaluate((selector, value) => {
            $(selector).val(value);
        }, CASE_NUMBER_INPUT, caseNumber);
        await saveScreen(page, 'test-screen');
        await page.waitFor(1000);
        console.log('Attempted to enter case number');

        await page.evaluate((selector) => {
            $(selector).click();
        }, SEARCH_BUTTON);
        console.log('Attempted to click button');
        // `await page.waitForNavigation()` times out here, so it is left out.
        // NOTE(review): nothing waits for the search result before the final
        // snapshot below is taken.
        console.log('Attempted to wait for navigation');

        // Get cookies
        // NOTE(review): the cookies are fetched but never used or stored.
        const cookies = await page.cookies();
        console.log('Attempted to wait for cookies');

        // And then save output
        const htmlAfterSearch = await page.$eval('body', (e) => e.outerHTML);
        const output = {
            html: htmlAfterSearch,
            crawledAt: new Date(),
        };
        console.log('My output:');
        console.dir(output);
        await Apify.setValue('OUTPUT', output);
    } finally {
        // Close the browser even when one of the steps above throws.
        await browser.close();
    }
    console.log('Done.');
});
答案 0 :(得分:0)
您的代码的主要问题是：该网站是一个单页 ASPX 应用程序，它不会触发任何页面导航，而是通过 XHR 请求加载所有内容。因此，每次调用 page.waitForNavigation 都会超时。
您可以通过等待页面上的元素变为可见，或者跟踪网络请求来解决此问题。基于这一思路，我重写了您的代码，给出了一个同时使用这两种方法的可运行版本。希望对您有所帮助：
const Apify = require('apify');
const { puppeteer } = Apify.utils;
/**
 * Captures the whole page as an image and hands it to Apify.setValue with a
 * PNG content type.
 *
 * @param {object} page - Puppeteer page to capture.
 * @param {string} [key='debug-screen'] - Key under which the image is stored.
 * @returns {Promise<void>} Resolves after the image has been stored.
 */
const saveScreen = async (page, key = 'debug-screen') => {
    const storeOptions = { contentType: 'image/png' };
    const image = await page.screenshot({ fullPage: true });
    await Apify.setValue(key, image, storeOptions);
};
/**
 * Reads the outer HTML of the page's <body>, wraps it together with the current
 * timestamp and stores the record through Apify.setValue.
 *
 * @param {object} page - Puppeteer page to read from.
 * @param {string} [key='output'] - Key under which the record is stored.
 * @param {boolean} [logOutput=false] - When true, the record is also printed.
 * @returns {Promise<*>} Whatever Apify.setValue resolves to.
 */
const saveHtml = async (page, key = 'output', logOutput = false) => {
    const record = {
        html: await page.$eval('body', (body) => body.outerHTML),
        crawledAt: new Date(),
    };

    if (logOutput) {
        console.log('My output:');
        console.dir(record);
    }

    return Apify.setValue(key, record);
};
// Actor entry point: opens the ECF portal, switches to the fourth menu tab,
// searches for the case number taken from the input and stores the resulting
// page HTML and screenshots.
Apify.main(async () => {
    const PORTAL_URL = 'https://web6.seattle.gov/courts/ECFPortal/Default.aspx';
    const CASE_INFO_TAB = '#ctl00_ContentPlaceHolder1_rtsECFPortal li:nth-child(4) a span';
    const CASE_NUMBER_INPUT = '#ContentPlaceHolder1_CaseInfo1_CaseSearch1_txtCaseNumber';
    const SEARCH_BUTTON = '#ContentPlaceHolder1_CaseInfo1_CaseSearch1_btnSearch';

    const input = await Apify.getInput();
    console.log(`json stringify input: ${JSON.stringify(input)}`);
    // Get case number from input or use default (for testing). Coerced to a
    // string because page.type() expects text and JSON input may supply a number.
    const caseNumber = String((input && input.court_case) || '585344');
    console.log(`CASE NUMBER: ${caseNumber}`);

    // Launch Puppeteer
    const browser = await Apify.launchPuppeteer();
    try {
        const page = await browser.newPage();
        await page.goto(PORTAL_URL);
        console.log('Page opened');

        // Wait for the link in menu to appear and then click on it
        await page.waitForSelector(CASE_INFO_TAB);
        await page.click(CASE_INFO_TAB);
        console.log('Redirecting to case information');

        // Wait for the new content to load and the input to become visible
        await page.waitForSelector(CASE_NUMBER_INPUT, { visible: true });
        console.log('Inputing case number');

        // Input the case number
        await page.type(CASE_NUMBER_INPUT, caseNumber, { delay: 20 });

        // Save current html and screenshot for debugging
        await saveScreen(page, 'search-screen');
        await saveHtml(page, 'search-html');

        // Prepare the waitForResponse promise before clicking: once the button
        // has been clicked it might be too late to start listening.
        const waitForResponsePromise = page.waitForResponse((response) => {
            return response.url().includes('courts/ECFPortal/Default.aspx');
        });

        console.log('clicking on search');
        // Click on the search button
        await page.click(SEARCH_BUTTON);

        // Wait for the xhr request to finish, this means that the case
        // information should be loaded; the short extra delay lets the page
        // render the response before it is captured.
        await waitForResponsePromise;
        await page.waitFor(500);
        console.log('Case information loaded');

        // Save current html and screenshot for debugging
        await saveScreen(page, 'output-screen');
        await saveHtml(page, 'output', true);
    } finally {
        // Close the browser even when one of the steps above throws.
        await browser.close();
    }
    console.log('Done.');
});