Puppeteer:循环遍历通过XPath选择的链接

时间:2018-06-21 22:08:10

标签: xpath puppeteer

我是puppeteer的新手(对JavaScript总体上也不太熟悉),正在尝试编写一些基本功能来完成以下内容:

  • 从XPath获取所有链接
  • 浏览并单击那些链接
  • 截屏并保存页面的HTML
  • 返回上一页,对记录列表页截屏并保存其HTML(与其他页面保存在同一目录中),然后重新开始该过程

我得到的错误是:

  

评估失败:DOMException:无法对“文档”执行“ querySelector”:“ 0”不是有效的选择器

这是我的代码:

enter image description here

我非常有信心其余代码都能正常工作,问题只在于我无法通过XPath点击到正确的元素。我抓取这些信息的网站是:

https://hrlb.oregon.gov/bspa/licenseelookup/searchdir.asp?searchby=lastname&searchfor=a&stateselect=none&Submit=Search

代码:

// Question's original snippet: for every link matched by the XPath, open the
// record page, archive a screenshot and its body HTML, go back, and archive the
// list page the same way under screenshots/item<N>/.
// Presumably page.$x resolves to an array of element handles — confirm against
// the Puppeteer API docs.
const records = await page.$x('//table[2]//tr[td[a]]//td[1]/a');
let int = 0;
// NOTE(review): for...in iterates the array's keys ("0", "1", ...), not its
// values, so `record` is the string "0" on the first pass. page.click then
// receives "0" as a selector, which matches the reported error
// '"0" is not a valid selector'.
for (let record in records) {
    // Start waiting for the navigation together with the click that triggers it.
    await Promise.all([
        page.waitForNavigation(),
        page.click(record)
    ]);

    // Create the per-item output directories (item, item/base, item/record).
    await Promise.all([makeDirectory('screenshots/item'+int), makeDirectory('screenshots/item'+int+'/base'), makeDirectory('screenshots/item'+int+'/record')]);
    let recordPath = "screenshots/item"+int+"/record/record.html";
    let basePath = "screenshots/item"+int+"/base/base.html";

    // NOTE(review): this screenshot call is not awaited, unlike the one taken
    // after goBack below.
    page.screenshot({path: "screenshots/item"+int+"/record/record.png", fullPage: true});
    let recordBody = await page.evaluate(() => document.body.innerHTML);
    await saveHtml(recordPath, recordBody);

    // Return to the previous page before capturing the "base" snapshot.
    await Promise.all([
        page.waitForNavigation(),
        page.goBack()
    ]);

    await page.screenshot({path: "screenshots/item"+int+"/base/base.png", fullPage: true});
    let baseBody = await page.evaluate(() => document.body.innerHTML);
    await saveHtml(basePath, baseBody);

    int++;
    console.log(record);
}

/**
 * Creates the directory at `path` using mkdirp's callback API.
 *
 * The callback is wrapped in a Promise so that awaiting this function really
 * waits for the directory to exist, and a failure rejects the returned promise
 * instead of being thrown from inside the callback where no caller can catch it.
 *
 * @param {string} path - Directory to create.
 * @returns {Promise<void>} Resolves once mkdirp reports success; rejects with mkdirp's error.
 */
function makeDirectory(path) {
    return new Promise((resolve, reject) => {
        mkdirp(path, (err) => {
            if (err) {
                reject(err);
                return;
            }
            resolve();
        });
    });
}

/**
 * Writes `html` to the file at `path`.
 *
 * The callback form of fs.writeFile returns nothing to await, so it is wrapped
 * in a Promise: the returned promise settles only after the write has finished,
 * and a write error rejects it instead of being thrown from inside the callback.
 *
 * @param {string} path - Destination file path.
 * @param {string|Buffer} html - Content to write.
 * @returns {Promise<void>} Resolves when the file has been written; rejects with the write error.
 */
function saveHtml(path, html) {
    return new Promise((resolve, reject) => {
        fs.writeFile(path, html, (err) => {
            if (err) {
                reject(err);
                return;
            }
            resolve();
        });
    });
}

注意:我需要使用XPath:(

更新(2018-06-25):现在,下面的代码可以得到XPath选择器匹配到的所有链接。然后我遍历这些链接,直接使用page.goto跳转到对应的页面。

// XPath used to locate the record links on the results page.
const linksXPath = '//table[2]//tr[td[a]]//td[1]/a';

// Evaluate the XPath inside the page and collect the href of every matched
// node, in the order of the ordered snapshot.
const links = await page.evaluate((xpath) => {
    const snapshot = document.evaluate(
        xpath,
        document,
        null,
        XPathResult.ORDERED_NODE_SNAPSHOT_TYPE,
        null
    );
    return Array.from(
        { length: snapshot.snapshotLength },
        (_, index) => snapshot.snapshotItem(index).href
    );
}, linksXPath);

2 个答案:

答案 0 :(得分:0)

我认为这是您的选择器。

我相信您的表格选择器应为:

"body > table > tbody > tr:nth-child(2) > td > table > tbody > tr:nth-child(1) > td > table.bodytext > tbody"

获取页面正确选择器的最简单方法是使用Chrome开发工具。

检查页面,然后转到“元素”(Elements)选项卡。在那里,您应该能看到所有HTML元素。右键单击您想要的元素(我选的是<tbody>,这样就可以遍历其中的<tr>元素),然后选择 Copy > Copy selector(复制选择器)。

答案 1 :(得分:0)

我的代码现在可以完成所需的操作,但我希望有一种更简单的方法。此外,您会看到,在遍历链接时,我是使用page.goto函数跳转过去的。我仍然不知道如何用page.click来做到这一点:我必须使用XPath获取所有td,然后逐个单击它们,但我始终没能让它工作。以下是可以正常工作的代码:

const puppeteer = require('puppeteer');
const fs = require('fs');
const mkdirp = require('mkdirp');

/**
 * Runs the whole scrape: submits the licensee search for last names matching
 * "a", collects every result link via XPath, then visits each link and stores
 * under screenshots/item<N>/ a screenshot and the body HTML of the record page
 * (record/) together with a copy of the result-list snapshot (base/).
 *
 * @returns {Promise<void>} Resolves after every link has been processed and the browser is closed.
 */
async function run() {
    const pageToClick = 'body > table > tbody > tr:nth-child(3) > td > table > tbody > tr > td > form > table > tbody > tr:nth-child(3) > td > div > input[type="submit"]';
    const select = 'body > table > tbody > tr:nth-child(3) > td > table > tbody > tr > td > form > table > tbody > tr:nth-child(1) > td:nth-child(2) > select';
    const inputField = 'body > table > tbody > tr:nth-child(3) > td > table > tbody > tr > td > form > table > tbody > tr:nth-child(2) > td:nth-child(2) > input[type="text"]:nth-child(1)';
    const linksXPath = '//table[2]//tr[td[a]]//td[1]/a';
    const browser = await puppeteer.launch({
        headless: true
    });
    // Close the browser even when a step below throws, so a failed run does
    // not leave a browser process behind.
    try {
        const page = await browser.newPage();
        await page.goto('https://hrlb.oregon.gov/bspa/licenseelookup/');
        await page.select(select, 'lastname');
        await page.focus(inputField);
        await page.keyboard.type('a');
        // Start waiting for the navigation together with the click that submits the form.
        await Promise.all([
            page.waitForNavigation(),
            page.click(pageToClick)
        ]);

        // Resolve the XPath inside the page and bring back plain href strings.
        const links = await page.evaluate((selector) => {
            const results = [];
            const query = document.evaluate(selector,
                document,
                null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
            for (let i = 0, length = query.snapshotLength; i < length; ++i) {
                results.push(query.snapshotItem(i).href);
            }
            return results;
        }, linksXPath);

        // The result list is captured once (no path given, so the screenshot
        // is kept in memory) and written into every item's base/ folder below.
        const basePic = await page.screenshot({fullPage: true});
        const baseBody = await page.evaluate(() => document.body.innerHTML);

        // `let` keeps the index local; the original assigned an undeclared `i`.
        for (let i = 0; i < links.length; i++) {
            const itemDir = 'screenshots/item' + i;

            await Promise.all([
                page.waitForNavigation(),
                page.goto(links[i])
            ]);

            await Promise.all([makeDirectory(itemDir), makeDirectory(itemDir + '/base'), makeDirectory(itemDir + '/record')]);
            const recordPath = itemDir + '/record/record.html';
            const basePath = itemDir + '/base/base.html';
            const basePicPath = itemDir + '/base/base.png';

            await page.screenshot({path: itemDir + '/record/record.png', fullPage: true});
            const recordBody = await page.evaluate(() => document.body.innerHTML);
            await saveFile(recordPath, recordBody);

            // Return to the result list before handling the next link.
            await Promise.all([
                page.waitForNavigation(),
                page.goBack()
            ]);

            await saveFile(basePath, baseBody);
            await saveFile(basePicPath, basePic);
        }
        await page.close();
    } finally {
        await browser.close();
    }
}

/**
 * Creates the directory at `path` using mkdirp's callback API.
 *
 * The callback is wrapped in a Promise so that awaiting this function really
 * waits for the directory to exist, and a failure rejects the returned promise
 * instead of being thrown from inside the callback where no caller can catch it.
 *
 * @param {string} path - Directory to create.
 * @returns {Promise<void>} Resolves once mkdirp reports success; rejects with mkdirp's error.
 */
function makeDirectory(path) {
    return new Promise((resolve, reject) => {
        mkdirp(path, (err) => {
            if (err) {
                reject(err);
                return;
            }
            resolve();
        });
    });
}

/**
 * Writes `html` to the file at `path`.
 *
 * The callback form of fs.writeFile returns nothing to await, so it is wrapped
 * in a Promise: the returned promise settles only after the write has finished,
 * and a write error rejects it instead of being thrown from inside the callback.
 *
 * @param {string} path - Destination file path.
 * @param {string|Buffer} html - Content to write (text, or binary data such as a screenshot).
 * @returns {Promise<void>} Resolves when the file has been written; rejects with the write error.
 */
function saveFile(path, html) {
    return new Promise((resolve, reject) => {
        fs.writeFile(path, html, (err) => {
            if (err) {
                reject(err);
                return;
            }
            resolve();
        });
    });
}

// Start the scrape. Report any failure and exit non-zero instead of leaving
// the returned promise's rejection unhandled.
run().catch((err) => {
    console.error(err);
    process.exitCode = 1;
});