Node js爬虫

时间:2019-02-15 10:53:44

标签: node.js typescript web-scraping puppeteer cheerio

我已经用打字稿写了一个刮板,在 node:10.12.0 上运行

问题:该代码会在几个小时后随机进入睡眠状态。我不得不重新启动它。我最好的猜测是它卡在 url请求

使用以下工具/软件包:

  • 木偶
  • Cheerio
  • 打字稿

代码:

import * as cheerio from "cheerio";
import * as request from "request";
import * as fs from "fs";
import * as shell from "shelljs";
import pup = require("puppeteer");
class App {
    // @ts-ignore
    public browser: pup.Browser;
    public appendToFile(file: string, content: string): Promise < string > {
        return new Promise < string > ((resolve, reject) => {
            try {
                fs.appendFileSync(file, content);
                resolve("DONE");
            } catch (e) {
                reject(e);
            }
        });
    }
    public loadPage(url: string): Promise < any > {
        return new Promise < any > ((resolve, reject) => {
            request.get(url, async (err, res, html) => {
                if (!err && res.statusCode === 200) {
                    resolve(html);
                } else {
                    if (err) {
                        reject(err);
                    } else {
                        reject(res);
                    }
                }
            });
        });
    }
    public step1(url: string): Promise < string > {
        return new Promise < string > (async (resolve, reject) => {
            let page: pup.Page | undefined;
            try {
                let next = false;
                let urlLink = url;
                let first = true;
                let header = "unknown";
                let f = url.split("/");
                let folder = f[f.length - 3];
                folder = folder || header;
                let path = "data/" + folder;
                shell.mkdir("-p", path);
                page = await this.browser.newPage();

                await page.goto(url, {
                    timeout: 0
                });
                let count = 1;
                do {
                    next = false;
                    let res = await page.evaluate(() => {
                        let e = document.querySelectorAll(".ch-product-view-list-container.list-view li ul > li > h6 > a");
                        let p: string[] = [];
                        e.forEach((v) => {
                            p.push(("https://www.link.com") + (v.getAttribute("href") as string));
                        });
                        return p;
                    });

                    // for(const l of res) {
                    //     try {
                    //         await this.step2(l, "" , "")
                    //     } catch(er) {
                    //         this.appendToFile("./error.txt", l + "::" + url + "\n").catch(e=>e)
                    //     }
                    // }

                    let p = [];
                    let c = 1;
                    for (const d of res) {
                        p.push(await this.step2(d, folder, c.toString()).catch((_e) => {
                            console.log(_e);
                            fs.appendFileSync("./error-2.txt", urlLink + " ### " + d + "\n");
                        }));
                        c++;
                    }
                    await Promise.all(p);

                    await this.appendToFile("./processed.txt", urlLink + ":" + count.toString() + "\n").catch(e => e);
                    count++;
                    console.log(urlLink + ":" + count);
                    let e = await page.evaluate(() => {
                        let ele = document.querySelector("#pagination-next") as Element;
                        let r = ele.getAttribute("style");
                        return r || "";
                    });
                    if (e === "") {
                        next = true;

                        await page.click("#pagination-next");
                        // console.log('waitng')
                        await page.waitFor(1000);
                        // console.log('done wait')
                        // await page.waitForNavigation({waitUntil: 'load'}).catch(e=> console.log(e));
                        //     await Promise.all([
                        //         page.click("#pagination-next"),
                        //         page.waitForNavigation({ waitUntil: 'networkidle0'}),                //   ]);
                    }
                } while (next);
                // await page.close();
                resolve("page all scrapped");
            } catch (errrr) {
                reject(errrr);
            } finally {
                if (page !== undefined) {
                    await page.close().catch(e => e);
                }
            }
        });
    }
    public step2(url: string, folder: string, file: string): Promise < string > {
        return new Promise < string > (async (resolve, reject) => {
            try {
                let html = await this.loadPage(url).catch(e => reject(e));
                let $ = cheerio.load(html);
                let ress: any = {};
                let t = $(".qal_title_heading").text();
                if (t) {
                    ress.header = t.replace(/"/g, "'").replace(/\n|\r|\t/g, "");
                }
                let d = $("div.ch_formatted_text.qal_thread-content_text.asker").html();
                if (d) {
                    ress.body = d.replace(/"/g, "'").replace(/\n|\r|\t/g, "");
                }
                // let sprit = "-------------------------------";
                let filename = "data" + file + ".json"; // ((t.replace(/[^\w\s]/gi, "")).substring(0,250)+".txt")
                let data = JSON.stringify(ress) // t +sprit + d + "\n---end---\n";                await this.appendToFile("./data/"+ folder + "/" +filename, data+",\n")
                    .then((r) => {
                        resolve(r);
                    });
            } catch (err) {
                reject(err);
            }
        });
    }
}
async function main() {
    process.on("SIGTERM", () => {
        console.log("SigTerm received");
        process.exit(1);
    });
    process.on("SIGINT", () => {
        console.log("SigInt received");
        process.exit(1);
    });
    let path = "data/unknown";
    shell.mkdir("-p", path);
    let c = new App();
    let list: string[] = [];
    console.log(process.argv[2]);
    require("fs").readFileSync(process.argv[2], "utf-8").split(/\r?\n/).forEach((line: string) => {
        list.push(line);
    });
    console.log("total links->" + list.length);

    c.browser = await pup.launch({
        headless: true
    });
    for (const l of list) {
        await c.step1(l).then(e => {
            fs.appendFileSync("./processed.txt", l);
        }).catch(e => {
            fs.appendFileSync("./error.txt", l);
        });
    }
}
main();

让我知道您是否还需要我提供其他服务。这也是所有代码。

1 个答案:

答案 0 :(得分:0)

所以,我想到了两个问题。

  1. Chrome(在操纵up下)消耗大量CPU,这是这样的趋势:

    开始时使用率适中。并逐渐增加。我的趋势是开始时使用率为4%,一天后达到100%。我已经针对他们的git

  2. 提交了问题
  3. 我没有在请求中指定超时

    是:

    request.get(url, async (err, res, html) => { 
    

    应为:

    request.get(url,{timeout: 1500} async (err, res, html) => {
    

到目前为止,我的代码已经运行了一天以上。唯一的问题是CPU使用率高。但这与我现在无关。