我尝试抓取的网站阻止了我,因为我使用的是自动化工具,如何解决这个问题?

时间:2019-03-27 05:58:20

标签: node.js web-scraping puppeteer

当我尝试使用 Puppeteer 抓取某些页面时,被 chegg.com 拦截了。有什么办法可以解决这个问题?也许可以使用 Chromium 的隐身(无痕)模式?

enter image description here

当我尝试登录时会出现错误,网站阻止了我登录。

我还记得曾读到过,可以通过设置或删除某些请求头(headers)来解决这个问题?

我只打算大约每分钟抓取一次该页面,也可能每小时才抓取一次,并不算频繁。

const puppeteer = require("puppeteer");
const CREDS = require("./creds");
var SlackBot = require("slackbots");
// Slack channel the bot posts its text replies to.
const channel = "testing";

// Hint posted when a message does not contain a chegg.com link.
const tryAgain =
  "Try again, nerd. Post a plain text chegg link Ex. https://www.chegg.com/homework-help/questions-and-answers/assume-unc...";

// Same hint with the link wrapped in <...>. handleMessage compares incoming
// messages against this form so the bot does not answer its own hint.
// NOTE(review): presumably this is how Slack echoes the link back - confirm.
const tryAgainTwo =
  "Try again, nerd. Post a plain text chegg link Ex. <https://www.chegg.com/homework-help/questions-and-answers/assume-unc>...";

// Base name (without extension) of the screenshot file; handleMessage also
// uses it to recognise and ignore messages that mention the file.
const imageName = "workplease2";

// Slack bot client used for every text reply in this file.
// NOTE(review): the token is hard-coded here; load it from configuration
// rather than committing a real secret to source control.
const bot = new SlackBot({
  token: "--SNIP--",
  name: "cheggy"
});

// bot.on("start", function() {
//   bot.postMessageToChannel(channel, "Hello world!");
// });

// Forward the text of every event of type "message" to handleMessage;
// all other event types are ignored.
bot.on("message", (event) => {
  const isChatMessage = event.type === "message";
  if (isChatMessage) {
    handleMessage(event.text);
  }
});

/**
 * Reacts to the text of one Slack message.
 *
 * - A message containing "chegg.com/" (and not the bot's own hint) gets an
 *   acknowledgement and starts run().
 * - Messages that look like the bot's own output (the hint, the
 *   acknowledgement, or anything mentioning the screenshot name) are ignored.
 * - Anything else is answered with the tryAgain hint.
 *
 * @param {string} [message] - Message text; the caller passes `data.text`
 *   through unchecked, so it may be missing.
 * @returns {void}
 */
function handleMessage(message) {
  // Without this guard a missing text makes `.includes` throw a TypeError
  // inside the event handler.
  if (typeof message !== "string") {
    return;
  }

  const isCheggRequest =
    message.includes("chegg.com/") && !message.includes("Try again, nerd.");

  if (isCheggRequest) {
    bot.postMessageToChannel(
      channel,
      "Give me a sec to find that for you. XOXO"
    );
    // run() is asynchronous; handle its rejection here so a failed scrape is
    // logged instead of surfacing as an unhandled promise rejection.
    run().catch(function(error) {
      console.error("Failed to fetch the chegg page:", error);
    });
    return;
  }

  // Skip the bot's own messages so it does not reply to itself.
  const isOwnMessage =
    message === tryAgainTwo ||
    message.includes("sec to find") ||
    message.includes(imageName);
  if (isOwnMessage) {
    return;
  }

  bot.postMessageToChannel(channel, tryAgain);
  console.log("XXXXXXXXXXXXXXXXXXXXthis is the message!" + message);
}

/**
 * Posts one greeting, chosen by getGreeting(), to the configured channel.
 */
function sendGreeting() {
  bot.postMessageToChannel(channel, getGreeting());
}

/**
 * Picks one greeting at random from a fixed list.
 *
 * @returns {string} One of the five built-in greetings.
 */
function getGreeting() {
  const options = ["hello!", "hi there!", "cheerio!", "how do you do!", "¡hola!"];
  const pick = Math.floor(Math.random() * options.length);
  return options[pick];
}

/**
 * Pauses for the given number of milliseconds.
 *
 * @param {number} ms - Delay handed to setTimeout.
 * @returns {Promise<undefined>} Settles once the timer has fired.
 */
async function timeout(ms) {
  await new Promise((wake) => {
    setTimeout(wake, ms);
  });
}

/**
 * Logs in to Chegg with the credentials from CREDS, opens a question page,
 * saves a full-page screenshot of it and uploads that file to Slack.
 *
 * @param {string} [questionUrl] - Page to capture. Defaults to the sample
 *   question URL this function previously had hard-coded.
 * @returns {Promise<void>} Settles after Slack has answered the upload.
 * @throws Rejects when a browser step or the upload request fails.
 */
async function run(
  questionUrl = "https://www.chegg.com/homework-help/questions-and-answers/assume-uncle-holds-one-stock-east-coast-bank-ecb-thinks-little-risk-agree-stock-relatively-q9069609"
) {
  const fs = require("fs");
  const request = require("request");

  const LOGIN_URL = "https://www.chegg.com/auth?action=login&reset_password=0";
  const USERNAME_SELECTOR = "#emailForSignIn";
  const PASSWORD_SELECTOR = "#passwordForSignIn";
  const SLACK_TOKEN = "--SNIP--";
  const SLACK_CHANNEL = "general";

  // Single source of truth for the file: the screenshot used to be written
  // without an extension while the upload read "<name>.png".
  const screenshotPath = "./myfolder/" + imageName + ".png";

  const browser = await puppeteer.launch({
    headless: false
  });

  try {
    const page = await browser.newPage();

    await page.goto(LOGIN_URL);

    await page.click(USERNAME_SELECTOR);
    await page.keyboard.type(CREDS.email);

    await page.click(PASSWORD_SELECTOR);
    await page.keyboard.type(CREDS.password);

    // Start waiting before submitting so a fast navigation is not missed.
    await Promise.all([
      page.waitForNavigation(),
      page.keyboard.press("Enter")
    ]);

    await page.goto(questionUrl);

    // Short pause before capturing the page.
    await timeout(1000);

    await page.screenshot({ path: screenshotPath, fullPage: true });
  } finally {
    // Always release the browser, even when a step above failed.
    await browser.close();
  }

  console.log("do you reach me?");

  const options = {
    method: "POST",
    url: "https://slack.com/api/files.upload",
    headers: { "cache-control": "no-cache" },
    formData: {
      token: SLACK_TOKEN,
      channels: SLACK_CHANNEL,
      file: fs.createReadStream(screenshotPath)
    }
  };

  // Wrap the callback API so an upload failure rejects this function's
  // promise instead of throwing from inside the callback.
  const body = await new Promise(function(resolve, reject) {
    request(options, function(error, response, responseBody) {
      if (error) {
        reject(error);
        return;
      }
      resolve(responseBody);
    });
  });

  console.log(body);
}

1 个答案:

答案 0 :(得分:0)

保存 cookie,设置浏览器(例如 User-Agent),并降低 Puppeteer 的操作速度,使其接近人类的操作速度。

// Launch a visible browser with a persistent user data directory and every
// Puppeteer operation slowed down. Must run inside an async function.
const browser = await puppeteer.launch({
  headless: false,
  devtools: true,
  slowMo: 250, // milliseconds of delay added to each Puppeteer operation
  userDataDir: 'C:\\userData' // userDataDir <string> Path to a User Data Directory.
});

// browser.pages() resolves to an array of open pages; use the first one.
const pages = await browser.pages();
const page = pages[0];

// A complete user agent string starts with the "Mozilla/" product token.
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36');