如何避免在 Puppeteer 和 PhantomJS 上被检测为机器人?

新手上路,请多包涵

Puppeteer 和 PhantomJS 类似。我遇到的问题都发生在两者上,代码也相似。

我想从一个网站上获取一些信息,这些信息需要通过身份验证才能查看。但我甚至无法访问主页,因为访问被检测为“可疑活动”,如截图所示: https://i.imgur.com/p69OIjO.png

我发现,在 Postman 中测试时,如果带上名为 Cookie 的请求头(其值是从浏览器中抓取的),问题就不会出现,但是这个 cookie 会在一段时间后过期。所以我猜 Puppeteer/PhantomJS 都没有拿到这个 cookie,因为该站点拒绝无头浏览器访问。

我能做些什么来绕过这个?

 // Simple Javascript example
var page = require('webpage').create();
var url = 'https://www.expertflyer.com';

page.open(url, function (status) {
    if( status === "success") {
        page.render("home.png");
        phantom.exit();
    }
});

原文由 Felipe S. Fernandes 发布,翻译遵循 CC BY-SA 4.0 许可协议

阅读 1.4k
1 个回答

如果将来有人需要解决同样的问题,可以使用 puppeteer-extra。

我已经在服务器上测试了代码。第二次运行时有谷歌验证码。您可以自行解决并重启机器人或使用验证码解决服务。

我运行了这段代码 10 次以上,IP 都没有被封禁,后续运行时也没有再次遇到验证码。

但是你仍然有可能再次遇到验证码!

 // npm install puppeteer puppeteer-extra puppeteer-extra-plugin-stealth puppeteer-extra-plugin-adblocker   (readline is built into Node.js and needs no install)
// First CLI argument; the launch code runs Chromium headless only when it is the string 'true'.
const headless_mode = process.argv[2]

// Dependencies.
const readline = require('readline')
const puppeteer = require('puppeteer-extra')
const StealthPlugin = require('puppeteer-extra-plugin-stealth')
const AdblockerPlugin = require('puppeteer-extra-plugin-adblocker')

// Register the plugins on the shared puppeteer-extra instance (stealth first, then the ad blocker).
puppeteer.use(StealthPlugin())
puppeteer.use(AdblockerPlugin({ blockTrackers: true }))

/**
 * Launches a browser through puppeteer-extra, opens expertflyer.com and, once
 * the page is usable, clicks through two menu entries while saving
 * screenshots (expertflyer1.png, expertflyer2.png).
 *
 * When the site answers with its captcha page the user is asked on stdin to
 * solve it (e.g. through the remote-debugging port) and type "yes"; when it
 * answers with the IP-blocked page the walk stops. The browser is always
 * closed before the returned promise settles.
 *
 * @returns {Promise<void>}
 */
async function run () {
  // Upper bounds so an unreachable or permanently blocked site cannot keep the bot looping forever.
  const MAX_GOTO_ATTEMPTS = 3
  const MAX_CHECK_ATTEMPTS = 6
  const RECHECK_DELAY_MS = 10000
  const TITLE_SELECTOR = '#headerTitleContainer > h1'

  const browser = await puppeteer.launch({
    headless: headless_mode === 'true',
    ignoreHTTPSErrors: true,
    slowMo: 0,
    args: [
      '--window-size=1400,900',
      '--remote-debugging-port=9222',
      // WARNING: binding to 0.0.0.0 exposes the DevTools port to every host that can
      // reach this machine. It is kept so the captcha can be solved remotely via
      // chrome://inspect; restrict access with a firewall.
      '--remote-debugging-address=0.0.0.0',
      '--disable-gpu',
      '--disable-features=IsolateOrigins,site-per-process',
      '--blink-settings=imagesEnabled=true'
    ]
  })

  let page
  try {
    page = await browser.newPage()

    console.log(`Testing expertflyer.com`)
    if (await goto_Page('https://www.expertflyer.com')) {
      await waitForNetworkIdle(page, 3000, 0)
      await checking_error(do_2nd_part)
    }
  } finally {
    // Close the browser even when a step above throws, so no browser process is left behind.
    await browser.close()
  }

  /** Resolves after `ms` milliseconds (stands in for the deprecated page.waitFor). */
  function sleep (ms) {
    return new Promise(resolve => setTimeout(resolve, ms))
  }

  /** Returns the innerText of the first element matching `selector`, or undefined when it cannot be read. */
  async function readText (selector) {
    try {
      return await page.$eval(selector, e => e.innerText)
    } catch {
      // A missing element is an expected outcome here, not a failure.
      return undefined
    }
  }

  /** Best-effort step: click a menu link, wait, print the page title if present, save a screenshot. */
  async function visitMenuEntry (linkSelector, screenshotPath) {
    try {
      await page.click(linkSelector)
    } catch (error) {
      // The link may be absent; the walk continues and the screenshot shows what was there.
      console.log(`Could not click ${linkSelector}: ${error.message}`)
    }
    await sleep(5000)

    const title = await readText(TITLE_SELECTOR)
    if (title !== undefined) console.log(title)

    await page.screenshot({ path: screenshotPath })
  }

  /** Second step: open the '#yui-gen2' entry, then continue with the third step if the page is still usable. */
  async function do_2nd_part () {
    await visitMenuEntry('#yui-gen2 > a', 'expertflyer1.png')
    await checking_error(do_3nd_part)
  }

  /** Third and last step: open the '#yui-gen1' entry. */
  async function do_3nd_part () {
    await visitMenuEntry('#yui-gen1 > a', 'expertflyer2.png')
    console.log(`All done, check the screenshots?`)
  }

  /**
   * Checks whether the current page is usable (it contains at least one
   * menu-bar link) and, if so, runs `callback`. Otherwise it distinguishes
   * the captcha page, the IP-blocked page, and "still loading", rechecking
   * the latter up to MAX_CHECK_ATTEMPTS times.
   *
   * Errors are logged rather than rethrown, so a failing step never aborts
   * the caller.
   *
   * @param {() => Promise<void>} callback - next step to run once the page is usable.
   */
  async function checking_error (callback) {
    const captcha_msg = "Due to suspicious activity from your computer, we have blocked your access to ExpertFlyer. After completing the CAPTCHA below, you will immediately regain access unless further suspicious behavior is detected."
    const ip_blocked = "Due to recent suspicious activity from your computer, we have blocked your access to ExpertFlyer. If you feel this block is in error, please contact us using the form below."

    try {
      for (let attempt = 1; attempt <= MAX_CHECK_ATTEMPTS; attempt++) {
        let menuLinkCount
        try {
          menuLinkCount = await page.evaluate(() => document.querySelectorAll('a[class="text yuimenubaritemlabel"]').length)
        } catch (error) {
          // Leaves menuLinkCount undefined, which is treated as "not usable yet" below
          // instead of being mistaken for a successfully loaded page.
          console.log(`catch error ${error}`)
        }

        if (menuLinkCount > 0) {
          console.log(`Page loaded successfully! You can do things here.`)
          await callback()
          return
        }

        console.log(`Error found`)
        const error_msg_details = await readText('body > p:nth-child(2)')

        if (error_msg_details === captcha_msg) {
          console.log(`Google Captcha found, You have to solve the captch here manually or some automation recaptcha service`)
          await verify_User_answer()
          await callback()
          return
        }

        if (error_msg_details === ip_blocked) {
          console.log(`The current ip address is blocked. The only way is change the ip address.`)
          return
        }

        if (attempt < MAX_CHECK_ATTEMPTS) {
          console.log(`Waiting for error page load... Waiting for 10 sec before rechecking...`)
          await sleep(RECHECK_DELAY_MS)
        }
      }

      console.log(`Giving up: the page was still not usable after ${MAX_CHECK_ATTEMPTS} checks.`)
    } catch (error) {
      console.log(`checking_error failed: ${error}`)
    }
  }

  /**
   * Navigates to `page_URL`, retrying up to MAX_GOTO_ATTEMPTS times.
   *
   * @param {string} page_URL
   * @returns {Promise<boolean>} true once the navigation succeeded, false when every attempt failed.
   */
  async function goto_Page (page_URL) {
    for (let attempt = 1; attempt <= MAX_GOTO_ATTEMPTS; attempt++) {
      try {
        await page.goto(page_URL, { waitUntil: 'networkidle2', timeout: 30000 })
        return true
      } catch (error) {
        if (attempt < MAX_GOTO_ATTEMPTS) {
          console.log(`Error in loading page, re-trying...`)
        } else {
          console.log(`Giving up on ${page_URL} after ${MAX_GOTO_ATTEMPTS} failed attempts: ${error.message}`)
        }
      }
    }
    return false
  }

  /**
   * Keeps prompting on stdin until the user answers "yes".
   *
   * @param {() => Promise<void>} [call_back] - optional step to run after the confirmation.
   * @throws {Error} when stdin is closed before a confirmation is given.
   */
  async function verify_User_answer (call_back) {
    for (;;) {
      const user_Answer = await readLine()

      if (user_Answer === null) {
        throw new Error('stdin closed before the captcha was confirmed')
      }
      if (user_Answer.trim() === 'yes') {
        console.log(`user_Answer is ${user_Answer}, Processing...`)
        break
      }
      console.log(`answer not match. try again...`)
    }

    if (typeof call_back === 'function') await call_back()
  }

  /**
   * Asks the confirmation question once.
   *
   * @returns {Promise<string|null>} the typed answer, or null when the input closed without one.
   */
  async function readLine () {
    const rl = readline.createInterface({
      input: process.stdin,
      output: process.stdout
    })

    return new Promise(resolve => {
      // If the input ends first, question()'s callback never fires; resolve with null
      // so the caller does not wait forever.
      rl.once('close', () => resolve(null))

      rl.question('Solve the captcha and type yes to continue: ', answer => {
        // Resolve before closing: close() emits 'close', and only the first resolve counts.
        resolve(answer)
        rl.close()
      })
    })
  }

  /**
   * Resolves once the page has had at most `maxInflightRequests` requests in
   * flight for `timeout` consecutive milliseconds.
   *
   * @param {object} page - emitter of 'request', 'requestfinished' and 'requestfailed' events.
   * @param {number} timeout - quiet period in milliseconds.
   * @param {number} [maxInflightRequests=0] - number of pending requests still counted as idle.
   * @returns {Promise<void>}
   */
  async function waitForNetworkIdle (page, timeout, maxInflightRequests = 0) {
    console.log('waitForNetworkIdle called')

    return new Promise(resolve => {
      let inflight = 0
      let timeoutId = setTimeout(onTimeoutDone, timeout)

      page.on('request', onRequestStarted)
      page.on('requestfinished', onRequestFinished)
      page.on('requestfailed', onRequestFinished)

      function onTimeoutDone () {
        page.removeListener('request', onRequestStarted)
        page.removeListener('requestfinished', onRequestFinished)
        page.removeListener('requestfailed', onRequestFinished)
        resolve()
      }

      function onRequestStarted () {
        inflight += 1
        // Too much traffic: cancel the pending idle timer.
        if (inflight > maxInflightRequests) clearTimeout(timeoutId)
      }

      function onRequestFinished () {
        if (inflight === 0) return
        inflight -= 1
        // Back at the idle threshold: start a fresh quiet period.
        if (inflight === maxInflightRequests) timeoutId = setTimeout(onTimeoutDone, timeout)
      }
    })
  }
}
// Entry point. Report a failed run and signal it through the exit code
// instead of leaving the promise returned by run() unhandled.
run().catch(error => {
  console.error(`Bot run failed: ${error && error.stack ? error.stack : error}`)
  process.exitCode = 1
})

请注意“解决验证码并输入 yes 以继续:”方法未按预期工作,需要一些修复。

编辑:10 分钟后重新运行机器人,又遇到了验证码。我在 chrome://inspect/#devices 上手动解决了验证码,然后重启机器人,一切又恢复正常,IP 也没有被封禁。

原文由 Shimul D 发布,翻译遵循 CC BY-SA 4.0 许可协议

撰写回答
你尚未登录,登录后可以
  • 和开发者交流问题的细节
  • 关注并接收问题和回答的更新提醒
  • 参与内容的编辑和改进,让解决方法与时俱进