关于node.js爬取Vue页面某数据爬取不到的问题?

爬取的目标页面:https://music.gala.com/artists/jaq

想要获取的数据:Total Listens

我的代码:

const puppeteer = require('puppeteer');
const cheerio = require('cheerio');
const info = [];
const hrefLine = [];

(async () => {
  const browser = await puppeteer.launch({ headless: true });
  const page = await browser.newPage();
  await page.goto('https://music.gala.com/discover?page=artists', { waitUntil: 'networkidle0', timeout: 60000 });
  const html = await page.content();
  const $ = cheerio.load(html);
  const artists = $('.artist-card');
  for (const artist of artists) {
    const name = $(artist).find('.artist-card__artist').text().trim();
    const _name = name.replace(/[.\s!&]+/g, '-').toLowerCase();
    const href = `https://music.gala.com/artists/${_name}`;
    hrefLine.push(href);
  }
  await browser.close();

  await crawl(hrefLine);
  console.log(info);
})();

async function crawl(hrefLine) {
  const browser = await puppeteer.launch({args: ['--no-sandbox']});
  const page = await browser.newPage();

  for (let i = 0; i < hrefLine.length; i++) {
    const url = hrefLine[i];
    console.log(`Crawling ${url}...`);

    try {
      await page.goto(url, { waitUntil: 'networkidle0' });
      const html = await page.content();
      const $ = cheerio.load(html);
      const name1 = $('.self-start').eq(0).text();
      const collectorsDiv = $('div').filter(function() {
        return $(this).text().includes('Collectors');
      });

      const Total_Listens = $('div.leading-10 span').text();

      const level = $('#__nuxt div.hero div.capitalize').text();
      
      const Collectors = collectorsDiv.eq(11).next('div').text();
      console.log(`${level}艺术家:${name1}的歌曲总共被听了${Total_Listens}次,已被${Collectors}人收藏`);
      const per_art = {
        name1,
        level,
        Total_Listens,
        Collectors
      };
      info.push(per_art);
      console.log(per_art);
    } catch (error) {
      console.error(`Failed to crawl ${url}: ${error}`);
    }
  }

  await browser.close();
}

其他数据都可以爬取到,就这个Total Listens怎么爬都是空字符串,不知道是否由于双向绑定或者是虚拟dom的问题,请各位大佬赐教,十分感谢!

阅读 2k
1 个回答

SPA 渲染需要时间,所以不能只管顺序读写,要在每一步不断检查标志物,直到其出现,再往后执行。

撰写回答
你尚未登录,登录后可以
  • 和开发者交流问题的细节
  • 关注并接收问题和回答的更新提醒
  • 参与内容的编辑和改进,让解决方法与时俱进
推荐问题