使用puppeteer抓取网站数据

  • 2019 年 11 月 6 日
  • 笔记

记一下使用puppeteer抓取开源中国上的推荐软件数据

1.安装 npm install puppeteer

2.引入

const puppeteer = require('puppeteer');

3.抓取代码

/**
 * Resolve after `time` milliseconds.
 *
 * @param {number} time - Delay in milliseconds.
 * @returns {Promise<void>} Promise that settles once the timer fires.
 */
const sleep = time => new Promise(resolve => {
    setTimeout(resolve, time);
});

// Page whose recommended-project list is scraped.
const url = 'https://h5.oschina.net';

/**
 * Launch a browser, open `url`, inject jQuery, and collect the
 * name / title / desc / ident of every `.project-item` element on the page.
 * The collected array is printed to stdout after the browser is closed.
 */
(async () => {
    console.log('Start visit');

    const browser = await puppeteer.launch({
        args: ['--no-sandbox', '--disable-setuid-sandbox'],
        dumpio: false
    });

    let result;
    try {
        // Open a new page (tab).
        const page = await browser.newPage();

        await page.goto(url, {
            // Treat the page as loaded once the network has gone (almost) idle.
            waitUntil: 'networkidle2'
        });

        // Inject jQuery so the in-page extraction code below can use `$`.
        await page
            .mainFrame()
            .addScriptTag({
                url: 'https://cdn.bootcss.com/jquery/3.2.0/jquery.min.js'
            });

        // Extra grace period kept from the original script.
        await sleep(1000);

        // Wait until the recommended-content list has been rendered.
        await page.waitForSelector('.osc-list');

        // This callback runs inside the page, not in Node: it can only see
        // page globals (such as the injected `$`) and must return
        // serializable data.
        result = await page.evaluate(() => {
            const dataTemp = [];
            const articles = $('.project-item');

            for (let i = 0; i < articles.length; i++) {
                const descDoms = $(articles[i]).find('.osc-cell__title');
                const name = descDoms.find('.project-item__name').text();
                const title = descDoms.find('.project-item__desc').text();
                const desc = descDoms.find('.content').text();
                const ident = title.toLowerCase();

                dataTemp.push({ name, title, desc, ident });
            }
            return dataTemp;
        });
    } finally {
        // Always shut the browser down, even when a step above throws;
        // otherwise the Chromium process would be left running.
        await browser.close();
    }

    console.log(result);
})().catch(err => {
    // Report the failure explicitly instead of leaving an unhandled rejection.
    console.error(err);
    process.exitCode = 1;
});

4.运行 node app.js