使用puppeteer抓取网站数据
- 2019 年 11 月 6 日
- 笔记
记一下使用puppeteer抓取开源中国上的推荐软件数据
1.安装 npm install puppeteer
2.引入
const puppeteer = require('puppeteer');
3.抓取代码
/**
 * Return a promise that resolves after `time` milliseconds.
 *
 * @param {number} time - Delay in milliseconds.
 * @returns {Promise<void>}
 */
const sleep = time => new Promise(resolve => {
  setTimeout(resolve, time);
});

// Page to scrape.
const url = `https://h5.oschina.net`;

/**
 * Launch a browser, open `url`, inject jQuery, read every `.project-item`
 * on the page into an array of { name, title, desc, ident } objects and
 * print that array.
 */
(async () => {
  console.log('Start visit');

  const browser = await puppeteer.launch({
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
    dumpio: false,
  });

  let result;
  try {
    // Open a new page.
    const page = await browser.newPage();
    await page.goto(url, {
      // Network being idle is taken to mean the page has finished loading.
      waitUntil: 'networkidle2',
    });

    // Load jQuery into the page so the extraction code below can use `$`.
    await page
      .mainFrame()
      .addScriptTag({
        url: 'https://cdn.bootcss.com/jquery/3.2.0/jquery.min.js',
      });
    await sleep(1000);

    // Editor-recommended content: wait until the list container exists.
    await page.waitForSelector('.osc-list');

    // This callback runs inside the page, not in Node, so it may only use
    // what exists there (the injected `$`) and must return serializable data.
    result = await page.evaluate(() => {
      // Collected records.
      const dataTemp = [];
      const articles = $('.project-item');

      for (let i = 0; i < articles.length; i++) {
        const article = articles[i];
        const descDoms = $(article).find('.osc-cell__title');

        const name = descDoms.find('.project-item__name').text();
        const title = descDoms.find('.project-item__desc').text();
        const desc = descDoms.find('.content').text();
        const ident = title.toLowerCase();

        dataTemp.push({ name, title, desc, ident });
      }

      return dataTemp;
    });
  } finally {
    // Always close the browser, even when navigation or extraction throws,
    // so no browser process is left running.
    await browser.close();
  }

  console.log(result);
})().catch(err => {
  // Report the failure instead of leaving an unhandled rejection.
  console.error(err);
  process.exitCode = 1;
});
4.运行 node app.js