Scraping a Personal Weibo Feed: Ajax Data Crawling
- October 10, 2019
- Notes
Copyright notice: This is the blogger's original article, licensed under the CC 4.0 BY-SA agreement. Please include the original source link and this notice when reposting.
Original article: https://blog.csdn.net/weixin_40313634/article/details/90141841
Target site: https://m.weibo.cn/u/2830678474
Code
```python
from urllib.parse import urlencode
import requests
import json
import os
from pyquery import PyQuery as pq

base_url = 'https://m.weibo.cn/api/container/getIndex?'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest'
}

# Fetch a single page of posts from the Ajax API as JSON
def get_page(page):
    params = {
        'uid': '2830678474',
        'luicode': '10000011',
        'lfid': '1076032830678474',
        'type': 'uid',
        'value': '2830678474',
        'containerid': '1076032830678474',
        'page': page
    }
    url = base_url + urlencode(params)
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.json()
    except requests.ConnectionError as e:
        print('Error', e.args)
    return None

# Parse the plain text of each post out of the JSON response
def parse_page(data):
    weibo = []
    if data:
        items = data.get('data').get('cards')
        for item in items:
            mblog = item.get('mblog')
            if mblog is not None:
                # the post body is an HTML fragment; pyquery strips the tags
                text = pq(mblog.get('text')).text()
                weibo.append(text + '\n')
    return weibo

# Get the total number of pages (10 posts per page)
def sum_page():
    data = get_page(1)
    if data:
        total_item = data.get('data').get('cardlistInfo').get('total')
        total = int(total_item / 10) + 1
    else:
        total = 100
    return total

if __name__ == '__main__':
    total = sum_page()
    # make sure the output directory for the raw JSON exists
    os.makedirs(os.path.join(os.getcwd(), 'tmp'), exist_ok=True)
    for page in range(1, total + 1):
        data = get_page(page)
        if not data:
            continue
        weibo = parse_page(data)
        # append the parsed post texts to weibo.txt
        with open('weibo.txt', 'a', encoding='utf-8') as f:
            for t in weibo:
                f.write(t)
        # save each page's raw JSON under tmp/
        file = os.path.join(os.getcwd(), 'tmp', str(page) + '.json')
        with open(file, 'w', encoding='utf-8') as f:
            f.write(json.dumps(data, indent=2, ensure_ascii=False))
```
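For reference, the sketch below shows the rough shape of the JSON that `parse_page` and `sum_page` expect back from the `getIndex` endpoint. It is a hypothetical, trimmed-down example: the values are invented placeholders, only the keys the script actually reads are shown, and real responses carry many more fields.

```python
# Hypothetical, minimal example of the Ajax response structure.
# Values are placeholders; only the keys the script reads are included.
sample_response = {
    'data': {
        'cardlistInfo': {
            'total': 256  # total post count; sum_page() derives the page count from this
        },
        'cards': [
            {
                'mblog': {
                    # post body arrives as an HTML fragment; pyquery strips the tags
                    'text': '<a href="/status/123">Example post text</a>'
                }
            },
            {'card_type': 9}  # a card without an 'mblog' key is skipped by parse_page()
        ]
    }
}
```

Feeding this dict straight into `parse_page(sample_response)` returns `['Example post text\n']`, which is a quick way to sanity-check the parser without hitting the network.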