一不小心,我爬取了100萬條微博評論

  • 2020 年 3 月 10 日
  • 筆記

↑ 關注 + 星標 ~ 有趣的不像個技術號

每晚九點,我們準時相約

鄭重聲明:本項目及所有相關文章,僅用於經驗技術交流,禁止將相關技術應用到不正當途徑,因為濫用技術產生的風險與本人無關

大家好,我是朱小五

今天給大家分享一篇文章用來學習,是關於微博評論的爬蟲。

作者宗傑,已經在源碼關鍵處做了注釋。

所以這可能是「凹凸數據」有史以來代碼最多、文字最少的一篇了

(溫馨提醒:完整代碼地址在文末!文末!)

下面見證奇蹟的時刻:

1、引入庫

import time  import base64  import rsa  import binascii  import requests  import re  from PIL import Image  import random  from urllib.parse import quote_plus  import http.cookiejar as cookielib  import csv  import os

2、一些全局變量的設置

# Directory the per-post comment CSV files are written into.
comment_path = 'comment'

# Desktop Chrome User-Agent sent with every request.
# BUGFIX: the original string was transcription-mangled ("mozilla",
# "windowS", "appLewEbkit", "likE gecko"); restored to the standard form.
agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'

# Default headers shared by all requests in this script.
headers = {'User-Agent': agent}

3、創建用於存放數據的目錄

# Create the output directory for the scraped comments.
# BUGFIX: os.path.exists() followed by os.mkdir() is a check-then-create
# race; makedirs(..., exist_ok=True) is atomic with respect to that check.
os.makedirs(comment_path, exist_ok=True)

4、登錄類的創建

class WeiboLogin(object):
    """Log in to weibo.com and bridge the session over to m.weibo.cn.

    Implements the sina SSO login flow (ssologin.js v1.4.19): pre-login to
    fetch the RSA parameters, RSA-encrypt the password, post the login form
    (entering a captcha when required), then follow the SSO redirects so the
    same session is authenticated on m.weibo.cn.  On success the session
    cookies are saved to ``cookie_path`` for reuse by later runs.
    """

    def __init__(self, user, password, cookie_path):
        """user: login name (email / phone); password: plain-text password;
        cookie_path: file the LWP cookie jar is persisted to."""
        super(WeiboLogin, self).__init__()
        self.user = user
        self.password = password
        self.session = requests.Session()
        self.cookie_path = cookie_path
        # LWPCookieJar can save cookies to a file and load them back later.
        self.session.cookies = cookielib.LWPCookieJar(filename=self.cookie_path)
        self.index_url = "http://weibo.com/login.php"
        # Warm up the session so it holds the cookies the login page sets.
        self.session.get(self.index_url, headers=headers, timeout=2)
        self.postdata = dict()

    def get_su(self):
        """Return the encoded username (the "su" form field).

        JavaScript's encodeURIComponent corresponds to urllib.parse.quote_plus
        in Python 3; the quoted name is then base64-encoded.
        """
        username_quote = quote_plus(self.user)
        username_base64 = base64.b64encode(username_quote.encode("utf-8"))
        return username_base64.decode("utf-8")

    def get_server_data(self, su):
        """Pre-login request: fetch servertime, nonce, pubkey and rsakv.

        Weibo upgraded the login script from v1.4.18 to v1.4.19.  The query
        is built by URL concatenation; params= would work equally well.
        """
        pre_url = "http://login.sina.com.cn/sso/prelogin.php?entry=weibo&callback=sinaSSOController.preloginCallBack&su="
        pre_url = pre_url + su + "&rsakt=mod&checkpin=1&client=ssologin.js(v1.4.19)&_="
        pre_url = pre_url + str(int(time.time() * 1000))
        pre_data_res = self.session.get(pre_url, headers=headers)
        # SECURITY NOTE: eval() on server-controlled text is dangerous; the
        # JSONP payload could be parsed with json.loads once the callback
        # wrapper is stripped.  Kept as-is to preserve behaviour.
        sever_data = eval(pre_data_res.content.decode("utf-8").replace("sinaSSOController.preloginCallBack", ''))
        return sever_data

    def get_password(self, servertime, nonce, pubkey):
        """RSA-encrypt the password the way the login JS does."""
        rsaPublickey = int(pubkey, 16)
        key = rsa.PublicKey(rsaPublickey, 65537)  # build the public key
        # BUGFIX: the sina protocol joins the parts with the control
        # characters '\t' and '\n'; the bare letters 't'/'n' in the original
        # were an artifact of backslashes lost in transcription.
        message = str(servertime) + '\t' + str(nonce) + '\n' + str(self.password)
        message = message.encode("utf-8")
        passwd = rsa.encrypt(message, key)  # encrypt
        passwd = binascii.b2a_hex(passwd)   # hex-encode the ciphertext
        return passwd

    def get_cha(self, pcid):
        """Download the captcha image and try to display it with PIL.

        1. With an image viewer installed, os/subprocess could open it too.
        2. This hook is the natural place to plug in a captcha-solving API.
        """
        cha_url = "https://login.sina.com.cn/cgi/pin.php?r="
        cha_url = cha_url + str(int(random.random() * 100000000)) + "&s=0&p="
        cha_url = cha_url + pcid
        cha_page = self.session.get(cha_url, headers=headers)
        # 'with' already closes the file; the original's explicit close()
        # inside the with-block was redundant and has been removed.
        with open("cha.jpg", 'wb') as f:
            f.write(cha_page.content)
        try:
            im = Image.open("cha.jpg")
            im.show()
            im.close()
        except Exception:
            print(u"請到當前目錄下,找到驗證碼後輸入")

    def pre_login(self):
        """Run the pre-login exchange and build the login POST form."""
        su = self.get_su()  # encoded username
        sever_data = self.get_server_data(su)
        servertime = sever_data["servertime"]
        nonce = sever_data['nonce']
        rsakv = sever_data["rsakv"]
        pubkey = sever_data["pubkey"]
        showpin = sever_data["showpin"]  # meaning of this flag is unclear
        password_secret = self.get_password(servertime, nonce, pubkey)

        self.postdata = {
            'entry': 'weibo',
            'gateway': '1',
            'from': '',
            'savestate': '7',
            'useticket': '1',
            'pagerefer': "https://passport.weibo.com",
            'vsnf': '1',
            'su': su,
            'service': 'miniblog',
            'servertime': servertime,
            'nonce': nonce,
            'pwencode': 'rsa2',
            'rsakv': rsakv,
            'sp': password_secret,
            'sr': '1366*768',
            'encoding': 'UTF-8',
            'prelt': '115',
            "cdult": "38",
            'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack',
            'returntype': 'TEXT'  # TEXT or META; exact semantics unexplored
        }
        return sever_data

    def login(self):
        """Perform the login, retrying with a captcha if the first try fails."""
        # First attempt: log in without a captcha.
        try:
            sever_data = self.pre_login()
            login_url = 'https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.19)&_'
            login_url = login_url + str(time.time() * 1000)
            login_page = self.session.post(login_url, data=self.postdata, headers=headers)
            ticket_js = login_page.json()
            ticket = ticket_js["ticket"]
        except Exception:
            # Captcha required: fetch it, show it, ask the user to type it.
            sever_data = self.pre_login()
            login_url = 'https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.19)&_'
            login_url = login_url + str(time.time() * 1000)
            pcid = sever_data["pcid"]
            self.get_cha(pcid)
            self.postdata['door'] = input(u"請輸入驗證碼")
            login_page = self.session.post(login_url, data=self.postdata, headers=headers)
            ticket_js = login_page.json()
            ticket = ticket_js["ticket"]

        # Handle the post-login redirect chain.
        # BUGFIX: pattern needs \d — the transcribed r'==-(d+)-' matched a
        # literal letter 'd' and could never find the savestate number.
        save_pa = r'==-(\d+)-'
        ssosavestate = int(re.findall(save_pa, ticket)[0]) + 3600 * 7
        jump_ticket_params = {
            "callback": "sinaSSOController.callbackLoginStatus",
            "ticket": ticket,
            "ssosavestate": str(ssosavestate),
            "client": "ssologin.js(v1.4.19)",
            "_": str(time.time() * 1000),
        }
        jump_url = "https://passport.weibo.com/wbsso/login"
        jump_headers = {
            "Host": "passport.weibo.com",
            "Referer": "https://weibo.com/",
            "User-Agent": headers["User-Agent"]
        }
        jump_login = self.session.get(jump_url, params=jump_ticket_params, headers=jump_headers)
        uuid = jump_login.text
        uuid_pa = r'"uniqueid":"(.*?)"'
        uuid_res = re.findall(uuid_pa, uuid, re.S)[0]
        web_weibo_url = "http://weibo.com/%s/profile?topnav=1&wvr=6&is_all=1" % uuid_res
        weibo_page = self.session.get(web_weibo_url, headers=headers)

        Mheaders = {
            "Host": "login.sina.com.cn",
            "User-Agent": agent
        }

        # Build the m.weibo.cn login URL.
        _rand = str(time.time())
        mParams = {
            "url": "https://m.weibo.cn/",
            "_rand": _rand,
            "gateway": "1",
            "service": "sinawap",
            "entry": "sinawap",
            "useticket": "1",
            "returntype": "META",
            "sudaref": "",
            "_client_version": "0.6.26",
        }
        murl = "https://login.sina.com.cn/sso/login.php"
        mhtml = self.session.get(murl, params=mParams, headers=Mheaders)
        mhtml.encoding = mhtml.apparent_encoding
        # BUGFIX: the parentheses must be escaped — the transcribed
        # r'replace((.*?));' could never match the JS redirect
        # "replace(<url>);" in the returned page.
        mpa = r'replace\((.*?)\);'
        mres = re.findall(mpa, mhtml.text)

        # The critical redirect hop; if this succeeds we are essentially done.
        # SECURITY NOTE: eval() of page content — mres[0] is a quoted URL
        # string; ast.literal_eval would be the safer choice.
        Mheaders["Host"] = "passport.weibo.cn"
        self.session.get(eval(mres[0]), headers=Mheaders)  # original fetched twice; preserved
        mlogin = self.session.get(eval(mres[0]), headers=Mheaders)

        # After the redirects, verify that m.weibo.cn reports us logged in.
        Mheaders["Host"] = "m.weibo.cn"
        Set_url = "https://m.weibo.cn"
        pro = self.session.get(Set_url, headers=Mheaders)
        pa_login = r'isLogin":true,'
        login_res = re.findall(pa_login, pro.text)

        # The cookies can now be reused via session.cookies; persist them.
        self.session.cookies.save()

5、定義cookie的加載與評論信息的解析

def get_cookies():
    """Load saved cookies from Cookie.txt and return them as a plain dict."""
    cookies = cookielib.LWPCookieJar("Cookie.txt")
    cookies.load(ignore_discard=True, ignore_expires=True)
    # Convert the cookie jar into a {name: value} dict for requests.get().
    cookie_dict = requests.utils.dict_from_cookiejar(cookies)
    return cookie_dict


def info_parser(data):
    """Extract the retained fields from one comment record.

    ``data`` is one element of the ``data`` list returned by the m.weibo.cn
    hotflow endpoint; returns a flat dict ready for the CSV writer.
    """
    # BUGFIX (hygiene): the original unpacked into ``id`` and ``time``,
    # shadowing the builtin and the imported time module inside this scope.
    wid, created, text = data['id'], data['created_at'], data['text']
    user = data['user']
    uid = user['id']
    username = user['screen_name']
    following = user['follow_count']
    followed = user['followers_count']
    gender = user['gender']
    return {
        'wid': wid,
        'time': created,
        'text': text,
        'uid': uid,
        'username': username,
        'following': following,
        'followed': followed,
        'gender': gender,
    }

6、開始爬

def start_crawl(cookie_dict, id):
    """Crawl the hot-comment pages of weibo post ``id``, appending to CSV.

    cookie_dict: {name: value} cookies from get_cookies().
    id: the weibo post id (string); used both as ``id`` and ``mid``.

    Loops indefinitely, paging with ``max_id``; when a page fails to parse
    it bumps ``max_id_type`` and retries, mirroring the original behaviour.
    Nested replies are flattened into the same CSV rows.
    """
    base_url = 'https://m.weibo.cn/comments/hotflow?id={}&mid={}&max_id_type=0'
    next_url = 'https://m.weibo.cn/comments/hotflow?id={}&mid={}&max_id={}&max_id_type={}'
    page = 1
    id_type = 0
    # BUGFIX: max_id was unbound if the very first page failed to parse,
    # producing a NameError when building next_url below.
    max_id = 0
    comment_count = 0
    requests_count = 1
    res = requests.get(url=base_url.format(id, id), headers=headers, cookies=cookie_dict)
    while True:
        print('parse page {}'.format(page))
        page += 1
        try:
            data = res.json()['data']
            wdata = []
            max_id = data['max_id']  # cursor for the next page
            for c in data['data']:
                comment_count += 1
                row = info_parser(c)
                wdata.append(info_parser(c))
                # Flatten nested replies, if any, into the same CSV.
                if c.get('comments', None):
                    temp = []
                    for cc in c.get('comments'):
                        temp.append(info_parser(cc))
                        wdata.append(info_parser(cc))
                        comment_count += 1
                    row['comments'] = temp
                print(row)
            with open('{}/{}.csv'.format(comment_path, id), mode='a+', encoding='utf-8-sig', newline='') as f:
                writer = csv.writer(f)
                for d in wdata:
                    writer.writerow([d['wid'], d['time'], d['text'], d['uid'],
                                     d['username'], d['following'], d['followed'], d['gender']])
            time.sleep(3)  # be polite to the API
        # BUGFIX: was a bare ``except:`` which also swallowed KeyboardInterrupt.
        except Exception:
            print(res.text)
            id_type += 1  # try the next max_id_type on failure
            print('評論總數: {}'.format(comment_count))

        res = requests.get(url=next_url.format(id, id, max_id, id_type),
                           headers=headers, cookies=cookie_dict)
        requests_count += 1
        if requests_count % 50 == 0:
            print(id_type)
        print(res.status_code)

7、主函數

if __name__ == '__main__':
    # Account credentials and crawl target.
    username = "18100000000"    # 用戶名(註冊的手機號)
    password = "123456"         # 密碼
    cookie_path = "Cookie.txt"  # 保存cookie 的文件名稱
    id = '4477416430959369'     # 爬取微博的 id

    # Log in once; the login flow persists cookies to cookie_path.
    WeiboLogin(username, password, cookie_path).login()

    # Start a fresh CSV with a header row before the crawler appends data.
    with open('{}/{}.csv'.format(comment_path, id), mode='w', encoding='utf-8-sig', newline='') as f:
        csv.writer(f).writerow(['wid', 'time', 'text', 'uid', 'username',
                                'following', 'followed', 'gender'])

    start_crawl(get_cookies(), id)

8、獲取id

你需要獲得想要找的微博id,那麼對於小白來說怎麼找id呢?看圖說話

首先找到你想爬的微博,這裡以微博故事為例,在瀏覽器內按下F12,並且點擊評論按鈕

點擊『網絡』,找到一條像圖中的get請求。查看它的參數,mid就是它的id