scrapy-redis 分散式嗶哩嗶哩

  • 2020 年 1 月 16 日
  • 筆記

scrapy裡面,對每次請求的url都有一個指紋,這個指紋就是用來判斷url是否被請求過的,默認開啟指紋過濾,即同一個URL只請求一次。如果我們使用分散式在多台機器上爬取數據,為了讓爬蟲的數據不重複,我們同樣需要一個共享的指紋。但是scrapy默認的指紋是保存在本地的,所以我們可以改用redis來保存指紋,並且用redis裡面的set集合來判斷url是否重複。

settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for the bilibili project.
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'bilibili'

SPIDER_MODULES = ['bilibili.spiders']
NEWSPIDER_MODULE = 'bilibili.spiders'

# Obey robots.txt rules (left disabled so the API endpoints are reachable)
# ROBOTSTXT_OBEY = True

# Throttle to one second between requests to the same site to stay polite.
DOWNLOAD_DELAY = 1

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'bilibili.middlewares.BilibiliDownloaderMiddleware': 543,
    # Random User-Agent pool; lower number = runs earlier in the chain.
    'bilibili.middlewares.randomUserAgentMiddleware': 400,
}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'bilibili.pipelines.BilibiliPipeline': 300,
    # FIX: both pipelines previously shared priority 300, which leaves their
    # relative execution order undefined; the redis pipeline now runs after
    # the local one.
    'scrapy_redis.pipelines.RedisPipeline': 400,
}

# --- scrapy-redis: shared scheduler + duplicate filter backed by redis ----
SCHEDULER = 'scrapy_redis.scheduler.Scheduler'
DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
REDIS_URL = 'redis://@127.0.0.1:6379'
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'

  spider.py

# -*- coding: utf-8 -*-
import json
import re

import scrapy

from bilibili.items import BilibiliItem

# Extracts the JSON payload out of a JSONP response such as "__jp3({...})".
_JSONP_RE = re.compile(r'({.*})')

# Chinese category name (as returned by the bilibili API) -> item field name.
# Replaces the original 14-branch if/elif ladder with a single lookup table.
_CATEGORY_FIELDS = {
    '動畫': 'animation',
    '生活': 'Life',
    '音樂': 'Music',
    '遊戲': 'Game',
    '舞蹈': 'Dance',
    '紀錄片': 'Documentary',
    '鬼畜': 'Ghost',
    '科技': 'science',
    '番劇': 'Opera',
    '娛樂': 'entertainment',
    '影視': 'Movies',
    '國創': 'National',
    '數碼': 'Digital',
    '時尚': 'fashion',
}


class BilibiliappSpider(scrapy.Spider):
    """Crawl bilibili member data by chaining three API calls per member id:

    1. relation/stat   -> follower / following counts (JSONP)
    2. getSubmitVideos -> upload category names
    3. acc/info        -> profile (name, sex, level, birthday)

    The results are combined into a single BilibiliItem per member.
    """

    name = 'bilibiliapp'

    def start_requests(self):
        """Seed one relation-stat request per member id in 1..299.

        The 'referer' header must point at the member's space page,
        otherwise the API rejects the call.
        """
        for mid in range(1, 300):
            url = ('https://api.bilibili.com/x/relation/stat'
                   '?vmid={}&jsonp=jsonp&callback=__jp3'.format(mid))
            request = scrapy.Request(url=url, callback=self.parse,
                                     meta={'id': mid})
            request.headers['referer'] = 'https://space.bilibili.com/{}/'.format(mid)
            yield request

    def parse(self, response):
        """Parse follower/following counts, then request the upload list."""
        # Strip the JSONP wrapper before decoding the JSON body.
        payload = json.loads(_JSONP_RE.findall(response.text)[0])
        member_id = response.meta.get('id')
        url = ('https://space.bilibili.com/ajax/member/getSubmitVideos'
               '?mid={}&page=1&pagesize=25'.format(member_id))
        yield scrapy.Request(url=url, callback=self.getsubmit, meta={
            'id': member_id,
            'follower': payload['data']['follower'],
            'following': payload['data']['following'],
        })

    def getsubmit(self, response):
        """Collect the member's upload category names, then request profile."""
        data = json.loads(response.text)
        # 'tlist' is a dict of category records when the member has uploads
        # and an empty list otherwise (API quirk) — truthiness covers both.
        tlist = data['data']['tlist']
        if tlist:
            category_names = [record['name'] for record in tlist.values()]
        else:
            category_names = ['無愛好']
        member_id = response.meta.get('id')
        url = ('https://api.bilibili.com/x/space/acc/info'
               '?mid={}&jsonp=jsonp'.format(member_id))
        yield scrapy.Request(url=url, callback=self.space, meta={
            'id': member_id,
            'follower': response.meta.get('follower'),
            'following': response.meta.get('following'),
            'tlist_list': category_names,
        })

    def space(self, response):
        """Assemble and yield the final BilibiliItem from the profile data."""
        data = json.loads(response.text)
        item = BilibiliItem()
        item['name'] = data['data']['name']
        item['sex'] = data['data']['sex']
        item['level'] = data['data']['level']
        item['birthday'] = data['data']['birthday']
        item['follower'] = response.meta.get('follower')
        item['following'] = response.meta.get('following')
        # Default every category flag to 0, then set the ones present to 1.
        for field in _CATEGORY_FIELDS.values():
            item[field] = 0
        for category in response.meta.get('tlist_list'):
            field = _CATEGORY_FIELDS.get(category)
            if field is not None:
                item[field] = 1
        yield item

設置UA池(middlewares.py,隨機User-Agent下載中間件)

from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
import random


class randomUserAgentMiddleware(UserAgentMiddleware):
    """Downloader middleware that attaches a random User-Agent per request."""

    # Pool of desktop browser User-Agent strings.
    # FIX: the original list was missing the comma after the first entry,
    # so the first two strings were silently concatenated into one invalid
    # User-Agent value.
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    ]

    def __init__(self, user_agent=''):
        # Keep the parent class initialised so its machinery stays intact
        # (the original override skipped the super() call entirely).
        super(randomUserAgentMiddleware, self).__init__(user_agent)
        self.user_agent = user_agent

    def process_request(self, request, spider):
        """Set a randomly chosen UA, unless the request already carries one."""
        ua = random.choice(self.user_agent_list)
        if ua:
            # setdefault keeps any User-Agent explicitly set by the spider.
            request.headers.setdefault('User-Agent', ua)

git地址:https://github.com/18370652038/scrapy-bilibili