scrapy爬虫案例——爬取阳光热线问政平台
阳光热线问政平台:http://wz.sun0769.com/political/index/politicsNewest?id=1&page=1
爬取最新问政帖子的编号、投诉标题、投诉内容以及处理状态,并存入到Mongodb数据库中。
1、创建项目
scrapy startproject myProject
2、定义Item
items.py
import scrapy


class MyprojectItem(scrapy.Item):
    """Container for one politics post scraped from the site."""

    number = scrapy.Field()   # post number
    title = scrapy.Field()    # complaint title
    content = scrapy.Field()  # complaint body text
    status = scrapy.Field()   # handling status
3、编写爬虫,提取item
首先使用如下命令生成爬虫文件sun.py
scrapy genspider sun wz.sun0769.com
spiders/sun.py
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from myProject.items import MyprojectItem


class SunSpider(CrawlSpider):
    """Crawl the newest politics posts and yield one item per post page."""

    name = 'sun'
    allowed_domains = ['wz.sun0769.com']
    # BUG FIX: the original scheme-relative URL ('//wz.sun0769.com/...')
    # is not a valid start URL — Scrapy requires an explicit scheme.
    start_urls = ['http://wz.sun0769.com/political/index/politicsNewest?id=1&page=1']
    rules = (
        # Pagination links: no callback, so follow defaults to True.
        Rule(LinkExtractor(allow=r'id=\d+&page=\d+')),
        # Individual post links: handled by parse_item, not followed further
        # (follow defaults to False when a callback is given).
        Rule(LinkExtractor(allow=r'politics/index\?id=\d+'), callback='parse_item'),
    )

    def parse_item(self, response):
        """Extract title, status, number and content from one post page.

        :param response: the post detail page response
        :yields: a populated :class:`MyprojectItem`
        """
        item = MyprojectItem()
        # Complaint title.
        item['title'] = response.xpath(
            '//div[@class="mr-three"]/p[@class="focus-details"]/text()'
        ).extract()[0]
        # Handling status: the second whitespace-separated token of span[3].
        item['status'] = response.xpath(
            '//div[@class="focus-date clear focus-date-list"]/span[3]/text()'
        ).extract()[0].split()[1]
        # Post number: the text after the colon in span[4].
        item['number'] = response.xpath(
            '//div[@class="focus-date clear focus-date-list"]/span[4]/text()'
        ).extract()[0].split(':')[-1]
        # Complaint body text.
        item['content'] = response.xpath(
            '//div[@class="details-box"]/pre/text()'
        ).extract()[0]
        yield item
4、编写中间件文件middlewares.py中的下载中间件
给每个请求随机选择一个User-Agent,并且随机选择一个代理ip,另外写一个脚本定时获取代理ip和port,并存入redis数据库(比如使用芝麻ip)
import json
import random

import redis

from myProject.settings import USER_AGENTS


class MyprojectDownloaderMiddleware:
    """Downloader middleware that rotates User-Agent and proxy per request.

    Proxies are read from redis key ``proxy_list``: a JSON list of
    ``{'ip': ..., 'port': ...}`` dicts, refreshed by an external script
    (e.g. fed from a paid proxy provider).
    """

    def __init__(self):
        # Redis client used to fetch the current proxy pool.
        # NOTE(review): default host/port/db — confirm against deployment.
        self.r = redis.StrictRedis()

    def process_request(self, request, spider):
        """Attach a random User-Agent and proxy to every outgoing request."""
        proxy_list = json.loads(self.r.get('proxy_list').decode())
        proxy = random.choice(proxy_list)
        # Random User-Agent as a basic anti-bot measure.
        request.headers['User-Agent'] = random.choice(USER_AGENTS)
        # BUG FIX: the proxy URL needs an explicit scheme; the original
        # scheme-relative '//ip:port' form is not a valid proxy URL.
        request.meta['proxy'] = 'http://' + proxy['ip'] + ':' + str(proxy['port'])
        # Returning None lets the request continue through the middleware chain.
        return None
5、存储数据
编写pipelines.py文件
import pymongo


class MyprojectPipeline:
    """Persist scraped items into the local MongoDB ``test.sunInfo`` collection."""

    def open_spider(self, spider):
        # One client per spider run; opened here, closed in close_spider.
        self.client = pymongo.MongoClient('mongodb://localhost:27017')
        self.db = self.client.test          # 'test' database
        self.collection = self.db.sunInfo   # 'sunInfo' collection

    def process_item(self, item, spider):
        # Store the item as a plain dict and pass it on unchanged.
        self.collection.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()
6、编写settings.py
BOT_NAME = 'myProject'

SPIDER_MODULES = ['myProject.spiders']
NEWSPIDER_MODULE = 'myProject.spiders'

# The target site's robots.txt is ignored on purpose for this tutorial.
ROBOTSTXT_OBEY = False
COOKIES_ENABLED = False

# Throttle requests to reduce server load and lower the ban risk.
DOWNLOAD_DELAY = 1.0

# Downloader middleware: rotates User-Agent and proxy per request.
DOWNLOADER_MIDDLEWARES = {
    'myProject.middlewares.MyprojectDownloaderMiddleware': 543,
}

# Item pipeline: writes items to MongoDB.
ITEM_PIPELINES = {
    'myProject.pipelines.MyprojectPipeline': 300,
}

# User-Agent pool for random rotation; extend with more entries as needed.
USER_AGENTS = [
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
]
7、开启爬虫
scrapy crawl sun