Scrapy crawler example: scraping the Sunshine Hotline Q&A platform (陽光熱線問政平台)
Sunshine Hotline Q&A platform: http://wz.sun0769.com/political/index/politicsNewest?id=1&page=1
Scrape the post number, complaint title, complaint content, and processing status of the newest posts, and store them in a MongoDB database.
1. Create the project
scrapy startproject myProject
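This generates a project skeleton roughly like the following; the later steps edit items.py, middlewares.py, pipelines.py, and settings.py inside it:

myProject/
    scrapy.cfg
    myProject/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py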
2. Define the Item
items.py
import scrapy

class MyprojectItem(scrapy.Item):
    number = scrapy.Field()   # post number
    title = scrapy.Field()    # post title
    content = scrapy.Field()  # post content
    status = scrapy.Field()   # processing status
3. Write the spider and extract items
First, generate the spider file sun.py with the following command:
scrapy genspider sun wz.sun0769.com
spiders/sun.py
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from myProject.items import MyprojectItem

class SunSpider(CrawlSpider):
    name = 'sun'
    allowed_domains = ['wz.sun0769.com']
    start_urls = ['http://wz.sun0769.com/political/index/politicsNewest?id=1&page=1']

    rules = (
        # Pagination links; follow defaults to True when no callback is given
        Rule(LinkExtractor(allow=r'id=\d+&page=\d+')),
        # Individual post links; follow defaults to False when a callback is given
        Rule(LinkExtractor(allow=r'politics/index\?id=\d+'), callback='parse_item'),
    )

    def parse_item(self, response):
        item = MyprojectItem()
        title = response.xpath('//div[@class="mr-three"]/p[@class="focus-details"]/text()').extract()[0]  # complaint title
        status = response.xpath('//div[@class="focus-date clear focus-date-list"]/span[3]/text()').extract()[0].split()[1]  # processing status
        number = response.xpath('//div[@class="focus-date clear focus-date-list"]/span[4]/text()').extract()[0].split(':')[-1]  # post number
        content = response.xpath('//div[@class="details-box"]/pre/text()').extract()[0]  # complaint content
        item['title'] = title
        item['status'] = status
        item['number'] = number
        item['content'] = content
        yield item
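The XPath expressions above can be checked interactively before running the full crawl. Below is a quick sanity check with scrapy shell; the id value is a made-up placeholder, so substitute one copied from a real post link on the listing page:

scrapy shell "http://wz.sun0769.com/political/politics/index?id=12345"
>>> response.xpath('//div[@class="mr-three"]/p[@class="focus-details"]/text()').extract()[0]  # complaint title
>>> response.xpath('//div[@class="details-box"]/pre/text()').extract()[0]  # complaint content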
4. Write the downloader middleware in middlewares.py
Give each request a randomly chosen User-Agent and a randomly chosen proxy IP. A separate script runs on a schedule to fetch proxy IPs and ports and store them in a Redis database (for example from a paid provider such as Zhima IP (芝麻ip)); a sketch of that script follows the middleware code below.
import random
import json

import redis

from myProject.settings import USER_AGENTS

class MyprojectDownloaderMiddleware:
    def __init__(self):
        super().__init__()
        # Redis client used to read the stored proxy IPs and ports
        self.r = redis.StrictRedis()

    def process_request(self, request, spider):
        proxy_list = json.loads(self.r.get('proxy_list').decode())
        proxy = random.choice(proxy_list)
        request.headers['User-Agent'] = random.choice(USER_AGENTS)  # random User-Agent
        request.meta['proxy'] = 'http://' + proxy['ip'] + ':' + str(proxy['port'])  # random proxy IP and port
        return None
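The middleware expects a proxy_list key in Redis holding a JSON array of {"ip": ..., "port": ...} objects. Here is a minimal sketch of the separate refresh script mentioned above; the API URL and the response format are assumptions, since each proxy vendor (including Zhima IP) defines its own:

import json
import time

import redis
import requests

# Placeholder endpoint: substitute the extraction URL your proxy vendor gives you
PROXY_API = 'http://your-proxy-vendor.example.com/api/get?count=20&format=json'

def refresh_proxies():
    r = redis.StrictRedis()
    while True:
        resp = requests.get(PROXY_API, timeout=10)
        # Assumed vendor response format: [{"ip": "1.2.3.4", "port": 8080}, ...]
        proxy_list = resp.json()
        r.set('proxy_list', json.dumps(proxy_list))  # same key the middleware reads
        time.sleep(600)  # refresh every 10 minutes, before the proxies expire

if __name__ == '__main__':
    refresh_proxies()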
5. Store the data
Write pipelines.py:
import pymongo

class MyprojectPipeline:
    def open_spider(self, spider):
        # Connect to the local MongoDB instance
        self.client = pymongo.MongoClient('mongodb://localhost:27017')
        self.db = self.client.test         # use the test database
        self.collection = self.db.sunInfo  # store documents in the sunInfo collection

    def process_item(self, item, spider):
        self.collection.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()
6. Edit settings.py
BOT_NAME = 'myProject'

SPIDER_MODULES = ['myProject.spiders']
NEWSPIDER_MODULE = 'myProject.spiders'

ROBOTSTXT_OBEY = False
COOKIES_ENABLED = False
DOWNLOAD_DELAY = 1.0  # throttle requests to ease server load and lower the risk of a ban

# Enable the downloader middleware
DOWNLOADER_MIDDLEWARES = {
    'myProject.middlewares.MyprojectDownloaderMiddleware': 543,
}

# Enable the item pipeline
ITEM_PIPELINES = {
    'myProject.pipelines.MyprojectPipeline': 300,
}

# Candidate User-Agents (more can be found online); random switching helps
# evade basic anti-crawling checks
USER_AGENTS = [
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5"
]
7. Run the crawler
scrapy crawl sun
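After the crawl finishes, the stored documents can be spot-checked from the mongo shell (assuming the local instance and the test/sunInfo names used in the pipeline):

$ mongo
> use test
> db.sunInfo.find().limit(3).pretty()  # sample a few stored posts
> db.sunInfo.count()                   # total number of posts saved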