Crawling Lagou.com with the Scrapy Framework
- October 6, 2019
- Notes
0. Preface
1. Creating the Project
2. Spider + Selenium
3. Data Storage
4. Author's Note
0. Preface
It's been a while since I last wrote a crawler, so let's study a crawler framework and some database operations! After reading this article, you will know how to use all of the following:
- the Scrapy framework
- BeautifulSoup
- lxml
- selenium
- pyecharts
- pymysql
1. Creating the Project
```bash
scrapy startproject CQJob
cd CQJob
scrapy genspider cqjobs lagou.com
```
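Note: `genspider` must be run inside the project directory and also expects a start domain, supplied here as lagou.com. After both commands, Scrapy generates its standard project skeleton:

```text
CQJob/
    scrapy.cfg
    CQJob/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            cqjobs.py    # generated by genspider
```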
2. Spider + Selenium
Configuring start_urls
```python
start_urls = ['https://www.lagou.com/jobs/list_?px=default&hy=%E5%8C%BB%E7%96%97%E5%81%A5%E5%BA%B7&city=%E9%87%8D%E5%BA%86']  # configure the start URL
```
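The query parameters are URL-encoded Chinese; decoding them with the standard library shows what this crawl is actually filtering on:

```python
from urllib.parse import unquote

print(unquote('%E5%8C%BB%E7%96%97%E5%81%A5%E5%BA%B7'))  # hy   -> 医疗健康 (healthcare industry)
print(unquote('%E9%87%8D%E5%BA%86'))                    # city -> 重庆 (Chongqing)
```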
Configuring chromedriver
```python
# Start chromedriver
browser = webdriver.Chrome()
browser.get(self.start_urls[0])
browser.implicitly_wait(10)

# # Dump the raw HTML for debugging
# f = open('./wr.txt', 'w', encoding='utf8')
# raw_html = browser.page_source
# f.write(raw_html)
# f.close()

# ...... multi-page handling with BeautifulSoup and XPath (next section) ......

browser.close()  # close the browser
```
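If chromedriver is not on your PATH, `webdriver.Chrome()` will fail to launch. On the Selenium 3.x presumably used when this was written, you can point it at the binary directly (the path below is only an example):

```python
from selenium import webdriver

# Example path only; use wherever your chromedriver actually lives
browser = webdriver.Chrome(executable_path='/usr/local/bin/chromedriver')
```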
Handling multiple pages with BeautifulSoup and XPath
```python
# Locate the pager with BeautifulSoup. The "next page" span carries
# class "pager_next" while more pages remain, and
# "pager_next pager_next_disabled" on the last page.
for i in range(11):
    selector = etree.HTML(browser.page_source)  # parse the page source with lxml
    soup = BeautifulSoup(browser.page_source, features='lxml')
    # a = soup.find_all("div", class_="pager_container")
    span = soup.find("div", attrs={"class": "pager_container"}).find("span", attrs={"action": "next"})
    # f = open('./new.txt', 'w', encoding='utf8')
    # f.write(str(span))
    # f.close()
    classSpan = span['class']
    print('----------------------------------------------')
    print(classSpan)  # e.g. -> ['pager_next', 'pager_next_disabled'] on the last page
    next_flag = "pager_next_disabled" in classSpan  # safer than classSpan[1], which can raise IndexError
    self.parsedata(selector)
    if next_flag:
        print("Reached the last page; crawl finished")
        break
    else:
        print("More pages to go; continuing")
        # Careful: don't just copy the XPath from the page source,
        # because the element tags can differ between pages!
        browser.find_element_by_xpath('//*[@id="s_position_list"]/div[2]/div/span[@action="next"]').click()  # click "next page"
        time.sleep(5)
    print('Page {} scraped'.format(i + 1))
```
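A caveat if you rerun this today: the `find_element_by_xpath` helper was removed in Selenium 4, so on a current Selenium the next-page click would be written as:

```python
from selenium.webdriver.common.by import By

browser.find_element(By.XPATH, '//*[@id="s_position_list"]/div[2]/div/span[@action="next"]').click()
```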
Defining and packing the data
items.py
```python
class CqjobItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()          # job title
    company = scrapy.Field()       # company name
    location = scrapy.Field()      # company location
    welfare = scrapy.Field()       # benefits
    salaryMin = scrapy.Field()     # salary lower bound
    salaryMax = scrapy.Field()     # salary upper bound
    salaryMid = scrapy.Field()     # average salary
    experience = scrapy.Field()    # work experience
    education = scrapy.Field()     # education level
    companyType = scrapy.Field()   # company type
    companyLevel = scrapy.Field()  # company level (funding stage)
    companySize = scrapy.Field()   # company headcount
```
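A Scrapy Item behaves like a dict restricted to its declared fields, which is why the spider below can simply assign whole lists to each key:

```python
items = CqjobItem()
items['name'] = ['数据分析师', '算法工程师']  # fine: 'name' is a declared field
print(dict(items))                            # -> {'name': ['数据分析师', '算法工程师']}
# items['foo'] = 1  # would raise KeyError: CqjobItem does not support field: foo
```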
cqjobs.py (under spiders/)
```python
# Accumulator lists (class attributes of the spider), filled page by page
# by parsedata() below:
name_list = []
location_list = []
company_list = []
welfare_list = []
salaryMin_list = []
salaryMax_list = []
salaryMid_list = []
experience_list = []
education_list = []
companyType_list = []
companyLevel_list = []
companySize_list = []

# After all pages are crawled, pack the lists into a single item:
items = CqjobItem()
items['name'] = self.name_list
items['company'] = self.company_list
items['location'] = self.location_list
items['welfare'] = self.welfare_list
items['salaryMin'] = self.salaryMin_list
items['salaryMax'] = self.salaryMax_list
items['salaryMid'] = self.salaryMid_list
items['experience'] = self.experience_list
items['education'] = self.education_list
items['companyType'] = self.companyType_list
items['companyLevel'] = self.companyLevel_list
items['companySize'] = self.companySize_list
print(items)
```
Extracting specific fields with XPath
```python
def parsedata(self, selector):
    sel_list = selector.xpath('//*[@id="s_position_list"]/ul/li')
    for item in sel_list:
        name = item.xpath('div[1]/div[1]/div[1]/a/h3/text()')[0]
        self.name_list.append(name)
        location = item.xpath('div[1]/div[1]/div[1]/a/span/em/text()')[0]
        self.location_list.append(location)
        company = item.xpath('div[1]/div[2]/div[1]/a/text()')[0]
        self.company_list.append(company)
        welfare = item.xpath('div[2]/div[2]/text()')[0]
        self.welfare_list.append(welfare)
        salaryList = item.xpath('div[1]/div[1]/div[2]/div/span/text()')[0].strip().split("-")
        # print(salaryList)  # e.g. ['10k', '15k']
        salaryMin = salaryList[0][:len(salaryList[0]) - 1]  # '10': strip the trailing 'k', keep the number
        self.salaryMin_list.append(salaryMin)
        salaryMax = salaryList[1][:len(salaryList[1]) - 1]  # '15'
        self.salaryMax_list.append(salaryMax)
        salaryMid = (int(salaryMin) + int(salaryMax)) / 2
        self.salaryMid_list.append(salaryMid)
        educationArray = item.xpath('div[1]/div[1]/div[2]/div//text()')[3].strip().split("/")
        # print(educationArray)
        experience = educationArray[0].strip()
        self.experience_list.append(experience)
        education = educationArray[1].strip()
        self.education_list.append(education)
        companyMsgList = item.xpath('div[1]/div[2]/div[2]/text()')[0].strip().split("/")
        companyType = companyMsgList[0].strip()
        self.companyType_list.append(companyType)
        companyLevel = companyMsgList[1].strip()
        self.companyLevel_list.append(companyLevel)
        companySize = companyMsgList[2].strip()
        self.companySize_list.append(companySize)
```
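As a quick sanity check of the salary parsing above: a Lagou salary string like '10k-15k' splits into its two bounds, and the midpoint lands between them:

```python
salary = '10k-15k'
salaryList = salary.strip().split('-')   # ['10k', '15k']
salaryMin = salaryList[0][:-1]           # '10' (drop the trailing 'k')
salaryMax = salaryList[1][:-1]           # '15'
print(salaryMin, salaryMax, (int(salaryMin) + int(salaryMax)) / 2)  # 10 15 12.5
```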
Data analysis
```python
# Convert a {name: count} dict into the TreeMap data format
def getTreeData(self, treedata):
    treemap_data = []
    for key in treedata:
        if key != '重庆':  # skip the bare city name '重庆'
            treemap_data.append({"value": treedata[key], "name": key})
    return treemap_data

# Location analysis
def LocAnalysis(self, items):
    loca_data = items['location']
    list_data = set(loca_data)
    treemap_data = {}
    for item in list_data:
        treemap_data[item] = loca_data.count(item)
    print(treemap_data)
    data = self.getTreeData(treemap_data)  # convert to TreeMap data format
    print(data)
    treemap = TreeMap("重庆医疗健康位置分布图", width=1200, height=600, title_pos="center")
    treemap.add("位置数据", data, is_label_show=True, label_pos='inside',
                label_text_color='#000', is_legend_show=False)
    treemap.render()

# Education analysis
def educaAnalysis(self, items):
    educa_data = items['education']
    educalist_data = set(educa_data)
    print(educalist_data)
    edupie_list = []
    edupie_data = []
    for item in educalist_data:
        edupie_list.append(item)
        edupie_data.append(educa_data.count(item))
    print(edupie_list)
    print(edupie_data)
    pie = Pie("重庆医疗健康招聘学历要求", title_pos='center')
    pie.add(
        "学历",
        edupie_list,
        edupie_data,
        center=[50, 50],
        is_random=True,
        radius=[30, 75],
        rosetype="area",
        is_legend_show=False,
        is_label_show=True,
    )
    pie.render()

# Company analysis
def CompanyAnalysis(self, items):
    loca_data = items['company']
    list_data = set(loca_data)
    treemap_data = {}
    for item in list_data:
        treemap_data[item] = loca_data.count(item)
    print(treemap_data)
    data = self.getTreeData(treemap_data)  # convert to TreeMap data format
    print(data)
    treemap = TreeMap("重庆医疗相关公司分布图", width=1500, height=900, title_pos="center")
    treemap.add("公司数据", data, is_label_show=True, label_pos='inside',
                label_text_color='#000', is_legend_show=False)
    treemap.render()

# Salary analysis
def SalaryAnalysis(self, items):
    axis_data = items['name']
    print(axis_data)
    ayis_data = items['salaryMid']
    print(ayis_data)
    bar = Bar("重庆医疗职位平均工资图", width=1500, height=450, title_pos="center")
    bar.add("工资数据", axis_data, ayis_data, mark_line=["average"],
            mark_point=["max", "min"], legend_pos='right', is_xaxis_show=False)
    bar.render()

def SalaryTreeAnalysis(self, items):
    salary_name = items['name']
    salary_data = items['salaryMid']
    # Map each job title to its average salary
    treemap_data = {}
    for name, salary in zip(salary_name, salary_data):
        treemap_data[name] = salary
    print(treemap_data)
    data = self.getTreeData(treemap_data)  # convert to TreeMap data format
    print(data)
    treemap = TreeMap("重庆医疗职位工资分布图", width=1500, height=900, title_pos="center")
    treemap.add("职位数据", data, is_label_show=True, label_pos='inside',
                label_text_color='#000', is_legend_show=False)
    treemap.render()
```
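Note that the chart code above uses the pyecharts 0.5.x API. On pyecharts 1.x the API changed completely; the education pie, for example, would look roughly like this (a sketch reusing `edupie_list`/`edupie_data` from above, not the code used in this post):

```python
from pyecharts import options as opts
from pyecharts.charts import Pie

pie = (
    Pie()
    .add("学历", [list(z) for z in zip(edupie_list, edupie_data)],
         radius=["30%", "75%"], rosetype="area")
    .set_global_opts(title_opts=opts.TitleOpts(title="重庆医疗健康招聘学历要求", pos_left="center"))
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
)
pie.render()
```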
Calling the analysis functions
```python
self.LocAnalysis(items)
self.educaAnalysis(items)
self.CompanyAnalysis(items)
self.SalaryAnalysis(items)
self.SalaryTreeAnalysis(items)
```
3. Data Storage
settings.py
```python
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

ITEM_PIPELINES = {
    'CQJob.pipelines.CqjobPipeline': 300,
}
```
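Since `ROBOTSTXT_OBEY` is turned off here, it's worth throttling the crawl yourself. Scrapy has a built-in setting for that (the value below is just a suggestion):

```python
# Optional: wait a few seconds between requests to avoid hammering the site
DOWNLOAD_DELAY = 3
```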
pipelines.py
```python
import pymysql


class CqjobPipeline(object):
    def process_item(self, item, spider):
        '''Save the scraped items to MySQL.'''
        connection = pymysql.connect(host='localhost', user='root', password='xxxx',
                                     db='scrapydb', charset='utf8mb4')
        try:
            with connection.cursor() as cursor:
                for i in range(len(item['name'])):
                    sql = ("insert into `cqjobs`(`name`,`company`,`location`,`welfare`,"
                           "`salaryMin`,`salaryMax`,`salaryMid`,`experience`,`education`,"
                           "`companyType`,`companyLevel`,`companySize`)"
                           "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
                    cursor.execute(sql, (
                        item['name'][i], item['company'][i], item['location'][i],
                        item['welfare'][i], item['salaryMin'][i], item['salaryMax'][i],
                        item['salaryMid'][i], item['experience'][i], item['education'][i],
                        item['companyType'][i], item['companyLevel'][i], item['companySize'][i]))
                connection.commit()
        # except pymysql.err.IntegrityError:
        #     print('Duplicate row, not inserting again!')
        finally:
            connection.close()
        return item
```
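The pipeline assumes a `cqjobs` table already exists in `scrapydb`. The post doesn't show the schema, but a minimal sketch matching the INSERT statement might look like this (the column types are my assumptions, executed through the same pymysql connection):

```python
import pymysql

# Sketch of a schema matching the INSERT above; column types are assumptions,
# not taken from the original post.
DDL = """
CREATE TABLE IF NOT EXISTS `cqjobs` (
  `id` INT AUTO_INCREMENT PRIMARY KEY,
  `name` VARCHAR(100), `company` VARCHAR(100), `location` VARCHAR(50),
  `welfare` VARCHAR(255), `salaryMin` INT, `salaryMax` INT, `salaryMid` FLOAT,
  `experience` VARCHAR(50), `education` VARCHAR(50), `companyType` VARCHAR(50),
  `companyLevel` VARCHAR(50), `companySize` VARCHAR(50)
) DEFAULT CHARSET = utf8mb4
"""

connection = pymysql.connect(host='localhost', user='root', password='xxxx',
                             db='scrapydb', charset='utf8mb4')
try:
    with connection.cursor() as cursor:
        cursor.execute(DDL)
    connection.commit()
finally:
    connection.close()
```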

4. Author's Note
Finally, if you find this official account helpful, your support and shares are very welcome. Thank you! For more content, follow the crawler series on this account!