selenium爬取拉勾网python职位信息
- 2019 年 11 月 20 日
- 笔记
直接上代码，主要爬取的是广州的 Python 职位信息。
from selenium import webdriver
import time
from lxml import etree
import re
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import csv


class LagouSpider(object):
    """Scrape Python job listings (Guangzhou, city 213) from lagou.com with Selenium.

    Walks the paginated list, opens each job's detail page in a second tab,
    extracts the fields, and finally dumps everything to ``positons.csv``.
    """

    # Path to the ChromeDriver binary.
    # NOTE(review): original source had lost its backslashes ("H:pythonchromedriver.exe");
    # restored as a raw string — confirm against the actual install location.
    driver_path = r"H:\python\chromedriver.exe"

    chromeOptions = webdriver.ChromeOptions()
    # Hide the "Chrome is being controlled by automated test software" banner,
    # which lagou.com uses to detect automation.
    chromeOptions.add_experimental_option('excludeSwitches', ['enable-automation'])
    # Route all traffic through an HTTP proxy (no trailing space — Chrome would
    # otherwise treat it as part of the proxy host).
    chromeOptions.add_argument("--proxy-server=http://47.100.7.167:8989")

    def __init__(self):
        self.driver = webdriver.Chrome(
            chrome_options=LagouSpider.chromeOptions,
            executable_path=LagouSpider.driver_path,
        )
        # Listing URL: python keyword, city 213 (Guangzhou), default ordering.
        self.url = 'https://www.lagou.com/jobs/list_python/p-city_213?px=default#filterBox'
        self.positions = []

    def run(self):
        """Drive the whole crawl: page through listings until the last page."""
        self.driver.get(self.url)
        while True:
            source = self.driver.page_source
            self.parse_list_page(source)
            # The "next" button gains the pager_next_disabled class on the last page.
            if re.search(r'action="next" class="pager_next pager_next_disabled"',
                         self.driver.page_source):
                break
            self.next_page()          # click through to the next listing page
        self.driver.quit()            # shut the browser down
        self.write_to_csv()           # persist everything we collected

    def parse_list_page(self, source):
        """Extract detail-page links from one listing page and scrape each one.

        Opens a second browser tab for the detail pages so the listing tab
        keeps its place, then closes it and switches back when done.
        """
        html = etree.HTML(source)
        links = html.xpath("//a[@class='position_link']/@href")
        self.driver.execute_script("window.open()")                       # open a new tab
        self.driver.switch_to.window(self.driver.window_handles[1])       # work in the new tab
        for url in links:
            self.driver.get(url)
            detail = etree.HTML(self.driver.page_source)
            title = detail.xpath('//h4[@class="company"]/text()')[0]
            job_request_span = detail.xpath('//dd[@class ="job_request"]/h3/span/text()')
            # Spans arrive as e.g. "15k-25k /" — strip whitespace and slashes.
            # (Original used r"[s/]", which deleted the letter "s" instead of whitespace.)
            clean = lambda s: re.sub(r"[\s/]", "", s)
            salary = clean(job_request_span[0])
            city = clean(job_request_span[1])
            work_year = clean(job_request_span[2])
            education = clean(job_request_span[3])
            desc = ''.join(detail.xpath('//dd[@class="job_bt"]//text()')).strip()
            company = detail.xpath('//em[@class="fl-cn"]/text()')[0].strip()
            position = {
                'title': title,
                'company': company,
                'salary': salary,
                'city': city,
                'work_year': work_year,
                'education': education,
                'desc': desc,
            }
            self.positions.append(position)
            print(position)
            time.sleep(5)             # throttle so lagou's anti-bot doesn't trip
        self.driver.close()                                               # close the detail tab
        self.driver.switch_to.window(self.driver.window_handles[0])       # back to the listing tab

    def next_page(self):
        """Click the "next page" pager button once it is present in the DOM."""
        element = WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "pager_next")))
        element.click()
        time.sleep(1)

    def write_to_csv(self):
        """Write all collected positions to positons.csv (UTF-8).

        The header must list every key in the position dicts: the original
        omitted 'city', which makes DictWriter.writerows raise ValueError.
        """
        header = ['title', 'company', 'city', 'desc', 'salary', 'work_year', 'education']
        with open('positons.csv', 'w', newline='', encoding='utf-8') as fp:
            writer = csv.DictWriter(fp, header)
            writer.writeheader()
            writer.writerows(self.positions)


if __name__ == '__main__':
    spider = LagouSpider()
    spider.run()