selenium爬取拉勾網python職位信息

  • 2019 年 11 月 20 日
  • 筆記

直接上代碼,主要爬取的是廣州的python職位信息

from selenium import webdriver
import time
from lxml import etree
import re
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import xlwt
import csv


class LagouSpider(object):
    """Scrape Python job listings for Guangzhou (p-city_213) from lagou.com.

    Walks every page of the search results, opens each job's detail page in a
    second browser tab, extracts the fields with lxml XPath, and finally dumps
    all collected positions to a CSV file.
    """

    # NOTE(review): the path separators were lost in the original
    # ("H:pythonchromedriver.exe"); restored the most likely intent --
    # confirm the real chromedriver location before running.
    driver_path = r"H:\python\chromedriver.exe"

    chromeOptions = webdriver.ChromeOptions()
    # NOTE(review): 'service_args' is a PhantomJS concept; passing it through
    # add_argument() has no effect on Chrome. Kept (dashes normalized) only
    # for fidelity with the original script.
    chromeOptions.add_argument(
        "service_args=['--ignore-ssl-errors=true', '--ssl-protocol=TLSv1']")
    # Hide the "controlled by automated software" infobar; the switch name
    # must be exactly 'enable-automation' (the original had stray spaces,
    # which Chrome would not recognize).
    chromeOptions.add_experimental_option('excludeSwitches', ['enable-automation'])
    # Route traffic through a proxy. Trailing space removed: it would have
    # been sent as part of the proxy URL.
    chromeOptions.add_argument("--proxy-server=http://47.100.7.167:8989")

    def __init__(self):
        # chrome_options/executable_path are the names this selenium version
        # expects; newer releases use options=/Service, but the interface is
        # kept unchanged for compatibility.
        self.driver = webdriver.Chrome(
            chrome_options=LagouSpider.chromeOptions,
            executable_path=LagouSpider.driver_path,
        )
        # Search-result list page: python jobs, city 213 (Guangzhou).
        self.url = 'https://www.lagou.com/jobs/list_python/p-city_213?px=default#filterBox'
        # Accumulated job dicts; written to disk once scraping finishes.
        self.positions = []

    def run(self):
        """Drive the whole crawl: page through results, then persist them."""
        self.driver.get(self.url)
        while True:
            source = self.driver.page_source
            self.parse_list_page(source)
            # The "next" pager link gains the *_disabled class on the last
            # page -- that is the stop condition.
            if re.search(r'action="next" class="pager_next pager_next_disabled"',
                         self.driver.page_source):
                break
            self.next_page()  # click through to the next result page
        self.driver.quit()   # shut the browser down
        self.write_to_csv()  # persist everything that was collected

    @staticmethod
    def _clean(text):
        """Strip whitespace and '/' separators from a detail-page field.

        Bug fix: the original pattern was r"[s/]", which deleted every
        literal letter 's' instead of whitespace (\\s).
        """
        return re.sub(r"[\s/]", "", text)

    def parse_list_page(self, source):
        """Extract every job detail page linked from one list page.

        Opens a second tab, visits each detail URL in it, parses the fields,
        appends a dict per job to self.positions, then closes the tab and
        returns focus to the list tab.
        """
        html = etree.HTML(source)
        links = html.xpath("//a[@class='position_link']/@href")
        self.driver.execute_script("window.open()")  # open a scratch tab
        self.driver.switch_to.window(self.driver.window_handles[1])
        for url in links:
            self.driver.get(url)
            detail = etree.HTML(self.driver.page_source)
            title = detail.xpath('//h4[@class="company"]/text()')[0]
            # span order on the page: salary / city / experience / education
            job_request_span = detail.xpath('//dd[@class ="job_request"]/h3/span/text()')
            salary = self._clean(job_request_span[0])
            city = self._clean(job_request_span[1])
            work_year = self._clean(job_request_span[2])
            education = self._clean(job_request_span[3])
            desc = ''.join(detail.xpath('//dd[@class="job_bt"]//text()')).strip()
            company = detail.xpath('//em[@class="fl-cn"]/text()')[0].strip()
            position = {
                'title': title,
                'company': company,
                'salary': salary,
                'city': city,
                'work_year': work_year,
                'education': education,
                'desc': desc,
            }
            self.positions.append(position)
            print(position)
            time.sleep(5)  # be polite: throttle requests to avoid a ban
        self.driver.close()  # close the scratch tab
        self.driver.switch_to.window(self.driver.window_handles[0])

    def next_page(self):
        """Click the pager's "next" link once it is present in the DOM."""
        element = WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "pager_next")))
        element.click()
        time.sleep(1)  # give the next page a moment to load

    def write_to_csv(self):
        """Write all collected positions to CSV.

        Bug fix: the original header omitted 'city' although every position
        dict contains that key, so csv.DictWriter.writerows() raised
        ValueError and the run lost all its data at the very last step.
        """
        header = ['title', 'company', 'desc', 'salary', 'city', 'work_year', 'education']
        # Filename typo ('positons.csv') preserved from the original so any
        # downstream consumer keeps finding the same file.
        with open('positons.csv', 'w', newline='', encoding='utf-8') as fp:
            writer = csv.DictWriter(fp, header)
            writer.writeheader()
            writer.writerows(self.positions)


if __name__ == '__main__':
    spider = LagouSpider()
    spider.run()