python爬虫之验证码识别

2019 年 10 月 10 日
笔记

本文链接：https://blog.csdn.net/weixin_40313634/article/details/84574797

环境：

Sublime Text: https://download.sublimetext.com/Sublime%20Text%20Build%203176%20x64%20Setup.exe
python: https://www.python.org/ftp/python/3.7.0/python-3.7.0-amd64.exe
OCR识别库工具：https://digi.bib.uni-mannheim.de/tesseract/tesseract-ocr-setup-3.05.01.exe
图形验证码：https://github.com/Python3WebSpider/CrackImageCode/archive/master.zip
滑动验证码：https://github.com/Python3WebSpider/CrackGeetest/archive/master.zip
平台：https://passport.cnblogs.com/user/signin
例子：https://www.cnblogs.com/moning/p/8318475.html

# Recognise a simple image captcha: binarise the picture, then run OCR on it.
import tesserocr
from PIL import Image

# Load the captcha image from the working directory.
image = Image.open('code2.jpg')

# Convert to 8-bit grayscale so every pixel is a single 0-255 value.
image = image.convert('L')

# Lookup table for binarisation: gray levels below the threshold map to 0,
# everything else maps to 1.
threshold = 127
table = [0 if level < threshold else 1 for level in range(256)]

# Apply the table and switch to 1-bit mode, then preview the result.
image = image.point(table, '1')
image.show()

# Run Tesseract on the cleaned-up image and print the recognised text.
result = tesserocr.image_to_text(image)
print(result)

import time
from io import BytesIO

from PIL import Image
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Credentials typed into the login form.
# NOTE(review): '[email protected]' looks like an address masked by the hosting
# page's e-mail protection -- replace it (and PASSWORD) with a real account.
EMAIL = '[email protected]'
PASSWORD = ''
# Pixel offset subtracted from the detected gap position before dragging.
BORDER = 6
# Initial left offset in pixels; not referenced by the code shown here.
INIT_LEFT = 60

class CrackGeetest():
    """Drive Chrome through the Geetest slide captcha on the Geetest login page.

    The flow (see ``crack``) is: open the login page, fill in the
    credentials, screenshot the captcha before and after the gap is shown,
    locate the gap by pixel comparison, and drag the slider along a
    generated track.
    """

    def __init__(self):
        """Start a Chrome session and prepare a 20-second explicit wait."""
        self.url = 'https://account.geetest.com/login'
        self.browser = webdriver.Chrome()
        self.wait = WebDriverWait(self.browser, 20)
        self.email = EMAIL
        self.password = PASSWORD

    def __del__(self):
        """Close the browser window when the object is collected."""
        # If __init__ failed before the browser was created there is nothing
        # to close; avoid raising AttributeError from the finaliser.
        browser = getattr(self, 'browser', None)
        if browser is not None:
            browser.close()

    def get_geetest_button(self):
        """Return the initial "click to verify" button.

        :return: the clickable element with class ``geetest_radar_tip``
        """
        button = self.wait.until(
            EC.element_to_be_clickable((By.CLASS_NAME, 'geetest_radar_tip')))
        return button

    def get_position(self):
        """Return the captcha image's bounding box on the page.

        :return: tuple ``(top, bottom, left, right)`` built from the
            element's location and size
        """
        img = self.wait.until(
            EC.presence_of_element_located((By.CLASS_NAME, 'geetest_canvas_img')))
        # Presumably gives the captcha time to finish rendering -- TODO confirm.
        time.sleep(2)
        location = img.location
        size = img.size
        top = location['y']
        bottom = location['y'] + size['height']
        left = location['x']
        right = location['x'] + size['width']
        return (top, bottom, left, right)

    def get_screenshot(self):
        """Take a screenshot of the current page.

        :return: the screenshot as a PIL image
        """
        screenshot = self.browser.get_screenshot_as_png()
        screenshot = Image.open(BytesIO(screenshot))
        return screenshot

    def get_slider(self):
        """Return the slider handle.

        :return: the clickable element with class ``geetest_slider_button``
        """
        slider = self.wait.until(
            EC.element_to_be_clickable((By.CLASS_NAME, 'geetest_slider_button')))
        return slider

    def get_geetest_image(self, name='captcha.png'):
        """Crop the captcha out of a page screenshot and save it.

        :param name: file name the cropped image is saved under
        :return: the cropped captcha as a PIL image
        """
        top, bottom, left, right = self.get_position()
        print('验证码位置', top, bottom, left, right)
        screenshot = self.get_screenshot()
        # PIL's crop box order is (left, upper, right, lower).
        captcha = screenshot.crop((left, top, right, bottom))
        captcha.save(name)
        return captcha

    def open(self):
        """Open the login page and type in the e-mail and password.

        :return: None
        """
        self.browser.get(self.url)
        email = self.wait.until(EC.presence_of_element_located((By.ID, 'email')))
        password = self.wait.until(EC.presence_of_element_located((By.ID, 'password')))
        email.send_keys(self.email)
        password.send_keys(self.password)

    def get_gap(self, image1, image2):
        """Find the x offset of the gap by comparing the two captcha images.

        Columns are scanned left to right starting at x = 60; the first
        column containing a pixel that differs between the images is the
        gap position.

        :param image1: captcha without the gap
        :param image2: captcha with the gap
        :return: x coordinate of the first differing column, or 60 when no
            difference is found
        """
        left = 60
        for i in range(left, image1.size[0]):
            for j in range(image1.size[1]):
                if not self.is_pixel_equal(image1, image2, i, j):
                    return i
        return left

    def is_pixel_equal(self, image1, image2, x, y):
        """Tell whether two images have (nearly) the same pixel at (x, y).

        :param image1: first image
        :param image2: second image
        :param x: x coordinate
        :param y: y coordinate
        :return: True when each of the first three channels differs by less
            than 60, otherwise False
        """
        # Read the pixel at the same position from both images.
        pixel1 = image1.load()[x, y]
        pixel2 = image2.load()[x, y]
        threshold = 60
        return all(abs(pixel1[c] - pixel2[c]) < threshold for c in range(3))

    def get_track(self, distance):
        """Build a list of per-step x moves covering ``distance``.

        The slider accelerates for the first 4/5 of the distance and then
        decelerates, using fixed 0.2 time steps; each step's displacement is
        rounded to an integer.

        :param distance: total offset to travel
        :return: list of integer step sizes (empty when ``distance`` <= 0)
        """
        track = []
        # Distance covered so far.
        current = 0
        # Past this point the slider starts to decelerate.
        mid = distance * 4 / 5
        # Duration of one simulation step.
        t = 0.2
        # Current velocity.
        v = 0

        while current < distance:
            # Accelerate at +2 before the threshold, decelerate at -3 after.
            a = 2 if current < mid else -3
            v0 = v
            # v = v0 + a*t
            v = v0 + a * t
            # x = v0*t + 1/2 * a * t^2
            move = v0 * t + 1 / 2 * a * t * t
            current += move
            track.append(round(move))
        return track

    def move_to_gap(self, slider, track):
        """Drag the slider along the given track and release it.

        :param slider: slider element
        :param track: list of x offsets, one per step
        :return: None
        """
        ActionChains(self.browser).click_and_hold(slider).perform()
        for x in track:
            ActionChains(self.browser).move_by_offset(xoffset=x, yoffset=0).perform()
        # Hold briefly before letting go.
        time.sleep(0.5)
        ActionChains(self.browser).release().perform()

    def login(self):
        """Click the login button and wait for the page to proceed.

        :return: None
        """
        submit = self.wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'login-btn')))
        submit.click()
        time.sleep(10)
        print('登录成功')

    def crack(self, max_retries=3):
        """Run the whole flow: open, solve the captcha, then log in.

        :param max_retries: how many more attempts to make when the success
            message does not appear
        :raises selenium.common.exceptions.TimeoutException: when the last
            attempt also fails to show the success message
        :return: None
        """
        # Local import: the file's import block does not provide this name.
        from selenium.common.exceptions import TimeoutException

        # Enter user name and password.
        self.open()
        # Click the verification button.
        button = self.get_geetest_button()
        button.click()
        # Capture the captcha before the gap is shown.
        image1 = self.get_geetest_image('captcha1.png')
        # Click the slider to reveal the gap.
        slider = self.get_slider()
        slider.click()
        # Capture the captcha with the gap.
        image2 = self.get_geetest_image('captcha2.png')
        # Locate the gap.
        gap = self.get_gap(image1, image2)
        print('缺口位置', gap)
        # Compensate for the border offset.
        gap -= BORDER
        # Build the movement track.
        track = self.get_track(gap)
        print('滑动轨迹', track)
        # Drag the slider.
        self.move_to_gap(slider, track)

        # WebDriverWait.until raises on timeout instead of returning a falsy
        # value, so a failed attempt has to be detected via the exception.
        try:
            success = self.wait.until(
                EC.text_to_be_present_in_element(
                    (By.CLASS_NAME, 'geetest_success_radar_tip_content'), '验证成功'))
        except TimeoutException:
            if max_retries <= 0:
                raise
            success = False
        print(success)

        # Retry after a failure, otherwise continue to login.
        if not success:
            self.crack(max_retries - 1)
        else:
            self.login()

# Run the captcha cracker only when this file is executed as a script.
if __name__ == '__main__':
    crack = CrackGeetest()
    crack.crack()

python爬虫之验证码识别

环境：

VirMach 便宜 VPS

QNews

python爬虫之验证码识别

环境：

分享此文：

Related Posts

如何成为一个 AI 白痴？

【DB宝47】企业知识分享+团队协作神器之Confluence

抓取个人微博 之 Ajax 数据爬取

HTTP协议web开发知识点

VirMach 便宜 VPS

QNews

热门搜寻

抓取个人微博之 Ajax 数据爬取