Selenium+Tesseract-OCR智慧識別驗證碼爬取網頁數據
1.項目需求描述
通過訂單號獲取某系統內訂單的詳細數據,不需要帳號密碼的登錄驗證,但有圖片驗證碼的動態識別,將獲取到的數據存到資料庫。
2.整體思路
1.通過Selenium技術,無窗口模式打開瀏覽器
2.在輸入框中動態輸入訂單號
3.將圖片驗證碼截圖保存到本地
4.通過Tesseract-OCR技術去本地識別驗證碼轉化為文字
5.將獲取的驗證碼輸入輸入框
6.點擊查詢獲取列表數據
3.功能實現
1.下載並安裝Google瀏覽器,安裝Google驅動chromedriver.exe(驅動版本需與已安裝的瀏覽器版本對應),獲取安裝路徑,配置在項目中
2.使用Selenium進行瀏覽器操作


1 System.setProperty(瀏覽器驅動, 瀏覽器驅動安裝位置); 2 ChromeOptions options = new ChromeOptions(); 3 options.addArguments("--headless"); // 無窗口模式 4 options.addArguments("--disable-infobars"); // 禁言消息條 5 options.addArguments("--disable-extensions"); // 禁用插件 6 options.addArguments("--disable-gpu"); // 禁用GPU 7 options.addArguments("--no-sandbox"); // 禁用沙盒模式 8 options.addArguments("--disable-dev-shm-usage"); 9 options.addArguments("--hide-scrollbars"); // 隱藏滾動條 10 11 WebDriver driver = new ChromeDriver(options); 12 driver.get(爬取網站URL); 13 driver.manage().window().setSize(new Dimension(450, 260)); // 設置遊覽器打開後調整大小 14 try { 15 // 保存IMG圖片到本地 16 saveImgToLocal(driver); 17 Thread.sleep(2000); 18 // OCR智慧識別驗證碼 19 String codeByOCR = getCodeByOCR(); 20 if (codeByOCR != null) { 21 try { 22 WebElement input1 = driver.findElement(By.id(TEXTBOX1)); 23 input1.sendKeys(code); 24 WebElement input2 = driver.findElement(By.id(TEXTBOX2)); 25 input2.sendKeys(codeByOCR); 26 // 獲取table數據 27 WebElement addButton = driver.findElement(By.id(SELECT_BUTTON)); 28 addButton.click(); 29 List<WebElement> tRCollection = driver.findElement(By.id(TABLE_ID)).findElements(By.tagName("tr")); 30 for (int t = 1; t < tRCollection.size(); t++) { 31 List<WebElement> tDCollection = tRCollection.get(t).findElements(By.tagName("td")); 32 VipLogisticsMinHangDetailVo minHangDetailVo = new VipLogisticsMinHangDetailVo(); 33 minHangDetailVo.setLogistics_number(code); 34 for (int i = 0; i < tDCollection.size(); i++) { 35 String text = tDCollection.get(i).getText(); 36 switch (i) { 37 case 0: 38 minHangDetailVo.setTime(text); 39 case 1: 40 minHangDetailVo.setOutlet(text); 41 case 2: 42 minHangDetailVo.setOrganization(text); 43 case 3: 44 minHangDetailVo.setEvent(text); 45 case 4: 46 minHangDetailVo.setDetail(text); 47 } 48 } 49 list.add(minHangDetailVo); 50 } 51 log.info("驗證碼識別成功!"); 52 } catch (Exception e) { 53 if (e.toString().contains("錯誤提示:驗證碼錯誤或已過期!")) { 54 log.error("驗證碼識別錯誤!" 
+ e.toString()); 55 } else if (e.toString().contains("錯誤提示:請輸入驗證碼!")) { 56 log.error("未輸入驗證碼!:" + e.toString()); 57 } else { 58 log.error("其他異常:" + e.toString()); 59 } 60 } 61 } 62 driver.quit(); 63 } catch (Exception e) { 64 e.printStackTrace(); 65 }
View Code
3.將圖片驗證碼截圖保存到本地(截屏法)


1 private void saveImgToLocal(WebDriver driver) { 2 WebElement element = driver.findElement(By.id(img元素ID)); 3 //創建全螢幕截圖 4 WrapsDriver wrapsDriver = (WrapsDriver) element; 5 File screen = ((TakesScreenshot) wrapsDriver.getWrappedDriver()).getScreenshotAs(OutputType.FILE); 6 try { 7 BufferedImage image = ImageIO.read(screen); 8 //創建一個矩形使用上面的高度,和寬度 9 Point p = element.getLocation(); 10 //元素坐標 11 BufferedImage img = image.getSubimage(p.getX(), p.getY(), element.getSize().getWidth(), element.getSize().getHeight()); 12 ImageIO.write(img, "png", screen); 13 14 FileUtils.copyFile(screen, new File(保存本地地址 + "imgname.png")); 15 } catch (IOException e) { 16 e.printStackTrace(); 17 } 18 }
View Code
4.將圖片驗證碼保存到本地(滑鼠法;此方法依賴 java.awt.Robot 模擬鍵盤操作右鍵菜單,需要可見的瀏覽器窗口,無法與上文的 --headless 無窗口模式同時使用)


1 private static void saveImgToLocal1(WebDriver driver) { 2 Actions action = new Actions(driver); 3 action.contextClick(driver.findElement(By.id(img元素ID))).build().perform(); 4 try { 5 Robot robot = new Robot(); 6 Thread.sleep(1000); 7 8 robot.keyPress(KeyEvent.VK_DOWN); 9 Thread.sleep(1000); 10 11 robot.keyPress(KeyEvent.VK_DOWN); 12 Thread.sleep(1000); 13 14 robot.keyPress(KeyEvent.VK_ENTER); 15 Thread.sleep(1000); 16 //釋放向下鍵,不然在此之前的條目將起作用 17 robot.keyRelease(KeyEvent.VK_DOWN); 18 Thread.sleep(1000); 19 //運行保存 20 Runtime.getRuntime().exec(SAVE_IMG_EXE); 21 Thread.sleep(10000); 22 } catch (Exception e) { 23 e.printStackTrace(); 24 } 25 }
View Code

1 private static void saveImgToLocal1(WebDriver driver) { 2 Actions action = new Actions(driver); 3 action.contextClick(driver.findElement(By.id(img元素ID))).build().perform(); 4 try { 5 Robot robot = new Robot(); 6 Thread.sleep(1000); 7 8 robot.keyPress(KeyEvent.VK_DOWN); 9 Thread.sleep(1000); 10 11 robot.keyPress(KeyEvent.VK_DOWN); 12 Thread.sleep(1000); 13 14 robot.keyPress(KeyEvent.VK_ENTER); 15 Thread.sleep(1000); 16 //釋放向下鍵,不然在此之前的條目將起作用 17 robot.keyRelease(KeyEvent.VK_DOWN); 18 Thread.sleep(1000); 19 //運行保存 20 Runtime.getRuntime().exec(SAVE_IMG_EXE); 21 Thread.sleep(10000); 22 } catch (Exception e) { 23 e.printStackTrace(); 24 } 25 }
5.對本地驗證碼進行OCR識別


1 private String getCodeByOCR() { 2 String result = null; 3 File file = new File(本地圖片地址); 4 if (!file.exists()) { 5 if (systemFalg != 1) { 6 file.setWritable(true, false); 7 } 8 file.mkdirs(); 9 } 10 File imageFile = new File(本地圖片地址 + "imgname.png"); 11 if (imageFile.exists()) { 12 ITesseract instance = new Tesseract(); 13 instance.setDatapath(tessdata存放地址); 14 try { 15 String doOCR = instance.doOCR(imageFile); 16 result = replaceBlank(doOCR); 17 log.info("解析的驗證碼為:{}", result != null ? result : "為空!"); 18 } catch (Exception e) { 19 e.printStackTrace(); 20 log.error("解析驗證碼異常!"); 21 } 22 } else { 23 log.error("解析驗證碼的文件不存在!"); 24 } 25 return result; 26 }
View Code
綜上,該網頁的數據就可以獲取了。