Selenium+Tesseract-OCR智能識別驗證碼爬取網頁數據

1.項目需求描述

通過訂單號獲取某系統內訂單的詳細數據。該查詢不需要賬號密碼登錄,但需要動態識別圖片驗證碼;獲取到的數據最終存入數據庫。

2.整體思路

  1.通過Selenium技術,無窗口模式打開瀏覽器

  2.在輸入框中動態輸入訂單號

  3.將圖片驗證碼截圖保存到本地

  4.通過Tesseract-OCR技術識別本地保存的驗證碼圖片,並將其轉化為文字

  5.將識別出的驗證碼填入驗證碼輸入框

  6.點擊查詢獲取列表數據

 3.功能實現

1.下載並安裝Google瀏覽器,安裝Google驅動chromedriver.exe,獲取安裝路徑,配置在項目中

2.使用Selenium進行瀏覽器操作

 1 System.setProperty(瀏覽器驅動, 瀏覽器驅動安裝位置);
 2 ChromeOptions options = new ChromeOptions();
 3 options.addArguments("--headless");                            // 無窗口模式
 4 options.addArguments("--disable-infobars");                    // 禁言消息條
 5 options.addArguments("--disable-extensions");                  // 禁用插件
 6 options.addArguments("--disable-gpu");                         // 禁用GPU
 7 options.addArguments("--no-sandbox");                          // 禁用沙盒模式
 8 options.addArguments("--disable-dev-shm-usage");
 9 options.addArguments("--hide-scrollbars");                     // 隱藏滾動條
10 
11 WebDriver driver = new ChromeDriver(options);
12 driver.get(爬取網站URL);
13 driver.manage().window().setSize(new Dimension(450, 260));     // 設置遊覽器打開後調整大小
14 try {
15     // 保存IMG圖片到本地
16     saveImgToLocal(driver);
17     Thread.sleep(2000);
18     // OCR智能識別驗證碼
19     String codeByOCR = getCodeByOCR();
20     if (codeByOCR != null) {
21         try {
22             WebElement input1 = driver.findElement(By.id(TEXTBOX1));
23             input1.sendKeys(code);
24             WebElement input2 = driver.findElement(By.id(TEXTBOX2));
25             input2.sendKeys(codeByOCR);
26             // 獲取table數據
27             WebElement addButton = driver.findElement(By.id(SELECT_BUTTON));
28             addButton.click();
29             List<WebElement> tRCollection = driver.findElement(By.id(TABLE_ID)).findElements(By.tagName("tr"));
30             for (int t = 1; t < tRCollection.size(); t++) {
31                 List<WebElement> tDCollection = tRCollection.get(t).findElements(By.tagName("td"));
32                 VipLogisticsMinHangDetailVo minHangDetailVo = new VipLogisticsMinHangDetailVo();
33                 minHangDetailVo.setLogistics_number(code);
34                 for (int i = 0; i < tDCollection.size(); i++) {
35                     String text = tDCollection.get(i).getText();
36                     switch (i) {
37                         case 0:
38                             minHangDetailVo.setTime(text);
39                         case 1:
40                             minHangDetailVo.setOutlet(text);
41                         case 2:
42                             minHangDetailVo.setOrganization(text);
43                         case 3:
44                             minHangDetailVo.setEvent(text);
45                         case 4:
46                             minHangDetailVo.setDetail(text);
47                     }
48                 }
49                 list.add(minHangDetailVo);
50             }
51             log.info("驗證碼識別成功!");
52         } catch (Exception e) {
53             if (e.toString().contains("錯誤提示:驗證碼錯誤或已過期!")) {
54                 log.error("驗證碼識別錯誤!" + e.toString());                   
55             } else if (e.toString().contains("錯誤提示:請輸入驗證碼!")) {
56                 log.error("未輸入驗證碼!:" + e.toString());                       
57             } else {
58                 log.error("其他異常:" + e.toString());
59             }
60         }
61     }
62     driver.quit();
63 } catch (Exception e) {
64     e.printStackTrace();
65 }

View Code

3.將圖片驗證碼截圖保存到本地(截屏法)

 1 private void saveImgToLocal(WebDriver driver) {
 2     WebElement element = driver.findElement(By.id(img元素ID));
 3     //創建全屏截圖
 4     WrapsDriver wrapsDriver = (WrapsDriver) element;
 5     File screen = ((TakesScreenshot) wrapsDriver.getWrappedDriver()).getScreenshotAs(OutputType.FILE);
 6     try {
 7         BufferedImage image = ImageIO.read(screen);
 8         //創建一個矩形使用上面的高度,和寬度
 9         Point p = element.getLocation();
10         //元素坐標
11         BufferedImage img = image.getSubimage(p.getX(), p.getY(), element.getSize().getWidth(), element.getSize().getHeight());
12         ImageIO.write(img, "png", screen);
13 
14         FileUtils.copyFile(screen, new File(保存本地地址 + "imgname.png"));
15     } catch (IOException e) {
16         e.printStackTrace();
17     }
18 }

View Code

4.將圖片驗證碼保存到本地(鼠標法)

 1 private static void saveImgToLocal1(WebDriver driver) {
 2     Actions action = new Actions(driver);
 3     action.contextClick(driver.findElement(By.id(img元素ID))).build().perform();
 4     try {
 5         Robot robot = new Robot();
 6         Thread.sleep(1000);
 7 
 8         robot.keyPress(KeyEvent.VK_DOWN);
 9         Thread.sleep(1000);
10 
11         robot.keyPress(KeyEvent.VK_DOWN);
12         Thread.sleep(1000);
13 
14         robot.keyPress(KeyEvent.VK_ENTER);
15         Thread.sleep(1000);
16         //釋放向下鍵,不然在此之前的條目將起作用
17         robot.keyRelease(KeyEvent.VK_DOWN);
18         Thread.sleep(1000);
19         //運行保存
20         Runtime.getRuntime().exec(SAVE_IMG_EXE);
21         Thread.sleep(10000);
22     } catch (Exception e) {
23         e.printStackTrace();
24     }
25 }

View Code

 1 private static void saveImgToLocal1(WebDriver driver) {
 2     Actions action = new Actions(driver);
 3     action.contextClick(driver.findElement(By.id(img元素ID))).build().perform();
 4     try {
 5         Robot robot = new Robot();
 6         Thread.sleep(1000);
 7 
 8         robot.keyPress(KeyEvent.VK_DOWN);
 9         Thread.sleep(1000);
10 
11         robot.keyPress(KeyEvent.VK_DOWN);
12         Thread.sleep(1000);
13 
14         robot.keyPress(KeyEvent.VK_ENTER);
15         Thread.sleep(1000);
16         //釋放向下鍵,不然在此之前的條目將起作用
17         robot.keyRelease(KeyEvent.VK_DOWN);
18         Thread.sleep(1000);
19         //運行保存
20         Runtime.getRuntime().exec(SAVE_IMG_EXE);
21         Thread.sleep(10000);
22     } catch (Exception e) {
23         e.printStackTrace();
24     }
25 }

5.對本地驗證碼進行OCR識別

 1 private String getCodeByOCR() {
 2     String result = null;
 3     File file = new File(本地圖片地址);
 4     if (!file.exists()) {
 5         if (systemFalg != 1) {
 6             file.setWritable(true, false);
 7         }
 8         file.mkdirs();
 9     }
10     File imageFile = new File(本地圖片地址 + "imgname.png");
11     if (imageFile.exists()) {
12         ITesseract instance = new Tesseract();
13         instance.setDatapath(tessdata存放地址);
14         try {
15             String doOCR = instance.doOCR(imageFile);
16             result = replaceBlank(doOCR);
17             log.info("解析的驗證碼為:{}", result != null ? result : "為空!");
18         } catch (Exception e) {
19             e.printStackTrace();
20             log.error("解析驗證碼異常!");
21         }
22     } else {
23         log.error("解析驗證碼的文件不存在!");
24     }
25     return result;
26 }

View Code

 

綜上,該網頁的數據就可以獲取了。