httpclient爬蟲爬取漢字拼音等信息
- 2019 年 10 月 4 日
- 筆記
下面是使用httpclient爬蟲爬取某個網站的漢字相關信息的實踐代碼,中間遇到了一些字符格式的問題。之前同事曾建議我用html解析類來抓取頁面信息,而不是像我現在這樣用正則;但經過嘗試,效果並不好,畢竟頁面防爬蟲還是非常好做的。在本次實踐中,就遇到了相關的難點,所以還是採取了正則提取的方式。分享代碼,供大家參考。關鍵信息並未隱去。
public static void main(String[] args) throws SQLException { DEFAULT_CHARSET = GB2312; List<String> list = WriteRead.readTxtFileByLine(LONG_Path + "word.log"); list.forEach(py -> { getPYAndWord(py); }); testOver(); } public static void getPYAndWord(String py) { output(py); String url = "http://zd.diyifanwen.com/zidian/py/" + py + ".htm"; HttpGet httpGet = getHttpGet(url); JSONObject response = getHttpResponse(httpGet); // output(response); String content = response.getString("content"); String all = new String(content.getBytes(UTF_8), UTF_8); List<String> regexAll = new ArrayList<>(); List<String> alllist = regexAll(all, "http://zd.d.*?>[\u4e00-\u9FFF]<"); output(alllist.size()); alllist.forEach(line -> { String murl = regexAll(line, "http://zd.diyifanwen.com/zidian/\w/\d+.htm").get(0); String mword = regexAll(line, ">[\u4e00-\u9fa5]<").get(0); regexAll.add(mword); output(murl, mword); String sql = "INSERT INTO chinese_dictionary_word (word,url) VALUES ("%s","%s");"; sql = String.format(sql, mword.replaceAll("<|>", EMPTY), murl); output(sql); MySqlTest.sendWork(sql); }); String str = regexAll.toString().replaceAll("<|>|\[|\]", EMPTY); String sql = "INSERT INTO chinese_dictionary_py_word (py,words) VALUES ("%s","%s");"; sql = String.format(sql, py, str); output(sql); MySqlTest.sendWork(sql); sleep(2); } /**獲取拼音列表 * @return */ public static String getPY() { String url = "http://zd.diyifanwen.com/zidian/py/"; HttpGet httpGet = getHttpGet(url); JSONObject response = getHttpResponse(httpGet); // output(response); String content = response.getString("content"); byte[] bytes = content.getBytes(UTF_8); String all = new String(bytes, UTF_8); Log.log("content", all); return all; } /**獲取所有首字母和拼音 * @param all */ public static void getAllPY(String all) { List<String> list = regexAll(all, "<dt class="pyTitle">拼音首字母\w+</dt>" + LINE + ".+/dd>"); list.forEach(s -> { int num = s.indexOf("拼音首字母"); String first = s.substring(num + 5, num + 6); List<String> list1 = regexAll(s, 
"http://zd.diyifanwen.com/zidian/py/\w+.htm"); list1.forEach(str -> { int one = str.indexOf("/py/"); int two = str.lastIndexOf("."); String second = str.substring(one + 4, two); String sql = "INSERT INTO chinese_dictionary_py (first_word,all_word) VALUES ("%s","%s");"; String sqlEnd = String.format(sql, first, second); MySqlTest.sendWork(sqlEnd); }); }); } /**檢查拼音是否全部獲取到 * @param all */ public static void checkPY(String all) { List<String> list = regexAll(all, "zidian/py/\w+.htm"); list.forEach(str -> { int one = str.indexOf("/py/"); int two = str.lastIndexOf("."); String second = str.substring(one + 4, two); output(second); String sql = "SELECT * FROM chinese_dictionary_py WHERE all_word = "%s";"; String sq = String.format(sql, second); ResultSet resultSet = MySqlTest.excuteQuerySql(sq); try { if (!resultSet.next()) output(sq); } catch (SQLException e) { e.printStackTrace(); } }); } /**從數據庫中查找當前獲取的拼音並存儲到文件中 * @throws SQLException */ public static void getAllPY() throws SQLException { List<String> word = new ArrayList<>(); ResultSet resultSet = MySqlTest.excuteQuerySql("SELECT all_word FROM chinese_dictionary_py;"); while (resultSet.next()) { String string = resultSet.getString(1); word.add(string); } Save.saveStringList(word, "word"); }
結果如圖:


對於漢字具體的釋義內容並未爬取,只對鏈接進行了保存。
技術類文章精選
- java一行代碼打印心形
- Linux性能監控軟件netdata中文漢化版
- 接口測試代碼覆蓋率(jacoco)方案分享
- 性能測試框架
- 如何在Linux命令行界面愉快進行性能測試
- 圖解HTTP腦圖
- 如何測試概率型業務接口
- httpclient處理多用戶同時在線
- 將swagger文檔自動變成測試代碼
- 五行代碼構建靜態博客
- httpclient如何處理302重定向
- 基於java的直線型接口測試框架初探
- Tcloud 雲測平台–集大成者
非技術文章精選
- 為什麼選擇軟件測試作為職業道路?
- 成為傑出Java開發人員的10個步驟
- 寫給所有人的編程思維
- 自動化測試的障礙
- 自動化測試的問題所在
- 測試之《代碼不朽》腦圖