詞頻統計小程序-WordCount.exe
- 2019 年 12 月 6 日
- 筆記
最近頂哥為了完成學歷提升學業中的小作業,做了一個詞頻統計的 .exe 小程序。因為當時做的時候網上的相關資料比較少,因此頂哥決定把自己拙劣的作品發出來,給需要的人提供一種思路,希望各位看官不要 dis 才好。最後附上源碼鏈接,感興趣的朋友可以繼續優化哦。
01
—
先看效果
雙擊運行,下拉框選擇源文件來源,支持本地和網絡資源,如圖:


本地源文件

網絡源文件
02
—
主要代碼
1.pom文件
<dependencies>
    <!-- IK Analyzer: Chinese word segmenter -->
    <dependency>
        <groupId>com.janeluo</groupId>
        <artifactId>ikanalyzer</artifactId>
        <version>2012_u6</version>
    </dependency>
    <!-- JUnit: unit testing (test scope only) -->
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.12</version>
        <scope>test</scope>
    </dependency>
    <!-- jsoup: HTML parsing, see https://mvnrepository.com/artifact/org.jsoup/jsoup -->
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.11.3</version>
    </dependency>
</dependencies>

<build>
    <plugins>
        <!-- Surefire is configured to skip test execution during the build -->
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-surefire-plugin</artifactId>
            <version>2.18.1</version>
            <configuration>
                <skipTests>true</skipTests>
            </configuration>
        </plugin>
        <!-- Packaging plugin: assembles a single jar that bundles all dependencies -->
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-assembly-plugin</artifactId>
            <version>2.4.1</version>
            <configuration>
                <!-- Include every project dependency in the assembled jar -->
                <descriptorRefs>
                    <descriptorRef>jar-with-dependencies</descriptorRef>
                </descriptorRefs>
                <!-- Declare the main class in the manifest so the jar is executable -->
                <archive>
                    <manifest>
                        <addClasspath>true</addClasspath>
                        <mainClass>cn.dintalk.service.WordCount</mainClass>
                    </manifest>
                </archive>
            </configuration>
            <executions>
                <execution>
                    <id>make-assembly</id>
                    <!-- Bind the assembly goal to the package phase -->
                    <phase>package</phase>
                    <goals>
                        <goal>single</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>
2.WebUtils
/** * @author Mr.song * @date 2019/10/13 9:26 */ public class WebUtils { /** * 根據url和參數發送get請求 * * @param url * @param param * @return 返回網頁內容 */ public static String sendGet(String url, String param) { String result = ""; if (param != null) { url = url + "?" + param; } try { URL realUrl = new URL(url); // 打開和URL之間的連接 HttpURLConnection conn = getHttpURLConnection(realUrl); result = getResponse(conn); } catch (Exception e) { e.printStackTrace(); } return result; } //根據url 獲取連接 private static HttpURLConnection getHttpURLConnection(URL realUrl) { StringBuilder sb = new StringBuilder(); sb.append("Mozilla/5.0 (Windows NT 10.0; Win64; x64)"); sb.append(" AppleWrbKit/537.36(KHTML, like Gecko)"); sb.append(" Chrome/72.0.3626.119 Safari/537.36"); HttpURLConnection conn = null; try { // 打開和URL之間的連接 conn = (HttpURLConnection) realUrl.openConnection(); // 設置通用的請求屬性 conn.setRequestProperty("accept", "*/*"); conn.setRequestProperty("connection", "Keep-Alive"); conn.setRequestProperty("user-agent", sb.toString()); } catch (IOException e) { e.printStackTrace(); } return conn; } // 根據url連接獲取響應 private static String getResponse(HttpURLConnection conn) { // 讀取URL的響應 String result = ""; try (InputStream is = conn.getInputStream(); InputStreamReader isr = new InputStreamReader(is, "utf-8"); BufferedReader in = new BufferedReader(isr)) { String line; while ((line = in.readLine()) != null) { result += "n" + line; } } catch (Exception e) { System.out.println("Err:getResponse()"); e.printStackTrace(); } finally { conn.disconnect(); } // System.out.println("getResponse():" + result.length()); return result; } /** * 解析網頁為文本 * * @param html * @return */ public static String parseHtmlToText(String html) { Document document = Jsoup.parse(html); return document.text(); } }
3.IKSUtils
/** * @author Mr.song * @date 2019/10/10 21:12 */ public class IKSUtils { /** * 對文本進行分詞 * @param text * @return * @throws Exception */ public static List<String> getStringList(String text) throws Exception{ //獨立Lucene實現 StringReader re = new StringReader(text); IKSegmenter ik = new IKSegmenter(re, true); Lexeme lex; List<String> s = new ArrayList<>(); while ((lex = ik.next()) != null) { s.add(lex.getLexemeText()); } return s; } /** * 統計詞頻 * @param wordList * @return */ public static Map<String,Integer> wordCount(List<String> wordList){ if (wordList == null) return null; Map<String,Integer> result = new HashMap<>(); for (String s : wordList) { Integer count = result.get(s); if (count == null){ result.put(s,1); }else { result.put(s,++count); } } //按照次數排序 result = result .entrySet() .stream() .sorted(Collections.reverseOrder(Map.Entry.comparingByValue())) .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue, (e1, e2) -> e2, LinkedHashMap::new)); return result; } }
03
—
相關鏈接
源碼地址:https://github.com/MrSonghui/wordCount
打包.exe :https://www.cnblogs.com/xiaoMzjm/p/3879766.html