如何高效地完成中文分词?

  • 2019 年 10 月 8 日
  • 笔记

在说分词之前,笔者先来介绍下何为分词:分词就是将连续的字序列按照一定的规范重新组合成词序列的过程。英文中,单词之间以空格作为自然分界符,但是中文的分词就复杂多了,要涉及一些算法,对于初学者来说,还是有一定难度的。这里笔者只介绍一种最简单的方式,有兴趣的朋友可以看下。直接上代码,先看 Python 的实现方式:

# -*- coding: utf-8 -*-
# flake8: noqa
"""Example client: send a sentence to the data/chinesekeyword/analysis endpoint
of api.xiaocongjisuan.com and print the raw response."""
__author__ = 'wukong'

from contextlib import closing

try:  # Python 3
    from urllib.parse import urlencode
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib import urlencode, urlopen

# Fill in the appKey and openId you applied for.
app_key = "***"
open_id = "***"


def request_content(request_url, params, method):
    """Send ``params`` to ``request_url`` and print the response body.

    request_url -- address to request
    params      -- dict of request parameters; it is form-urlencoded here
    method      -- request method; "get" (any letter case) puts the parameters
                   in the query string, every other value (including None or
                   an empty string) sends them as the request body instead

    Nothing is returned; the response text is written to stdout.
    """
    query = urlencode(params)

    if method and method.lower() == "get":
        target = "%s?%s" % (request_url, query)
        body = None
    else:
        target = request_url
        # Python 3's urlopen only accepts bytes as the request body; the
        # urlencoded text is plain ASCII, so this is also a no-op on Python 2.
        body = query.encode("ascii")

    # closing() releases the connection even if read() raises, and works with
    # the Python 2 response object, which is not a context manager itself.
    with closing(urlopen(target, body)) as response:
        content = response.read()

    # Python 3 hands back bytes; decode so the text is printed rather than
    # its b'...' representation. On Python 2 the value is already a str.
    if not isinstance(content, str):
        content = content.decode("utf-8")
    print(content)


def main():
    """Build the word-segmentation request and print the API response."""
    domain = "http://api.xiaocongjisuan.com/"
    servlet = "data/chinesekeyword/analysis"
    method = "get"
    request_url = domain + servlet

    # Request parameters
    params = {}
    params["appKey"] = app_key
    params["openId"] = open_id

    # The part that varies per call: the text to be segmented
    params["content"] = "我是一个中国人,你知道嘛"

    request_content(request_url, params, method)


if __name__ == '__main__':
    main()

再以 Java 为例:

package com.xiaocongjisuan.module.example;    import java.io.BufferedReader;  import java.io.DataOutputStream;  import java.io.IOException;  import java.io.InputStream;  import java.io.InputStreamReader;  import java.io.UnsupportedEncodingException;  import java.net.HttpURLConnection;  import java.net.URL;  import java.net.URLEncoder;  import java.util.HashMap;  import java.util.Map;    public class Application {         public static final String DEF_CHATSET = "UTF-8";       public static final int DEF_CONN_TIMEOUT = 30000;       public static final int DEF_READ_TIMEOUT = 30000;       public static String userAgent =  "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36";         //配置您申请的appKey和openId       public static final String APP_KEY ="yours";       public static final String OPEN_ID ="yours";         //将map型转为请求参数型       public static String urlEncode(Map<String,Object> params) {            if(params==null){return "";};            StringBuilder sb = new StringBuilder();          for (Map.Entry<String,Object> i : params.entrySet()) {              try {                  sb.append(i.getKey()).append("=").append(URLEncoder.encode(i.getValue()+"","UTF-8")).append("&");              } catch (UnsupportedEncodingException e) {                  e.printStackTrace();              }          }          String r=sb.toString();          if(r.endsWith("&")){              r = r.substring(0,r.length()-1);          }          return r;       }         /**       *       * @param requestUrl 请求地址       * @param params 请求参数       * @param method 请求方法       * @return 请求结果       * @throws Exception       */       public static String requestContent(String requestUrl, Map<String,Object> params,String method) throws Exception {            HttpURLConnection conn = null;          BufferedReader reader = null;          String rs = null;          try {                //组装请求链接              StringBuffer sb = new StringBuffer();           
     if(method!=null&&method.equalsIgnoreCase("get")){                  requestUrl = requestUrl+"?"+urlEncode(params);              }                //默认get              URL url = new URL(requestUrl);              conn = (HttpURLConnection) url.openConnection();              conn.setRequestMethod("GET");                if(method!=null&&method.equalsIgnoreCase("post")){                   conn.setRequestMethod("POST");                   conn.setDoOutput(true);                   conn.setDoInput(true);              }                //参数配置              conn.setRequestProperty("User-agent", userAgent);              conn.setUseCaches(false);              conn.setConnectTimeout(DEF_CONN_TIMEOUT);              conn.setReadTimeout(DEF_READ_TIMEOUT);              conn.setInstanceFollowRedirects(false);              conn.connect();                if (params!= null && method.equalsIgnoreCase("post")) {                  try {                      DataOutputStream out = new DataOutputStream(conn.getOutputStream());                      out.writeBytes(urlEncode(params));                  } catch (Exception e) {                      e.printStackTrace();                  }              }                //读取数据              InputStream is = conn.getInputStream();              reader = new BufferedReader(new InputStreamReader(is, DEF_CHATSET));              String strRead = null;              while ((strRead = reader.readLine()) != null) {                  sb.append(strRead);              }              rs = sb.toString();            } catch (IOException e) {              e.printStackTrace();          } finally {              if (reader != null) {                  reader.close();              }              if (conn != null) {                  conn.disconnect();              }          }          return rs;      }          public static void main(String[] args) throws Exception{            String domain="http://api.xiaocongjisuan.com/";          String servlet="data/skydriverdata/get"; 
         String method="get";            String requestUrl=domain+servlet;          Map<String,Object> params=new HashMap<String,Object>();          params.put("appKey",APP_KEY);          params.put("openId",OPEN_ID);            //变动部分          params.put("q","a");          params.put("currentPage",1);          params.put("pageSize",20);              String result=requestContent(requestUrl,params,method);          System.out.println(result);      }  }

原理主要是调用接口:直接传入一个字符串,接口会自动把分词结果以 JSON 或者 XML 的形式返回,具体文档可以点我查看。这种实现方式很简单,省去了大量的开发时间,也屏蔽了语言之间的差异,值得推荐。