­

web安全之機器學習入門——3.2 決策樹與隨機森林

  • 2020 年 1 月 16 日
  • 筆記

目錄

簡介

決策樹簡單用法

決策樹檢測POP3爆破

決策樹檢測FTP爆破

隨機森林檢測FTP爆破


簡介

決策樹和隨機森林算法是最常見的分類算法;

決策樹的判斷邏輯很多時候和人的思維非常接近。

隨機森林算法,利用多棵決策樹對樣本進行訓練並預測的一種分類器,並且其輸出的類別是由個別決策樹輸出的類別的眾數決定。


決策樹簡單用法

使用sklearn自帶的iris數據集

# -*- coding: utf-8 -*-
"""Minimal decision-tree demo on sklearn's bundled iris dataset.

Fits a DecisionTreeClassifier on the full iris data, then renders the
trained tree to a PDF through Graphviz/pydotplus.
"""
import os

from sklearn import tree
from sklearn.datasets import load_iris
import pydotplus

# If you get "GraphViz's executables not found", extend PATH by hand.
# NOTE: adjust this directory to your local Graphviz installation.
os.environ["PATH"] += os.pathsep + 'D:/Program Files (x86)/Graphviz2.38/bin/'

iris = load_iris()

clf = tree.DecisionTreeClassifier()
clf = clf.fit(iris.data, iris.target)

# Visualize the trained decision tree as a PDF.
dot_data = tree.export_graphviz(clf, out_file=None)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_pdf("../photo/6/iris.pdf")

決策樹算法檢測POP3爆破

# -*- coding:utf-8 -*-
"""Detect POP3 brute-force (password guessing) with a decision tree.

Reads the KDD99 "corrected" dataset, keeps records labelled
guess_passwd./normal. on the pop_3 service, featurises them, scores a
DecisionTreeClassifier with 10-fold cross-validation, and renders the
fitted tree to a PDF via Graphviz/pydotplus.
"""
import os

from sklearn import tree
# BUG FIX: sklearn.cross_validation was removed from scikit-learn;
# cross_val_score now lives in sklearn.model_selection.
from sklearn.model_selection import cross_val_score
import pydotplus

# If Graphviz's executables are not found, extend PATH by hand.
# NOTE: adjust this directory to your local Graphviz installation.
os.environ["PATH"] += os.pathsep + 'D:/Program Files (x86)/Graphviz2.38/bin/'


def load_kdd99(filename):
    """Collect and clean the data: one comma-split record per input line."""
    x = []
    with open(filename) as f:
        for line in f:
            # BUG FIX: the transcribed code stripped the literal letter
            # 'n' instead of the newline '\n'.
            line = line.strip('\n')
            line = line.split(',')
            x.append(line)
    return x


def get_guess_passwdandNormal(x):
    """Filter and featurise POP3 guess_passwd/normal records.

    Returns (w, y): w is a list of float feature vectors — duration
    (x1[0]), the byte/flag counts x1[4:8] and the TCP content/traffic
    features x1[22:30], chosen as the features relevant to POP3
    password cracking — and y is 1 for guess_passwd, 0 for normal.
    """
    v = []
    w = []
    y = []
    # Keep only records labelled guess_passwd/normal that use POP3.
    for x1 in x:
        if (x1[41] in ['guess_passwd.', 'normal.']) and (x1[2] == 'pop_3'):
            if x1[41] == 'guess_passwd.':
                y.append(1)
            else:
                y.append(0)
            # Featurisation: network features plus TCP content features.
            x1 = [x1[0]] + x1[4:8] + x1[22:30]
            v.append(x1)
    # Convert every field from string to float.
    for x1 in v:
        w.append([float(x2) for x2 in x1])
    return w, y


if __name__ == '__main__':
    v = load_kdd99("../data/kddcup99/corrected")
    x, y = get_guess_passwdandNormal(v)
    # Training: instantiate the decision-tree algorithm.
    clf = tree.DecisionTreeClassifier()
    # 10-fold cross-validation.
    print(cross_val_score(clf, x, y, n_jobs=-1, cv=10))

    clf = clf.fit(x, y)
    dot_data = tree.export_graphviz(clf, out_file=None)
    graph = pydotplus.graph_from_dot_data(dot_data)
    graph.write_pdf("../photo/6/iris-dt.pdf")

準確率達到99%

[ 0.98637602  1.          1.          1.          1.          1.          1.    1.          1.          1.        ]

可視化結果


決策樹算法檢測FTP爆破

# -*- coding:utf-8 -*-
"""Detect FTP brute-force (Hydra) traces in ADFA-LD with a decision tree.

Normal system-call traces come from Training_Data_Master (label 0),
attack traces from the Hydra_FTP_* directories under Attack_Data_Master
(label 1).  Each trace is a space-separated syscall-number sequence of
varying length, so it is vectorised with a bag-of-words CountVectorizer
before 10-fold cross-validation.
"""
import os
import re

from sklearn import tree
from sklearn.feature_extraction.text import CountVectorizer
# BUG FIX: sklearn.cross_validation was removed from scikit-learn;
# cross_val_score now lives in sklearn.model_selection.
from sklearn.model_selection import cross_val_score
import pydotplus


def load_one_flle(filename):
    """Return the first line of *filename* (one ADFA-LD syscall trace).

    NOTE(review): the name keeps the original 'flle' typo so the
    chapter's other scripts stay consistent.
    """
    with open(filename) as f:
        line = f.readline()
        # BUG FIX: strip the newline '\n', not the literal letter 'n'.
        line = line.strip('\n')
    return line


def load_adfa_training_files(rootdir):
    """Load the normal ADFA-LD samples under *rootdir*; label them 0."""
    x = []
    y = []
    for name in os.listdir(rootdir):
        path = os.path.join(rootdir, name)
        if os.path.isfile(path):
            x.append(load_one_flle(path))
            y.append(0)
    return x, y


def dirlist(path, allfile):
    """Recursively collect every file path below *path* into *allfile*.

    Helper for load_adfa_hydra_ftp_files.
    """
    for filename in os.listdir(path):
        filepath = os.path.join(path, filename)
        if os.path.isdir(filepath):
            dirlist(filepath, allfile)
        else:
            allfile.append(filepath)
    return allfile


def load_adfa_hydra_ftp_files(rootdir):
    """Select the FTP brute-force (Hydra_FTP) traces from the attack set; label 1."""
    x = []
    y = []
    allfile = dirlist(rootdir, [])
    for file in allfile:
        # rootdir holds one sub-directory per attack run, each with
        # several UAD-Hydra-FTP* trace files.
        # BUG FIX: the transcribed pattern read 'Hydra_FTP_d+' (lost the
        # backslash of '\d+') and used a bare '\U', which is an invalid
        # regex escape; match either path separator instead so the
        # pattern works on Windows and POSIX.
        if re.match(r"../data/ADFA-LD/Attack_Data_Master/Hydra_FTP_\d+[/\\]UAD-Hydra-FTP*", file):
            x.append(load_one_flle(file))
            y.append(1)
    return x, y


if __name__ == '__main__':
    # Featurisation: ADFA-LD records syscall sequences of differing
    # lengths per file, so bag-of-words counts are used as features.
    x1, y1 = load_adfa_training_files("../data/ADFA-LD/Training_Data_Master/")
    x2, y2 = load_adfa_hydra_ftp_files("../data/ADFA-LD/Attack_Data_Master/")

    x = x1 + x2
    y = y1 + y2
    # min_df=1: a token is dropped only if its document frequency is
    # below min_df, i.e. keep everything here.
    vectorizer = CountVectorizer(min_df=1)
    x = vectorizer.fit_transform(x)
    x = x.toarray()

    # Instantiate the decision tree and validate it.
    clf = tree.DecisionTreeClassifier()
    print(cross_val_score(clf, x, y, n_jobs=-1, cv=10))

    clf = clf.fit(x, y)
    dot_data = tree.export_graphviz(clf, out_file=None)
    graph = pydotplus.graph_from_dot_data(dot_data)
    graph.write_pdf("../photo/6/ftp.pdf")
[ 1.          0.98019802  0.95        0.97979798  0.96969697  0.88888889    0.98989899  0.95959596  0.92929293  0.95959596]

隨機森林算法檢測FTP爆破

# -*- coding:utf-8 -*-
"""Compare a decision tree against a random forest on ADFA-LD FTP brute-force data.

Same data pipeline as the decision-tree script; no tree plot here
because pydotplus only supports single decision trees.
"""
import os
import re

import numpy as np
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
# BUG FIX: sklearn.cross_validation was removed from scikit-learn;
# cross_val_score now lives in sklearn.model_selection.
from sklearn.model_selection import cross_val_score


def load_one_flle(filename):
    """Return the first line of *filename* (one ADFA-LD syscall trace).

    NOTE(review): the name keeps the original 'flle' typo so the
    chapter's other scripts stay consistent.
    """
    with open(filename) as f:
        line = f.readline()
        # BUG FIX: strip the newline '\n', not the literal letter 'n'.
        line = line.strip('\n')
    return line


def load_adfa_training_files(rootdir):
    """Load the normal ADFA-LD samples under *rootdir*; label them 0."""
    x = []
    y = []
    for name in os.listdir(rootdir):
        path = os.path.join(rootdir, name)
        if os.path.isfile(path):
            x.append(load_one_flle(path))
            y.append(0)
    return x, y


def dirlist(path, allfile):
    """Recursively collect every file path below *path* into *allfile*."""
    for filename in os.listdir(path):
        filepath = os.path.join(path, filename)
        if os.path.isdir(filepath):
            dirlist(filepath, allfile)
        else:
            allfile.append(filepath)
    return allfile


def load_adfa_hydra_ftp_files(rootdir):
    """Select the FTP brute-force (Hydra_FTP) traces from the attack set; label 1."""
    x = []
    y = []
    allfile = dirlist(rootdir, [])
    for file in allfile:
        # BUG FIX: the transcribed pattern read 'Hydra_FTP_d+' (lost the
        # backslash of '\d+') and used a bare '\U', which is an invalid
        # regex escape; match either path separator instead.
        if re.match(r"../data/ADFA-LD/Attack_Data_Master/Hydra_FTP_\d+[/\\]UAD-Hydra-FTP*", file):
            x.append(load_one_flle(file))
            y.append(1)
    return x, y


if __name__ == '__main__':
    x1, y1 = load_adfa_training_files("../data/ADFA-LD/Training_Data_Master/")
    x2, y2 = load_adfa_hydra_ftp_files("../data/ADFA-LD/Attack_Data_Master/")

    x = x1 + x2
    y = y1 + y2
    vectorizer = CountVectorizer(min_df=1)
    x = vectorizer.fit_transform(x)
    x = x.toarray()

    # Decision-tree baseline.
    clf1 = tree.DecisionTreeClassifier()
    score = cross_val_score(clf1, x, y, n_jobs=-1, cv=10)
    print('決策樹', np.mean(score))

    # Random-forest classifier: 10 trees, fully grown, fixed seed for
    # reproducibility.
    clf2 = RandomForestClassifier(n_estimators=10, max_depth=None,
                                  min_samples_split=2, random_state=0)
    score = cross_val_score(clf2, x, y, n_jobs=-1, cv=10)
    print('隨機森林', np.mean(score))
決策樹 0.955736173617  隨機森林 0.984888688869