web安全之機器學習入門——3.2 決策
- 2020 年 1 月 16 日
- 筆記
目錄
簡介
決策樹簡單用法
決策樹檢測POP3爆破
決策樹檢測FTP爆破
隨機森林檢測FTP爆破
簡介
決策樹和隨機森林算法是最常見的分類算法;
決策樹,判斷的邏輯很多時候和人的思維非常接近。
隨機森林算法,利用多棵決策樹對樣本進行訓練並預測的一種分類器,並且其輸出的類別是由個別決策樹輸出的類別的眾數決定。
決策樹簡單用法
使用sklearn自帶的iris數據集
# -*- coding: utf-8 -*-
"""Train a decision tree on the built-in iris data set and render it to PDF."""
from sklearn.datasets import load_iris
from sklearn import tree
import pydotplus
import os

# If you get "GraphViz's executables not found", add the GraphViz bin
# directory to PATH manually (adjust the path to your installation).
os.environ["PATH"] += os.pathsep + 'D:/Program Files (x86)/Graphviz2.38/bin/'

iris = load_iris()
clf = tree.DecisionTreeClassifier()
clf = clf.fit(iris.data, iris.target)

# Visualize the trained decision tree as a PDF.
dot_data = tree.export_graphviz(clf, out_file=None)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_pdf("../photo/6/iris.pdf")

決策樹算法檢測POP3爆破
# -*- coding:utf-8 -*-
"""Detect POP3 password-guessing (brute-force) traffic in KDD99 with a decision tree."""
import os

# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20;
# cross_val_score now lives in sklearn.model_selection.
from sklearn.model_selection import cross_val_score
from sklearn import tree
import pydotplus

# Make the GraphViz executables reachable (adjust to your installation).
os.environ["PATH"] += os.pathsep + 'D:/Program Files (x86)/Graphviz2.38/bin/'


def load_kdd99(filename):
    """Load a KDD99 CSV file as a list of string-field records."""
    x = []
    with open(filename) as f:
        for line in f:
            # BUG FIX: the original used strip('n'), which strips the letter
            # 'n' from both ends and leaves the trailing '\n' attached to the
            # last field (the label), so the label comparison below never
            # matched. Strip the newline instead.
            line = line.strip('\n')
            x.append(line.split(','))
    return x


def get_guess_passwdandNormal(x):
    """Filter POP3 records labelled guess_passwd/normal and featurize them.

    Returns (w, y) where w is a list of float feature vectors and y the
    labels (1 = guess_passwd, 0 = normal).
    """
    v = []
    w = []
    y = []
    for x1 in x:
        # Keep only records labelled guess_passwd or normal on the POP3 service.
        if (x1[41] in ['guess_passwd.', 'normal.']) and (x1[2] == 'pop_3'):
            y.append(1 if x1[41] == 'guess_passwd.' else 0)
            # Feature selection: duration plus the TCP-content and traffic
            # features relevant to POP3 password cracking.
            v.append([x1[0]] + x1[4:8] + x1[22:30])
    for x1 in v:
        w.append([float(x2) for x2 in x1])
    return w, y


if __name__ == '__main__':
    v = load_kdd99("../data/kddcup99/corrected")
    x, y = get_guess_passwdandNormal(v)
    # Train/validate a decision tree classifier.
    clf = tree.DecisionTreeClassifier()
    # 10-fold cross-validation.
    print(cross_val_score(clf, x, y, n_jobs=-1, cv=10))
    clf = clf.fit(x, y)
    dot_data = tree.export_graphviz(clf, out_file=None)
    graph = pydotplus.graph_from_dot_data(dot_data)
    graph.write_pdf("../photo/6/iris-dt.pdf")
準確率達到99%
[ 0.98637602 1. 1. 1. 1. 1. 1. 1. 1. 1. ]
可視化結果

決策樹算法檢測FTP爆破
# -*- coding:utf-8 -*-
"""Detect FTP brute-force (Hydra) attacks in ADFA-LD with a decision tree.

Each ADFA-LD trace file is a single line of space-separated system-call
numbers; CountVectorizer turns those sequences into bag-of-words vectors.
"""
import os
import re

from sklearn.feature_extraction.text import CountVectorizer
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20.
from sklearn.model_selection import cross_val_score
from sklearn import tree
import pydotplus


def load_one_flle(filename):
    """Return the first line of *filename* without its trailing newline."""
    with open(filename) as f:
        # BUG FIX: the original used strip('n') (strips the letter 'n'),
        # leaving the trailing '\n' on the returned line.
        return f.readline().strip('\n')


def load_adfa_training_files(rootdir):
    """Load the normal (training) ADFA-LD samples; label every one 0."""
    x = []
    y = []
    for entry in os.listdir(rootdir):
        path = os.path.join(rootdir, entry)
        if os.path.isfile(path):
            x.append(load_one_flle(path))
            y.append(0)
    return x, y


def dirlist(path, allfile):
    """Recursively collect every file path under *path* into *allfile*."""
    for filename in os.listdir(path):
        filepath = os.path.join(path, filename)
        if os.path.isdir(filepath):
            dirlist(filepath, allfile)
        else:
            allfile.append(filepath)
    return allfile


def load_adfa_hydra_ftp_files(rootdir):
    """Pick the Hydra-FTP attack traces out of the attack data set; label 1."""
    x = []
    y = []
    # rootdir contains one sub-directory per attack run, each with trace files.
    for file in dirlist(rootdir, []):
        # BUG FIX: the published pattern lost its backslashes
        # ("Hydra_FTP_d+\UAD-Hydra-FTP*"): '\U' is an invalid regex escape
        # (re raises "bad escape") and 'd+' matches a literal 'd'. Restore
        # '\d+' and accept either path separator so the match works on both
        # Windows and POSIX.
        if re.match(r"../data/ADFA-LD/Attack_Data_Master/Hydra_FTP_\d+[/\\]UAD-Hydra-FTP.*", file):
            x.append(load_one_flle(file))
            y.append(1)
    return x, y


if __name__ == '__main__':
    # Featurization: every ADFA-LD file records a system-call sequence of
    # varying length, so a bag-of-words count vector is used as the feature.
    x1, y1 = load_adfa_training_files("../data/ADFA-LD/Training_Data_Master/")
    x2, y2 = load_adfa_hydra_ftp_files("../data/ADFA-LD/Attack_Data_Master/")
    x = x1 + x2
    y = y1 + y2
    # min_df=1: keep every token, even ones seen in a single document.
    vectorizer = CountVectorizer(min_df=1)
    x = vectorizer.fit_transform(x).toarray()
    # Train/validate a decision tree classifier.
    clf = tree.DecisionTreeClassifier()
    print(cross_val_score(clf, x, y, n_jobs=-1, cv=10))
    clf = clf.fit(x, y)
    dot_data = tree.export_graphviz(clf, out_file=None)
    graph = pydotplus.graph_from_dot_data(dot_data)
    graph.write_pdf("../photo/6/ftp.pdf")
[ 1. 0.98019802 0.95 0.97979798 0.96969697 0.88888889 0.98989899 0.95959596 0.92929293 0.95959596]

隨機森林算法檢測FTP爆破
# -*- coding:utf-8 -*-
"""Compare a decision tree against a random forest on ADFA-LD FTP brute-force data.

No tree visualization here: pydotplus only supports single decision trees,
not forests.
"""
import os
import re

from sklearn.feature_extraction.text import CountVectorizer
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20.
from sklearn.model_selection import cross_val_score
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
import numpy as np


def load_one_flle(filename):
    """Return the first line of *filename* without its trailing newline."""
    with open(filename) as f:
        # BUG FIX: the original used strip('n') (strips the letter 'n'),
        # leaving the trailing '\n' on the returned line.
        return f.readline().strip('\n')


def load_adfa_training_files(rootdir):
    """Load the normal (training) ADFA-LD samples; label every one 0."""
    x = []
    y = []
    for entry in os.listdir(rootdir):
        path = os.path.join(rootdir, entry)
        if os.path.isfile(path):
            x.append(load_one_flle(path))
            y.append(0)
    return x, y


def dirlist(path, allfile):
    """Recursively collect every file path under *path* into *allfile*."""
    for filename in os.listdir(path):
        filepath = os.path.join(path, filename)
        if os.path.isdir(filepath):
            dirlist(filepath, allfile)
        else:
            allfile.append(filepath)
    return allfile


def load_adfa_hydra_ftp_files(rootdir):
    """Pick the Hydra-FTP attack traces out of the attack data set; label 1."""
    x = []
    y = []
    for file in dirlist(rootdir, []):
        # BUG FIX: the published pattern lost its backslashes
        # ("Hydra_FTP_d+\UAD-Hydra-FTP*"): '\U' is an invalid regex escape
        # and 'd+' matches a literal 'd'. Restore '\d+' and accept either
        # path separator.
        if re.match(r"../data/ADFA-LD/Attack_Data_Master/Hydra_FTP_\d+[/\\]UAD-Hydra-FTP.*", file):
            x.append(load_one_flle(file))
            y.append(1)
    return x, y


if __name__ == '__main__':
    x1, y1 = load_adfa_training_files("../data/ADFA-LD/Training_Data_Master/")
    x2, y2 = load_adfa_hydra_ftp_files("../data/ADFA-LD/Attack_Data_Master/")
    x = x1 + x2
    y = y1 + y2
    # Bag-of-words featurization of the system-call sequences.
    vectorizer = CountVectorizer(min_df=1)
    x = vectorizer.fit_transform(x).toarray()

    # Decision tree baseline, 10-fold cross-validated.
    clf1 = tree.DecisionTreeClassifier()
    score = cross_val_score(clf1, x, y, n_jobs=-1, cv=10)
    print('決策樹', np.mean(score))

    # Random forest: an ensemble of 10 trees whose majority vote decides
    # the class; random_state=0 makes the comparison reproducible.
    clf2 = RandomForestClassifier(n_estimators=10, max_depth=None,
                                  min_samples_split=2, random_state=0)
    score = cross_val_score(clf2, x, y, n_jobs=-1, cv=10)
    print('隨機森林', np.mean(score))
決策樹 0.955736173617 隨機森林 0.984888688869