Web 系统安全运营之基础——基于 DFA 算法的高性能敏感词、脏词检测过滤类(C#)
【概述】要做好一个 Web 系统的安全运维,除了常规的防注入、防入侵等措施之外,还需要检测并过滤敏感词、脏词。这件事做得不好,轻则引发投诉或纠纷,重则导致产品被勒令关闭停运。
废话少说,先看下代码,可以拿过去直接使用。
using Microsoft.VisualBasic;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;

namespace OpenCore.ContentSecurity
{
    /// <summary>
    /// Detects and masks sensitive / illegal words in text.
    /// The lexicon is bucketed by the first character of every word. At each position of the
    /// scanned text only the words of the matching bucket are compared. Characters that are
    /// neither Chinese ideographs, ASCII digits nor ASCII letters may be skipped between two
    /// characters of a word, so that an input such as "a^b" still matches the word "ab".
    /// Reference consulted during development: https://blog.csdn.net/u011966339/article/details/72832197
    /// Change log:
    /// 2020-4-15: the dictionary is loaded once into static storage instead of on every use;
    ///            several dictionary files can be loaded; algorithm details were hardened.
    /// </summary>
    public class SensitiveWordFilter
    {
        /// <summary>
        /// Number of buckets: one per possible UTF-16 code unit, 0 through char.MaxValue inclusive.
        /// </summary>
        private const int LexiconSize = char.MaxValue + 1;

        private static string[] dictionaryPathList = null;

        /// <summary>
        /// In-memory lexicon, indexed by the first character of a word. It is replaced as a whole
        /// by <see cref="LoadDictionary"/>; an existing array is never modified after publication.
        /// </summary>
        private static volatile WordGroup[] MEMORYLEXICON = new WordGroup[LexiconSize];

        /// <summary>
        /// Serializes calls to <see cref="Init"/>.
        /// </summary>
        private static readonly object lockObj = new object();

        /// <summary>
        /// Set once the runtime has reported that the Simplified Chinese conversion is not
        /// supported, so that the conversion is not attempted (and does not throw) for every word.
        /// </summary>
        private static volatile bool simplifiedConversionUnavailable = false;

        /// <summary>
        /// Configures the dictionary files and loads them into the shared lexicon.
        /// If the list is null/empty or any file is missing, a log entry is written and the
        /// previously loaded lexicon stays in effect.
        /// </summary>
        /// <param name="sDictionaryFileName">Paths of the dictionary files, one word per line.</param>
        public static void Init(string[] sDictionaryFileName)
        {
            lock (lockObj)
            {
                dictionaryPathList = sDictionaryFileName;
                LoadDictionary();
            }
        }

        public SensitiveWordFilter()
        {
        }

        private readonly List<string> illegalWords = new List<string>();

        /// <summary>
        /// Words detected by the most recent call to <see cref="getDataByFilter"/> on this
        /// instance, exactly as they appeared in the input (including skipped separators).
        /// </summary>
        public List<string> IllegalWords
        {
            get { return illegalWords; }
        }

        /// <summary>
        /// Returns true for characters in the range U+4E00..U+9FA5 (Chinese ideographs).
        /// </summary>
        private static bool isCHS(char character)
        {
            return character >= 0x4e00 && character <= 0x9fa5;
        }

        /// <summary>
        /// Returns true for the ASCII digits '0'..'9'.
        /// </summary>
        private static bool isNum(char character)
        {
            return character >= '0' && character <= '9';
        }

        /// <summary>
        /// Returns true for the ASCII letters 'a'..'z' and 'A'..'Z'.
        /// </summary>
        private static bool isAlphabet(char character)
        {
            return (character >= 'a' && character <= 'z') || (character >= 'A' && character <= 'Z');
        }

        /// <summary>
        /// Returns true when the character is an ideograph, digit or letter, i.e. a character
        /// that may not be skipped as a separator while matching.
        /// </summary>
        private static bool IsWordCharacter(char character)
        {
            return isCHS(character) || isNum(character) || isAlphabet(character);
        }

        /// <summary>
        /// Converts full-width characters to half-width and lower-cases the result.
        /// The full-width space (12288) becomes the ASCII space (32); the other full-width
        /// characters (65281-65374) are shifted down by 65248 onto 33-126.
        /// The conversion works character by character, so the result always has the same
        /// length as the input and indexes can be shared between both strings.
        /// </summary>
        /// <param name="input">Any non-null string.</param>
        /// <returns>The half-width, lower-case form of <paramref name="input"/>.</returns>
        private static string ToDBC(string input)
        {
            char[] c = input.ToCharArray();
            for (int i = 0; i < c.Length; i++)
            {
                char ch = c[i];
                if (ch == 12288)
                {
                    ch = (char)32;
                }
                else if (ch > 65280 && ch < 65375)
                {
                    ch = (char)(ch - 65248);
                }
                // Invariant lower-casing keeps matching independent of the current culture.
                c[i] = char.ToLowerInvariant(ch);
            }
            return new string(c);
        }

        /// <summary>
        /// Converts the text to Simplified Chinese on a best-effort basis.
        /// </summary>
        /// <param name="sInput">Text to convert.</param>
        /// <returns>
        /// The converted text; an empty string for null/empty input; the unchanged input when
        /// the conversion fails or is not supported.
        /// </returns>
        private static string ToSimplifiedChiniese(string sInput)
        {
            if (string.IsNullOrEmpty(sInput))
            {
                return string.Empty;
            }
            if (simplifiedConversionUnavailable)
            {
                return sInput;
            }
            try
            {
                return Strings.StrConv(sInput, VbStrConv.SimplifiedChinese, 0);
            }
            catch (PlatformNotSupportedException ex)
            {
                // The answer will not change for later words: remember it and log only once.
                simplifiedConversionUnavailable = true;
                SaveLog($"SensitiveWordFilter.ToSimplifiedChiniese.{ex.Message}");
            }
            catch (Exception)
            {
                // Deliberate best effort: a failed conversion only means that no simplified
                // variant of this word is added.
            }
            return sInput;
        }

        /// <summary>
        /// Appends a message to the daily log file in the "SecurityLog" folder below the
        /// application base directory. Logging is best effort and never throws.
        /// </summary>
        /// <param name="Msg">Message to append.</param>
        private static void SaveLog(string Msg)
        {
            try
            {
                string sPath = Path.Combine(AppDomain.CurrentDomain.SetupInformation.ApplicationBase, "SecurityLog");
                // CreateDirectory does nothing when the directory already exists.
                Directory.CreateDirectory(sPath);
                sPath = Path.Combine(sPath, DateTime.Now.ToString("yyyyMMdd") + ".log");
                File.AppendAllText(sPath, "[" + DateTime.Now.ToString() + "]" + Msg + "\r\n");
            }
            catch (Exception)
            {
                // A logging failure must not break dictionary loading or filtering.
            }
        }

        /// <summary>
        /// Reads all configured dictionary files and publishes a freshly built lexicon.
        /// The new lexicon is built on the side and swapped in with a single assignment, so a
        /// failure while reading leaves the previous lexicon untouched.
        /// </summary>
        private static void LoadDictionary()
        {
            string[] paths = dictionaryPathList;
            if (paths == null || paths.Length == 0)
            {
                SaveLog($"SensitiveWordFilter.LoadDictionary.字典路径配置为空");
                return;
            }
            foreach (string sFileName in paths)
            {
                if (!File.Exists(sFileName))
                {
                    SaveLog($"SensitiveWordFilter.LoadDictionary.路径:{sFileName}不是一个有效的文件");
                    return;
                }
            }

            // The set removes duplicate words across files (ordinal comparison).
            HashSet<string> wordSet = new HashSet<string>();
            foreach (string sDictionaryFile in paths)
            {
                // NOTE(review): Encoding.Default is kept from the original; which encoding it
                // denotes depends on the runtime - confirm it matches the dictionary files.
                string[] lines = File.ReadAllLines(sDictionaryFile, Encoding.Default);
                foreach (string line in lines)
                {
                    if (string.IsNullOrWhiteSpace(line))
                    {
                        continue;
                    }
                    // Surrounding whitespace would otherwise become part of the word and
                    // prevent it from ever matching.
                    string key = ToDBC(line.Trim());
                    wordSet.Add(key);
                    // Also register the Simplified Chinese variant of the word.
                    string keySimple = ToSimplifiedChiniese(key);
                    if (!string.IsNullOrEmpty(keySimple) && keySimple != key)
                    {
                        wordSet.Add(keySimple);
                    }
                }
            }

            WordGroup[] lexicon = new WordGroup[LexiconSize];
            foreach (string word in wordSet)
            {
                WordGroup group = lexicon[word[0]];
                if (group == null)
                {
                    group = new WordGroup();
                    lexicon[word[0]] = group;
                }
                // The bucket stores the word without its first character.
                group.Add(word.Substring(1));
            }
            MEMORYLEXICON = lexicon;
        }

        /// <summary>
        /// Tries to match the remainder of a word directly after position <paramref name="start"/>.
        /// </summary>
        /// <param name="text">Normalized text being scanned.</param>
        /// <param name="start">Index of the already matched first character of the word.</param>
        /// <param name="suffix">The word without its first character.</param>
        /// <returns>
        /// The number of text characters consumed after <paramref name="start"/> (matched
        /// characters plus skipped separators), or -1 when the word does not match completely.
        /// </returns>
        private static int MatchLength(string text, int start, string suffix)
        {
            int last = start;
            for (int i = 0; i < suffix.Length; i++)
            {
                char expected = suffix[i];
                int next = last + 1;
                // Skip separators, but stop on a separator that is itself the expected
                // character so that dictionary words containing punctuation can match.
                while (next < text.Length && text[next] != expected && !IsWordCharacter(text[next]))
                {
                    next++;
                }
                // Running off the end of the text means the word is incomplete: no match.
                if (next >= text.Length || text[next] != expected)
                {
                    return -1;
                }
                last = next;
            }
            return last - start;
        }

        /// <summary>
        /// Detects sensitive words and replaces every character of each hit (including skipped
        /// separators inside the hit) with <paramref name="replaceChar"/>. Matching ignores
        /// case and full-width/half-width differences. When several words match at the same
        /// position, the one covering the most text is used. The hits are available through
        /// <see cref="IllegalWords"/> afterwards.
        /// </summary>
        /// <param name="sSourceInput">Text to inspect.</param>
        /// <param name="replaceChar">Mask character, for example '*'.</param>
        /// <returns>The masked text; null or empty input is returned unchanged.</returns>
        public string getDataByFilter(string sSourceInput, char replaceChar)
        {
            if (string.IsNullOrEmpty(sSourceInput))
            {
                return sSourceInput;
            }
            // Work on a snapshot so that a concurrent Init cannot swap the lexicon mid-scan.
            WordGroup[] lexicon = MEMORYLEXICON;
            if (lexicon == null || lexicon.Length == 0)
            {
                SaveLog($"SensitiveWordFilter.getDataByFilter.内存字典为空");
                return sSourceInput;
            }

            illegalWords.Clear();
            // Normalize once; the normalized text has the same length as the input.
            string normalized = ToDBC(sSourceInput);
            char[] masked = sSourceInput.ToCharArray();

            for (int cursor = 0; cursor < normalized.Length; cursor++)
            {
                WordGroup group = lexicon[normalized[cursor]];
                if (group == null)
                {
                    continue;
                }

                // Longest hit starting at this position; -1 means no hit.
                int best = -1;
                for (int z = 0; z < group.Count(); z++)
                {
                    string word = group.GetWord(z);
                    // An empty suffix is a single-character word: it consumes nothing more.
                    int consumed = word.Length == 0 ? 0 : MatchLength(normalized, cursor, word);
                    if (consumed > best)
                    {
                        best = consumed;
                    }
                }
                if (best < 0)
                {
                    continue;
                }

                illegalWords.Add(sSourceInput.Substring(cursor, best + 1));
                for (int pos = cursor; pos <= cursor + best; pos++)
                {
                    masked[pos] = replaceChar;
                }
                // Continue scanning after the masked span.
                cursor += best;
            }
            return new string(masked);
        }
    }

    /// <summary>
    /// Set of words that share the same first character, kept in insertion order.
    /// </summary>
    public class WordGroup
    {
        /// <summary>
        /// Words in insertion order.
        /// </summary>
        private readonly List<string> groupList = new List<string>();

        /// <summary>
        /// Mirror of <see cref="groupList"/> used for constant-time duplicate detection.
        /// </summary>
        private readonly HashSet<string> knownWords = new HashSet<string>();

        public WordGroup()
        {
        }

        /// <summary>
        /// Adds a word unless it is already present.
        /// </summary>
        /// <param name="word">Word to add.</param>
        public void Add(string word)
        {
            if (knownWords.Add(word))
            {
                groupList.Add(word);
            }
        }

        /// <summary>
        /// Returns the number of words in the group.
        /// </summary>
        public int Count()
        {
            return groupList.Count;
        }

        /// <summary>
        /// Returns the word at the given index.
        /// </summary>
        /// <param name="index">Zero-based index, less than <see cref="Count"/>.</param>
        public string GetWord(int index)
        {
            return groupList[index];
        }
    }
}
上面是一个完整的,独立的实现类。 下面给一个简单的调用示例:
// Global configuration: the dictionaries only need to be loaded once for the whole program.
string[] dictionaryFiles =
{
    @"C:\Users\x\Downloads\网站需要过滤的敏感词\mgck-master\暴恐词库.txt",
    @"C:\Users\x\Downloads\网站需要过滤的敏感词\mgck-master\反动词库.txt",
    @"C:\Users\x\Downloads\网站需要过滤的敏感词\mgck-master\民生词库.txt",
    @"C:\Users\x\Downloads\网站需要过滤的敏感词\mgck-master\色情词库.txt",
    @"C:\Users\x\Downloads\网站需要过滤的敏感词\mgck-master\贪腐词库.txt",
    @"C:\Users\x\Downloads\网站需要过滤的敏感词\mgck-master\其他词库.txt"
};
SensitiveWordFilter.Init(dictionaryFiles);

// Filter instances can be created wherever they are needed
// (the original author notes that they can run concurrently).
SensitiveWordFilter wordFilter = new SensitiveWordFilter();

// A few sample inputs to see the effect; only the keys are used, the values stay empty.
Dictionary<string, string> dictTestData = new Dictionary<string, string>
{
    ["杀^人游戏,有人找一夜q"] = "",
    ["数学学习课堂"] = "",
    ["打击法0功有,法0功毒害大众"] = ""
};

Dictionary<string, string> dictResult = new Dictionary<string, string>();
foreach (string sKey in dictTestData.Keys)
{
    // Filter first: IllegalWords reflects the most recent getDataByFilter call.
    string filtered = wordFilter.getDataByFilter(sKey, '|');
    List<string> detected = wordFilter.IllegalWords ?? new List<string>();
    dictResult[sKey] = $"替换后:{filtered}, ------------检测违禁词:{string.Join(",", detected)}";
}

string sResultJson = JsonConverter.SerializeObject(dictResult);
Utils.SaveLog(sResultJson);
最后,给一下打印的结果:
"杀^人游戏,有人找一夜q": "替换后:杀^人游戏,有人找|||, ------------检测违禁词:一夜q",
"数学学习课堂": "替换后:数学学习课堂, ------------检测违禁词:",
"打击法0功有,法0功毒害大众": "替换后:打击|||有,|||毒害大众, ------------检测违禁词:法0功,法0功"
附:
词库下载地址:https://codeload.github.com/chason777777/mgck/zip/master