研究開源源碼之Myrmec

  • 2020 年 3 月 10 日
  • 筆記

  好久沒寫部落格了,自己也弄不清是懶了還是忙了。畢竟白天需要工作,晚上有時候看看資料,有時候陪家人,有時候約朋友……更加累了,可能由於累了就懶得總結了。

  今天有同事問我關於用程式碼檢查文件類型的問題。出於安全性考慮,不能只依賴文件後綴或者MIME類型來檢查,需要讀取文件頭,根據文件頭來判斷(作為判斷的其中一種依據)。後來發現直接使用Myrmec是沒有CSS和HTML文件檢查的。不應該啊!無奈之下只能上GitHub看看工具源碼了。

  Myrmec 是一個用於檢測文件格式的庫,Myrmec不同於其它庫或者手寫檢測程式碼,Myrmec不依賴文件擴展名(在實際使用中,你的用戶很可能使用虛假的擴展名欺騙你的應用程式),Myrmec會檢測文件的二進位頭,並在其元資料庫中匹配來獲得文件的格式。

  例如Jpg圖片的二進位頭是 “FF D8 FF DB”,那麼Myrmec會匹配到這個文件頭,並獲得兩個結果——“jpg”和“jpeg”。

  具體使用我就不詳細說了,畢竟百度就有,而且比較簡單(真的很簡單,哈哈)。直接上源碼地址https://github.com/rocketRobin/myrmec

  

  程式碼其實也是比較簡單的,其中FileTypes是已經標記為過期的類,裡面定義了很多標記文件類型的16進位,用List<Record>存儲。

  

  // <copyright file="FileTypes.cs" company="Rocket Robin">
  // Copyright (c) Rocket Robin. All rights reserved.
  // Licensed under the Apache v2 license. See LICENSE file in the project root for full license information.
  // </copyright>

  using System;
  using System.Collections.Generic;

  namespace Myrmec
  {
      /// <summary>
      /// Built-in tables of file signatures that can be used to populate a sniffer instance.
      /// The whole class is marked obsolete; both tables are built once in the static constructor.
      /// </summary>
      [Obsolete("please use populate the file types only you need.")]
      public class FileTypes
      {
          static FileTypes()
          {
              // Same assignment order as before: the less common table first, then the common one.
              Unfrequent = CreateUnfrequent();
              Common = CreateCommon();
          }

          /// <summary>
          /// Gets the common signature table. Alias of <see cref="Common"/>.
          /// </summary>
          [Obsolete("please use populate the file types only you need.")]
          public static List<Record> CommonFileTypes => Common;

          /// <summary>
          /// Gets or sets the table of frequently seen file formats.
          /// </summary>
          [Obsolete("please use populate the file types only you need.")]
          public static List<Record> Common { get; set; }

          /// <summary>
          /// Gets or sets the table of less frequently seen file formats.
          /// </summary>
          [Obsolete("please use populate the file types only you need.")]
          public static List<Record> Unfrequent { get; set; }

          /// <summary>
          /// Builds the table of less frequently seen file formats.
          /// </summary>
          /// <returns>A new list of records.</returns>
          private static List<Record> CreateUnfrequent()
          {
              return new List<Record>
              {
                  new Record("bin", "53 50 30 31"),
                  new Record("bac", "42 41 43 4B 4D 49 4B 45 44 49 53 4B"),
                  new Record("bz2", "42 5A 68"),
                  new Record("tif tiff", "49 49 2A 00"),
                  new Record("tif tiff", "4D 4D 00 2A"),
                  new Record("cr2", "49 49 2A 00 10 00 00 00 43 52"),
                  new Record("cin", "80 2A 5F D7"),
                  new Record("exr", "76 2F 31 01"),
                  new Record("dpx", "53 44 50 58"),
                  new Record("dpx", "58 50 44 53"),
                  new Record("bpg", "42 50 47 FB"),
                  new Record("lz", "4C 5A 49 50"),
                  new Record("ps", "25 21 50 53"),
                  new Record("fits", "3D 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 54"),
                  new Record("doc xls ppt msg", "D0 CF 11 E0 A1 B1 1A E1"),
                  new Record("dex", "64 65 78 0A 30 33 35 00"),
                  new Record("vmdk", "4B 44 4D"),
                  new Record("crx", "43 72 32 34"),
                  new Record("cwk", "05 07 00 00 42 4F 42 4F 05 07 00 00 00 00 00 00 00 00 00 00 00 01"),
                  new Record("fh8", "41 47 44 33"),
                  new Record("cwk", "06 07 E1 00 42 4F 42 4F 06 07 E1 00 00 00 00 00 00 00 00 00 00 01"),
                  new Record("toast", "45 52 02 00 00 00"),
                  new Record("toast", "8B 45 52 02 00 00 00"),
                  new Record("xar", "78 61 72 21"),
                  new Record("dat", "50 4D 4F 43 43 4D 4F 43"),
                  new Record("nes", "4E 45 53 1A"),
                  new Record("tox", "74 6F 78 33"),
                  new Record("MLV", "4D 4C 56 49"),
                  new Record("lz4", "04 22 4D 18"),
                  new Record("cab", "4D 53 43 46"),
                  new Record("flif", "46 4C 49 46"),
                  new Record("stg", "4D 49 4C 20"),
                  new Record("der", "30 82"),
                  new Record("wasm", "00 61 73 6d"),
                  new Record("lep", "cf 84 01"),
                  new Record("rtf", "7B 5C 72 74 66 31"),
                  new Record("m2p vob", "00 00 01 BA"),
                  new Record("zlib", "78 01"),
                  new Record("zlib", "78 9c"),
                  new Record("zlib", "78 da"),
                  new Record("lzfse", "62 76 78 32"),
                  new Record("orc", "4F 52 43"),
                  new Record("avro", "4F 62 6A 01"),
                  new Record("rc", "53 45 51 36"),
                  new Record("tbi", "00 00 00 00 14 00 00 00"),
                  new Record("dat", "00 00 00 00 62 31 05 00 09 00 00 00 00 20 00 00 00 09 00 00 00 00 00 00", 8, "Bitcoin Core wallet.dat file"),
                  new Record("jp2", "00 00 00 0C 6A 50 20 20 0D 0A", "Various JPEG-2000 image file formats"),
                  new Record("ttf", "00 01 00 00 00"),
                  new Record("mdf", "00 FF FF FF FF FF FF FF FF FF FF 00 00 02 00 01"),

                  // Signatures that carry a byte offset.
                  new Record("pdb", "00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00", 11),
                  new Record("3gp 3g2", "66 74 79 70 33 67", 4),
                  new Record("iso", "43 44 30 30 31", 32769),
                  new Record("iso", "43 44 30 30 31", 34817),
                  new Record("iso", "43 44 30 30 31", 36865),
              };
          }

          /// <summary>
          /// Builds the table of frequently seen file formats.
          /// </summary>
          /// <returns>A new list of records.</returns>
          private static List<Record> CreateCommon()
          {
              return new List<Record>
              {
                  new Record("asf wma wmv", "30 26 B2 75 8E 66 CF 11 A6 D9 00 AA 00 62 CE 6C"),
                  new Record("ogg oga ogv", "4F 67 67 53"),
                  new Record("psd", "38 42 50 53"),
                  new Record("mp3", "FF FB"),
                  new Record("mp3", "49 44 33"),
                  new Record("bmp dib", "42 4D"),
                  new Record("jpg,jpeg", "ff,d8,ff,db"),
                  new Record("png", "89,50,4e,47,0d,0a,1a,0a"),
                  new Record("zip,jar,odt,ods,odp,docx,xlsx,pptx,vsdx,apk,aar", "50,4b,03,04"),
                  new Record("zip,jar,odt,ods,odp,docx,xlsx,pptx,vsdx,apk,aar", "50,4b,07,08"),
                  new Record("zip,jar,odt,ods,odp,docx,xlsx,pptx,vsdx,apk,aar", "50,4b,05,06"),
                  new Record("rar", "52,61,72,21,1a,07,00"),
                  new Record("rar", "52,61,72,21,1a,07,01,00"),
                  new Record("class", "CA FE BA BE"),
                  new Record("pdf", "25 50 44 46"),
                  new Record("rpm", "ed ab ee db"),
                  new Record("flac", "66 4C 61 43"),
                  new Record("mid midi", "4D 54 68 64"),
                  new Record("ico", "00 00 01 00"),
                  new Record("z,tar.z", "1F 9D"),
                  new Record("z,tar.z", "1F A0"),
                  new Record("gif", "47 49 46 38 37 61"),
                  new Record("dmg", "78 01 73 0D 62 62 60"),
                  new Record("gif", "47 49 46 38 39 61"),
                  new Record("exe", "4D 5A"),
                  new Record("tar", "75 73 74 61 72", 257),
                  new Record("mkv mka mks mk3d webm", "1A 45 DF A3"),
                  new Record("gz tar.gz", "1F 8B"),
                  new Record("xz tar.xz", "FD 37 7A 58 5A 00 00"),
                  new Record("7z", "37 7A BC AF 27 1C"),
                  new Record("mpg mpeg", "00 00 01 BA"),
                  new Record("mpg mpeg", "00 00 01 B3"),
                  new Record("woff", "77 4F 46 46"),
                  new Record("woff2", "77 4F 46 32"),
                  new Record("XML", "3c 3f 78 6d 6c 20"),
                  new Record("swf", "43 57 53"),
                  new Record("swf", "46 57 53"),
                  new Record("deb", "21 3C 61 72 63 68 3E"),

                  // Signatures that contain "??" placeholder bytes.
                  new Record("jpg,jpeg","FF D8 FF E0 ?? ?? 4A 46 49 46 00 01"),
                  new Record("jpg,jpeg","FF D8 FF E1 ?? ?? 45 78 69 66 00 00"),
              };
          }
      }
  }

FileTypes

  Record類是用於存儲文件類型和16進位對應。

  // <copyright file="Record.cs" company="Rocket Robin">
  // Copyright (c) Rocket Robin. All rights reserved.
  // Licensed under the Apache v2 license. See LICENSE file in the project root for full license information.
  // </copyright>

  namespace Myrmec
  {
      /// <summary>
      /// One file-signature record: a set of extensions, the hex signature, an
      /// optional byte offset and an optional description.
      /// </summary>
      public class Record
      {
          /// <summary>
          /// Initializes a new instance of the <see cref="Record"/> class with all properties at their defaults.
          /// </summary>
          public Record()
          {
          }

          /// <summary>
          /// Initializes a new instance of the <see cref="Record"/> class.
          /// </summary>
          /// <param name="extentions">Extension names; multiple names are separated with "," or a space.</param>
          /// <param name="hex">Signature as hex byte values separated with "," or a space.</param>
          public Record(string extentions, string hex)
              : this(extentions, hex, 0, null)
          {
          }

          /// <summary>
          /// Initializes a new instance of the <see cref="Record"/> class.
          /// </summary>
          /// <param name="extentions">Extension names; multiple names are separated with "," or a space.</param>
          /// <param name="hex">Signature as hex byte values separated with "," or a space.</param>
          /// <param name="offset">Offset of this record.</param>
          public Record(string extentions, string hex, int offset)
              : this(extentions, hex, offset, null)
          {
          }

          /// <summary>
          /// Initializes a new instance of the <see cref="Record"/> class.
          /// </summary>
          /// <param name="extentions">Extension names; multiple names are separated with "," or a space.</param>
          /// <param name="hex">Signature as hex byte values separated with "," or a space.</param>
          /// <param name="description">Free-text description of the format.</param>
          public Record(string extentions, string hex, string description)
              : this(extentions, hex, 0, description)
          {
          }

          /// <summary>
          /// Initializes a new instance of the <see cref="Record"/> class.
          /// </summary>
          /// <param name="extentions">Extension names; multiple names are separated with "," or a space.</param>
          /// <param name="hex">Signature as hex byte values separated with "," or a space.</param>
          /// <param name="offset">Offset of this record.</param>
          /// <param name="description">Free-text description of the format.</param>
          public Record(string extentions, string hex, int offset, string description)
          {
              Offset = offset;
              Extentions = extentions;
              Hex = hex;
              Description = description;
          }

          /// <summary>
          /// Gets or sets the description.
          /// </summary>
          public string Description { get; set; }

          /// <summary>
          /// Gets or sets the file extensions.
          /// </summary>
          public string Extentions { get; set; }

          /// <summary>
          /// Gets or sets the hex signature string.
          /// </summary>
          public string Hex { get; set; }

          /// <summary>
          /// Gets or sets the offset.
          /// </summary>
          public int Offset { get; set; }

          /// <summary>
          /// Gets a value indicating whether this record has a positive offset or
          /// its hex string contains a "?" placeholder.
          /// </summary>
          /// <remarks>
          /// A record whose <see cref="Hex"/> has not been set (for example one created
          /// with the parameterless constructor) is reported as not complex instead of
          /// throwing a <see cref="System.NullReferenceException"/>.
          /// </remarks>
          public bool IsComplexMetadata
          {
              get => Offset > 0 || (Hex != null && Hex.Contains("?"));
          }
      }
  }

Record

  Node類用於存儲類型查詢樹,用於方便類型查詢的。

 // <copyright file="Node.cs" company="Rocket Robin">
 // Copyright (c) Rocket Robin. All rights reserved.
 // Licensed under the Apache v2 license. See LICENSE file in the project root for full license information.
 // </copyright>

 using System.Collections.Generic;

 namespace Myrmec
 {
     /// <summary>
     /// A single node of the signature lookup tree. It is a plain property bag
     /// with a public parameterless constructor (the implicit one).
     /// </summary>
     public class Node
     {
         /// <summary>
         /// Gets or sets the node this node hangs under.
         /// </summary>
         public Node Parent { get; set; }

         /// <summary>
         /// Gets or sets the depth of this node.
         /// </summary>
         public int Depth { get; set; }

         /// <summary>
         /// Gets or sets the child nodes, keyed by byte value.
         /// </summary>
         public SortedList<byte, Node> Children { get; set; }

         /// <summary>
         /// Gets or sets the extensions attached to this node.
         /// </summary>
         public List<string> Extentions { get; set; }
     }
 }

Node

  Sniffer這個工具類是嗅探器用於提供根據文件16進位頭判斷類型等功能,很多核心演算法都是在這個類中,後面我詳細介紹這個演算法

  // <copyright file="Sniffer.cs" company="Rocket Robin">
  // Copyright (c) Rocket Robin. All rights reserved.
  // Licensed under the Apache v2 license. See LICENSE file in the project root for full license information.
  // </copyright>

  using System;
  using System.Collections.Generic;
  using System.Linq;

  namespace Myrmec
  {
      /// <summary>
      /// Detects file extensions from the leading bytes of a file. Simple signatures
      /// are stored in a byte-keyed tree; records flagged as complex are kept in
      /// <see cref="ComplexMetadata"/> and matched separately.
      /// </summary>
      public class Sniffer
      {
          /// <summary>
          /// You can get the file extention name detail in this wikipedia page.
          /// </summary>
          public const string FileExtentionHelpUrl = "https://en.wikipedia.org/wiki/List_of_file_signatures";

          private readonly Node _root;

          /// <summary>
          /// Initializes a new instance of the <see cref="Sniffer"/> class with an empty tree.
          /// </summary>
          public Sniffer()
          {
              _root = new Node()
              {
                  Children = new SortedList<byte, Node>(128),
                  Depth = -1,
              };
              ComplexMetadata = new List<Metadata>(10);
          }

          /// <summary>
          /// Gets or sets the records that cannot be stored in the tree.
          /// </summary>
          public List<Metadata> ComplexMetadata { get; set; }

          /// <summary>
          /// Adds a signature to the tree.
          /// </summary>
          /// <param name="data">Signature bytes; must contain at least one byte.</param>
          /// <param name="extentions">Extensions to attach to the last byte of the signature.</param>
          /// <exception cref="ArgumentNullException"><paramref name="data"/> or <paramref name="extentions"/> is null.</exception>
          /// <exception cref="ArgumentException"><paramref name="data"/> is empty.</exception>
          public void Add(byte[] data, string[] extentions)
          {
              if (data == null)
              {
                  throw new ArgumentNullException(nameof(data));
              }

              if (extentions == null)
              {
                  throw new ArgumentNullException(nameof(extentions));
              }

              // An empty signature has no byte to key the first node on.
              if (data.Length == 0)
              {
                  throw new ArgumentException("Signature must contain at least one byte.", nameof(data));
              }

              Add(data, _root, extentions, 0);
          }

          /// <summary>
          /// Adds a record: complex records go to <see cref="ComplexMetadata"/>, all
          /// others are parsed and inserted into the tree.
          /// </summary>
          /// <param name="record">Record to add.</param>
          /// <exception cref="ArgumentNullException"><paramref name="record"/> is null.</exception>
          public void Add(Record record)
          {
              if (record == null)
              {
                  throw new ArgumentNullException(nameof(record));
              }

              if (record.IsComplexMetadata)
              {
                  // NOTE(review): relies on a Record-to-Metadata conversion defined outside this file.
                  ComplexMetadata.Add(record);
              }
              else
              {
                  // Drop empty entries so that input such as "a, b" does not register "" as an extension.
                  string[] extentions = record.Extentions.Split(new[] { ',', ' ' }, StringSplitOptions.RemoveEmptyEntries);
                  Add(record.Hex.GetByte(), extentions);
              }
          }

          /// <summary>
          /// Finds the extensions whose signature matches the given file head.
          /// </summary>
          /// <param name="data">Leading bytes of the file.</param>
          /// <param name="matchAll">
          /// When false, the walk stops at the first tree node that carries extensions, and
          /// <see cref="ComplexMetadata"/> is consulted only if the tree produced nothing.
          /// When true, every matching node along the path and the complex records are
          /// collected, and duplicates are removed.
          /// </param>
          /// <returns>Matched extensions; empty when nothing matched.</returns>
          /// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
          public List<string> Match(byte[] data, bool matchAll = false)
          {
              if (data == null)
              {
                  throw new ArgumentNullException(nameof(data));
              }

              List<string> extentionStore = new List<string>(4);
              Match(data, 0, _root, extentionStore, matchAll);

              if (matchAll || extentionStore.Count == 0)
              {
                  // Match data from complex metadata.
                  extentionStore.AddRange(ComplexMetadata.Match(data, matchAll));
              }

              // Remove repeated extentions.
              if (matchAll && extentionStore.Count > 0)
              {
                  extentionStore = extentionStore.Distinct().ToList();
              }

              return extentionStore;
          }

          /// <summary>
          /// Recursively inserts <paramref name="data"/> below <paramref name="parent"/>,
          /// one byte per tree level, and attaches the extensions to the last byte's node.
          /// </summary>
          private void Add(byte[] data, Node parent, string[] extentions, int depth)
          {
              if (parent.Children == null)
              {
                  // Initial capacity is halved at every level.
                  parent.Children = new SortedList<byte, Node>(Convert.ToInt32(128 / Math.Pow(2, depth)));
              }

              // Single lookup: reuse the existing node for this byte, or create and register one.
              if (!parent.Children.TryGetValue(data[depth], out Node current))
              {
                  current = new Node
                  {
                      Depth = depth,
                      Parent = parent
                  };
                  parent.Children.Add(data[depth], current);
              }

              // Last byte: attach the extensions to this node.
              if (depth == (data.Length - 1))
              {
                  if (current.Extentions == null)
                  {
                      current.Extentions = new List<string>(4);
                  }

                  current.Extentions.AddRange(extentions);
                  return;
              }

              Add(data, current, extentions, depth + 1);
          }

          /// <summary>
          /// Recursively walks the tree along <paramref name="data"/>, collecting the
          /// extensions of every node on the path that has them.
          /// </summary>
          private void Match(byte[] data, int depth, Node node, List<string> extentionStore, bool matchAll)
          {
              // All input bytes consumed.
              if (data.Length == depth)
              {
                  return;
              }

              // No node for this byte: the walk ends here.
              if (!node.Children.TryGetValue(data[depth], out Node current) || current == null)
              {
                  return;
              }

              // A node with extensions marks the end of a registered signature.
              if (current.Extentions != null)
              {
                  extentionStore.AddRange(current.Extentions);

                  // Only the first match was requested.
                  if (!matchAll)
                  {
                      return;
                  }
              }

              // Nothing below this node.
              if (current.Children == null)
              {
                  return;
              }

              Match(data, depth + 1, current, extentionStore, matchAll);
          }
      }
  }

Sniffer

  好了,上面就是幾個核心的類,現在講解一下核心源碼。先看一段類似這樣的單元測試程式碼(使用的是MSTest的[TestMethod]特性,而不是NUnit):

 /// <summary>
 /// Populates a sniffer with both built-in tables and checks that the bytes
 /// FF D8 FF DB are reported as both "jpg" and "jpeg".
 /// </summary>
 [TestMethod]
 public void SnifferTest()
 {
     // Arrange
     var sniffer = new Sniffer();
     sniffer.Populate(FileTypes.Common);
     sniffer.Populate(FileTypes.Unfrequent);
     byte[] jpegHead = { 0xff, 0xd8, 0xff, 0xdb };

     // Act
     var extentions = sniffer.Match(jpegHead, matchAll: true);

     // Assert
     Assert.IsTrue(extentions.Contains("jpg"));
     Assert.IsTrue(extentions.Contains("jpeg"));
 }

Demo

  var sniffer = new Sniffer();這個是定義和初始化嗅探器,在這個過程中會使用Node類初始化樹,這個樹用於存儲所有FileTypes下定義的節點。

/// <summary>
/// Creates a sniffer with an empty tree: a root node at depth -1 with an empty
/// child list, plus an empty list for complex metadata.
/// </summary>
public Sniffer()
{
    var root = new Node();
    root.Children = new SortedList<byte, Node>(128);
    root.Depth = -1;
    _root = root;

    ComplexMetadata = new List<Metadata>(10);
}

初始化

  sniffer.Populate(FileTypes.Common);這個作用是生成樹。生成過程是一個遞歸演算法

 /// <summary>
 /// Adds every record of <paramref name="records"/> to the sniffer.
 /// This is an extension method; its containing static class is not shown here.
 /// </summary>
 /// <param name="sniffer">Sniffer to populate.</param>
 /// <param name="records">Records to add.</param>
 public static void Populate(this Sniffer sniffer, IList<Record> records)
 {
     foreach (var record in records)
     {
         sniffer.Add(record);
     }
 }

 /// <summary>
 /// Adds a record: complex records go to ComplexMetadata, all others are
 /// parsed and inserted into the tree.
 /// </summary>
 /// <param name="record">Record to add.</param>
 /// <exception cref="ArgumentNullException"><paramref name="record"/> is null.</exception>
 public void Add(Record record)
 {
     if (record == null)
     {
         throw new ArgumentNullException(nameof(record));
     }

     if (record.IsComplexMetadata)
     {
         ComplexMetadata.Add(record);
     }
     else
     {
         // Drop empty entries so that input such as "a, b" does not register "" as an extension.
         string[] extentions = record.Extentions.Split(new[] { ',', ' ' }, StringSplitOptions.RemoveEmptyEntries);
         Add(record.Hex.GetByte(), extentions);
     }
 }

 /// <summary>
 /// Adds a signature to the tree, starting at the root.
 /// </summary>
 /// <param name="data">Signature bytes; must contain at least one byte.</param>
 /// <param name="extentions">Extensions to attach to the last byte of the signature.</param>
 /// <exception cref="ArgumentNullException"><paramref name="data"/> or <paramref name="extentions"/> is null.</exception>
 /// <exception cref="ArgumentException"><paramref name="data"/> is empty.</exception>
 public void Add(byte[] data, string[] extentions)
 {
     if (data == null)
     {
         throw new ArgumentNullException(nameof(data));
     }

     if (extentions == null)
     {
         throw new ArgumentNullException(nameof(extentions));
     }

     // An empty signature has no byte to key the first node on.
     if (data.Length == 0)
     {
         throw new ArgumentException("Signature must contain at least one byte.", nameof(data));
     }

     Add(data, _root, extentions, 0);
 }

 /// <summary>
 /// Recursively inserts <paramref name="data"/> below <paramref name="parent"/>,
 /// one byte per tree level, and attaches the extensions to the last byte's node.
 /// </summary>
 private void Add(byte[] data, Node parent, string[] extentions, int depth)
 {
     if (parent.Children == null)
     {
         // Initial capacity is halved at every level.
         parent.Children = new SortedList<byte, Node>(Convert.ToInt32(128 / Math.Pow(2, depth)));
     }

     // Single lookup: reuse the existing node for this byte, or create and register one.
     if (!parent.Children.TryGetValue(data[depth], out Node current))
     {
         current = new Node
         {
             Depth = depth,
             Parent = parent
         };
         parent.Children.Add(data[depth], current);
     }

     // Last byte: attach the extensions to this node.
     if (depth == (data.Length - 1))
     {
         if (current.Extentions == null)
         {
             current.Extentions = new List<string>(4);
         }

         current.Extentions.AddRange(extentions);
         return;
     }

     Add(data, current, extentions, depth + 1);
 }

生成樹

  生成樹的過程是:首先把FileTypes.Common中預先定義的Record列表的hex按空格或逗號分隔,然後把每個16進位值轉成位元組(byte),例如new Record("ogg oga ogv", "4F 67 67 53")。hex部分就是 “4F 67 67 53”,轉換為10進位就是“79 103 103 83”,然後以這串位元組逐層建立節點生成樹,生成圖如下:

   如果再加上一個Record,而它的hex與已有路徑有相同的前綴部分,就共用這些已有的節點,例如再加入new Record("XXX", "4F 67 01"),此時樹節點如下:

 

   明白了上面的樹結構,就很容易知道這棵樹的作用了,就是為了提高查詢的速度。var result = sniffer.Match(head,true);就可以沿著樹的路徑找到文件類型,即路徑上帶有Extentions屬性的節點(簽名最後一個位元組所在的節點,不一定是葉子節點)所對應的類型。

  好了,這個工具其實原理很簡單,就是使用的樹結構提高了查詢效率而已。

 

 

嗅探器