爬蟲系列 一次採集.NET WebForm網站的坎坷歷程
今天接到一個活,需要統計人員的工號資訊,由於種種原因不能直接連資料庫 [無奈]、[無奈]、[無奈]。採取迂迴方案,寫個工具自動登錄網站,採集用戶資訊。
這也不是第一次採集ASP.NET網站,以前採集的時候就知道,這種網站採集比較麻煩,尤其是WebForm的ASP.NET 網站,那叫一個費勁。
喜歡現在流行的Restful模式的網站,數據介面採集那才叫舒服。
閒話少說,開幹
工作量不大,HTTP純手寫
先準備下一個GET/POST預備使用
/// <summary>
/// Sends a blocking HTTP GET that carries the shared <c>sznyCookie</c> container and
/// returns the response body.
/// </summary>
/// <param name="url">Absolute URL to request.</param>
/// <param name="SuccessCallback">Not invoked by this method.</param>
/// <param name="FailCallback">Not invoked by this method.</param>
/// <returns>The complete response body as text.</returns>
public static string Get(string url, Action<string> SuccessCallback, Action<string> FailCallback)
{
    // A direct cast fails immediately for a non-HTTP URL; "as" would produce null and
    // a NullReferenceException on the next line instead.
    HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
    req.Method = "GET";
    req.UserAgent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36";
    req.Accept = "*/*";
    req.KeepAlive = true;
    req.ServicePoint.ConnectionLimit = int.MaxValue;
    req.ServicePoint.Expect100Continue = false;
    req.CookieContainer = sznyCookie; // static field: the same cookies go out with every request
    req.Credentials = System.Net.CredentialCache.DefaultCredentials;

    using (HttpWebResponse rsp = (HttpWebResponse)req.GetResponse())
    using (StreamReader reader = new StreamReader(rsp.GetResponseStream()))
    {
        return reader.ReadToEnd();
    }
}

/// <summary>
/// Sends a blocking form-urlencoded HTTP POST that carries the shared <c>sznyCookie</c>
/// container, then reports the outcome through the callbacks.
/// </summary>
/// <param name="url">Absolute URL to post to; also sent as the Referer header.</param>
/// <param name="dicParms">
/// Form fields. Entries whose key starts with "header" are left out of the body. When the
/// key "ScriptManager1" is present the request is sent with the ASP.NET AJAX headers.
/// </param>
/// <param name="SuccessCallback">Called with the body when no login-page marker is found; may be null.</param>
/// <param name="FailCallback">Called with the body when a login-page marker is found; may be null.</param>
/// <returns>The complete response body as text.</returns>
public static string Post(string url, Dictionary<string, string> dicParms, Action<string> SuccessCallback, Action<string> FailCallback)
{
    StringBuilder data = new StringBuilder();
    foreach (var kv in dicParms)
    {
        // Ordinal comparison: the prefix is a fixed identifier, not linguistic text.
        if (kv.Key.StartsWith("header", StringComparison.Ordinal))
        {
            continue;
        }
        data.Append($"&{Common.UrlEncode(kv.Key, Encoding.UTF8)}={Common.UrlEncode(kv.Value, Encoding.UTF8)}");
    }
    if (data.Length > 0)
    {
        data.Remove(0, 1); // drop the leading '&'
    }

    HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
    req.Method = "POST";
    req.KeepAlive = true;
    req.CookieContainer = sznyCookie;
    req.Connection = "KeepAlive";
    req.ContentType = "application/x-www-form-urlencoded";
    req.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9";
    req.Referer = url;
    if (dicParms.ContainsKey("ScriptManager1"))
    {
        // Partial (UpdatePanel) postback.
        req.Headers.Add("X-MicrosoftAjax", "Delta=true");
        req.Headers.Add("X-Requested-With", "XMLHttpRequest");
        req.ContentType = "application/x-www-form-urlencoded; charset=UTF-8";
        req.Accept = "*/*";
    }
    req.Headers.Add("Cache-Control", "no-cache");
    req.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36";
    req.ServicePoint.ConnectionLimit = int.MaxValue;
    req.ServicePoint.Expect100Continue = false;
    req.AllowAutoRedirect = true;
    req.Credentials = System.Net.CredentialCache.DefaultCredentials;

    byte[] buffer = Encoding.UTF8.GetBytes(data.ToString());
    using (Stream reqStream = req.GetRequestStream())
    {
        reqStream.Write(buffer, 0, buffer.Length);
    }

    string msg;
    using (HttpWebResponse rsp = (HttpWebResponse)req.GetResponse())
    using (StreamReader reader = new StreamReader(rsp.GetResponseStream()))
    {
        msg = reader.ReadToEnd();
    }

    // Either marker means the server answered with the login page, i.e. the session is not logged in.
    bool loginFailed = msg.Contains("images/dl.jpg") || msg.Contains("pageRedirect||%2flogin.aspx");
    if (loginFailed)
    {
        FailCallback?.Invoke(msg);
    }
    else
    {
        SuccessCallback?.Invoke(msg);
    }
    return msg;
}
整個過程分為登陸、用戶資訊列表、用戶資訊詳情,分三步走來完成這個項目
登陸
根據Chrome抓包結果編寫Login,帳號密碼沒有任何加密,直接明文顯示了,直接用了,根據是否跳轉頁面判斷是否登陸成功。調試查看結果登陸成功了。
根據上面的抓包數據,可以調用下面的程式碼確定是否登陸成功。
/// <summary>
/// Logs in through login.aspx: fetches the page, copies the hidden WebForms fields it
/// contains into the form, fills in the credentials and posts the form back.
/// </summary>
/// <param name="username">Value sent as the <c>Text_Name</c> field.</param>
/// <param name="password">Value sent as the <c>Text_Pass</c> field (sent as-is).</param>
/// <param name="SuccessCallback">Forwarded to <c>Get</c> and <c>Post</c>.</param>
/// <param name="FailCallback">Forwarded to <c>Get</c> and <c>Post</c>.</param>
/// <returns>
/// <c>true</c> when the POST response contains none of the login-page markers;
/// <c>false</c> when it does, or when the login page came back empty.
/// </returns>
public static bool SznyLogin(string username, string password, Action<string> SuccessCallback, Action<string> FailCallback)
{
    // The literal used to be "//127.0.0.1/login.aspx". Without a scheme WebRequest.Create
    // cannot return an HttpWebRequest. NOTE(review): "http" is assumed - confirm against the site.
    string url = "http://127.0.0.1/login.aspx";

    string msg = Get(url, SuccessCallback, FailCallback);
    if (msg.Trim().Length == 0)
    {
        return false;
    }

    // One generator for both coordinates: two "new Random()" instances created back to back
    // can share a time-based seed and then produce correlated values.
    Random rnd = new Random();
    Dictionary<string, string> dicParms = new Dictionary<string, string>
    {
        { "__VIEWSTATE", "" },
        { "__EVENTVALIDATION", "" },
        { "Text_Name", "" },
        { "Text_Pass", "" },
        { "btn_Login.x", rnd.Next(100).ToString() }, // image-button click coordinates
        { "btn_Login.y", rnd.Next(200).ToString() },
    };

    // Copy over the value of every input on the page whose name is one of the fields prepared above.
    MatchCollection mc = Regex.Matches(msg, @"<input[^<>]*?name=""(?<name>[^""]*?)""[^<>]*?value=""(?<val>[^""]*?)""[^<>]*?/?>|<input[^<>]*?name=""(?<name>[^""]*?)""[^<>]*?""[^<>]*?/?>|<select[^<>]*?name=""(?<name>[^""]*?)""[^<>]*?""[^<>]*?/?>", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture);
    foreach (Match mi in mc)
    {
        string field = mi.Groups["name"].Value.Trim();
        if (dicParms.ContainsKey(field))
        {
            dicParms[field] = mi.Groups["val"].Value.Trim();
        }
    }

    dicParms["Text_Name"] = username;
    dicParms["Text_Pass"] = password;

    msg = Post(url, dicParms, SuccessCallback, FailCallback);

    // Same markers Post uses: seeing the login page again means the login was rejected.
    bool backOnLoginPage = msg.Contains("images/dl.jpg") || msg.Contains("pageRedirect||%2flogin.aspx");
    return !backOnLoginPage;
}
抓取人員資訊
看到下面這個頁面,失望了,列表上沒有工號,如果列表上有工號 設置一頁顯示全部資訊就可以把所有的數據都抓取到了。
換個思路:是不是我直接設置一頁顯示所有的數據後,然後根據員工ID可以獲取到所有的資訊呢?
接下來點擊任意一條資訊後,查看詳情,顯示下面的調用結果。Url上沒有ID,Get這條路走不通了,查看Post的數據,更失望,沒有ID,通過行資訊綁定。傳統的WebForm 提交模式…
把所有的數據顯示到一頁,把列表的數據先採集完,然後最後一個頁面一個頁面的採集工號資訊。
/// <summary>Cookie container shared by every request so the login session is kept.</summary>
public static CookieContainer sznyCookie = new CookieContainer();

/// <summary>
/// Employee information, keyed by the integer parsed from the second cell of each grid row.
/// </summary>
public static Dictionary<int, Dictionary<string, string>> dicSznyEmployees = new Dictionary<int, Dictionary<string, string>>();

/// <summary>Form fields captured from the list page by <c>SznyEmployeeList</c>.</summary>
public static Dictionary<string, string> dicSznyEmployeeParms = new Dictionary<string, string>();

/// <summary>Employee sequence numbers.</summary>
public static ConcurrentQueue<int> queueSznyEmployeeInfo = new ConcurrentQueue<int>();

/// <summary>Sequence numbers whose employee number has been fetched.</summary>
public static ConcurrentQueue<int> queueSuccessEmployeeInfo = new ConcurrentQueue<int>();

/// <summary>
/// Loads the employee list page, posts back asking for 1200 rows per page, stores every
/// returned grid row in <c>dicSznyEmployees</c>, stores the postback fields in
/// <c>dicSznyEmployeeParms</c> and enqueues each employee index once.
/// </summary>
/// <param name="SuccessCallback">Forwarded to <c>Get</c> and <c>Post</c>.</param>
/// <param name="FailCallback">Forwarded to <c>Get</c> and <c>Post</c>.</param>
/// <returns>
/// <c>true</c> when the list page was retrieved and processed; <c>false</c> when the GET
/// returned 100 characters or fewer (the original returned <c>true</c> in that case too).
/// </returns>
public static bool SznyEmployeeList(Action<string> SuccessCallback, Action<string> FailCallback)
{
    const RegexOptions Opts = RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture;

    // Form fields with double-quoted attributes (used on the GET page).
    const string InputsDoubleQuoted = @"<input[^<>]*?name=""(?<name>[^""]*?)""[^<>]*?value=""(?<val>[^""]*?)""[^<>]*?/?>|<input[^<>]*?name=""(?<name>[^""]*?)""[^<>]*?""[^<>]*?/?>|<select[^<>]*?name=""(?<name>[^""]*?)""[^<>]*?""[^<>]*?/?>";
    // Form fields with either quote style (used on the postback response).
    const string InputsAnyQuote = @"<input[^<>]*?name=[""'](?<name>[^""']*?)[""'][^<>]*?value=[""'](?<val>[^'""]*?)[""'][^<>]*?/?>|<input[^<>]*?name=[""'](?<name>[^'""]*?)[""'][^<>]*?[""'][^<>]*?/?>|<select[^<>]*?name=[""'](?<name>[^""']*?)[""'][^<>]*?[""'][^<>]*?/?>";

    // Copies every scraped field into target unless its name is in skip.
    void MergeFields(MatchCollection fields, HashSet<string> skip, Dictionary<string, string> target)
    {
        foreach (Match f in fields)
        {
            string field = f.Groups["name"].Value.Trim();
            if (skip.Contains(field))
            {
                continue;
            }
            target[field] = f.Groups["val"].Value.Trim();
        }
    }

    // The literal had no scheme ("//127.0.0.1/..."), which WebRequest.Create cannot turn
    // into an HttpWebRequest. NOTE(review): "http" is assumed - confirm against the site.
    string url = "http://127.0.0.1/HumanResources/EmployeeManage/EmployeeInfoList.aspx";
    string msg = Get(url, SuccessCallback, FailCallback);
    if (msg.Trim().Length <= 100)
    {
        return false; // nothing usable came back
    }

    // First __doPostBack target on the page; sent as the grid's GridViewID field.
    Match firstTarget = Regex.Match(msg, @"__doPostBack\('(?<name>[^']*?)'", Opts);
    string name = firstTarget.Success ? firstTarget.Groups["name"].Value.Trim() : "";

    // Postback target of the link whose text is "條/頁" (rows per page).
    Match m = Regex.Match(msg, @"(?<=<a[^<>]*?href=""javascript:__dopostback\()[^<>]*?(?=,[^<>]*?\)""[^<>]*?>條/頁)", Opts);
    string smname = m.Success ? m.Value.Trim().Replace("'", "") : "";

    Dictionary<string, string> dicParms = new Dictionary<string, string>();
    dicParms.Add("ScriptManager1", $"UpdatePanel1|{smname}");
    dicParms.Add("__EVENTTARGET", smname);
    dicParms.Add("__EVENTARGUMENT", "");
    dicParms.Add("__VIEWSTATE", "");
    dicParms.Add("__EVENTVALIDATION", "");
    dicParms.Add("MdGridView_t_unitemployees_dwyg_GridViewID", $"{name}");
    dicParms.Add("MdGridView_t_unitemployees_dwyg_iCurrentPage", "1");
    dicParms.Add("MdGridView_t_unitemployees_dwyg_iTotalPage", "1");
    dicParms.Add("MdGridView_t_unitemployees_dwyg_iTotalCount", "1");
    dicParms.Add("MdGridView_t_unitemployees_dwyg_iPageSize", "1");
    dicParms.Add("MdGridView_t_unitemployees_dwyg_iPageCount", "1");
    dicParms.Add("MdGridView_t_unitemployees_dwyg_iCurrentNum", "0");
    dicParms.Add("XM", "ZXMCHECK");

    // Fields on the page that must not overwrite, or be added to, the values prepared above.
    HashSet<string> skipOnRequest = new HashSet<string>
    {
        "XM",
        "MdGridView_t_unitemployees_dwyg_iCurrentPage",
        "MdGridView_t_unitemployees_dwyg_GridViewID",
        "MdGridView_t_unitemployees_dwyg_iCurrentNum",
        "MdGridView_t_unitemployees_dwyg_iPageCount",
        "MdGridView_t_unitemployees_dwyg_iPageSize",
        "Button_Query",
        "__EVENTTARGET",
        "__EVENTARGUMENT",
        "Button_SelQuery",
        "Button_view",
        "Button_edit",
        "Button_out",
        "ImageButton_Tx",
        "ImageButton_xx1",
        "Button_qd",
        "__ASYNCPOST",
        "MdGridView_t_unitemployees_dwyg__PageSetText",
    };
    MergeFields(Regex.Matches(msg, InputsDoubleQuoted, Opts), skipOnRequest, dicParms);

    // 1200 rows per page. The original sent "100" when the field was missing from the
    // page, contradicting both the other branch and its own comment.
    dicParms["MdGridView_t_unitemployees_dwyg$_PageSetText"] = "1200";

    msg = Post(url, dicParms, SuccessCallback, FailCallback);

    dicSznyEmployees.Clear();
    dicSznyEmployeeParms.Clear();
    dicSznyEmployeeParms.Add("__EVENTTARGET", "");
    dicSznyEmployeeParms.Add("__EVENTARGUMENT", "");
    dicSznyEmployeeParms.Add("__VIEWSTATE", dicParms["__VIEWSTATE"]);
    dicSznyEmployeeParms.Add("__EVENTVALIDATION", dicParms["__EVENTVALIDATION"]);
    dicSznyEmployeeParms.Add("MdGridView_t_unitemployees_dwyg_GridViewID", $"{name}");
    dicSznyEmployeeParms.Add("MdGridView_t_unitemployees_dwyg_iCurrentPage", "1");
    dicSznyEmployeeParms.Add("MdGridView_t_unitemployees_dwyg_iTotalPage", "1");
    dicSznyEmployeeParms.Add("MdGridView_t_unitemployees_dwyg_iTotalCount", "1");
    dicSznyEmployeeParms.Add("MdGridView_t_unitemployees_dwyg_iPageSize", "1");
    dicSznyEmployeeParms.Add("MdGridView_t_unitemployees_dwyg_iPageCount", "1");
    dicSznyEmployeeParms.Add("MdGridView_t_unitemployees_dwyg_iCurrentNum", "0");
    dicSznyEmployeeParms.Add("XM", "ZXMCHECK");

    // The original also had five lowercased "lstparms.add(\"mdgridview_...\")" calls here.
    // They do not compile, and the lowercased names could never equal a mixed-case field
    // name, so they are dropped. Skipping iTotalCount for real would also pin cnt below
    // to the placeholder "1".
    HashSet<string> skipOnResponse = new HashSet<string>
    {
        "XM",
        "__EVENTTARGET",
        "__EVENTARGUMENT",
        "Button_Query",
        "Button_SelQuery",
        "Button_edit",
        "Button_out",
        "ImageButton_Tx",
        "ImageButton_xx1",
        "Button_qd",
        "MdGridView_t_unitemployees_dwyg_GridViewID",
        "MdGridView_t_unitemployees_dwyg_iCurrentNum",
    };
    MergeFields(Regex.Matches(msg, InputsAnyQuote, Opts), skipOnResponse, dicSznyEmployeeParms);

    // A partial-postback response carries the refreshed hidden fields as "name|value"
    // segments rather than <input> tags. Prefer those over the values read from the
    // earlier GET page, which are stale once the page size has changed.
    foreach (Match hidden in Regex.Matches(msg, @"(?<name>__VIEWSTATE)\|(?<v>[^\|]+)|(?<name>__EVENTVALIDATION)\|(?<v>[^\|]+)", Opts))
    {
        dicSznyEmployeeParms[hidden.Groups["name"].Value.Trim()] = hidden.Groups["v"].Value.Trim();
    }

    // Every index goes into the work queue exactly once. The original enqueued 1..cnt
    // and then enqueued each row's index again.
    HashSet<int> queued = new HashSet<int>();
    int cnt = int.Parse(dicSznyEmployeeParms["MdGridView_t_unitemployees_dwyg_iTotalCount"]);
    for (int i = 1; i <= cnt; i++)
    {
        if (queued.Add(i))
        {
            queueSznyEmployeeInfo.Enqueue(i);
        }
    }

    // Cell 1 holds the row's index; cells 2..14 map to these keys in order.
    string[] columns =
    {
        "UserName", "PersonID", "Birthday", "Sex", "HomePhone", "TelPhone", "Mail",
        "Address", "MinZu", "AddressJiGuan", "ZhengZhiMianmao", "Paiqianshijian", "Remark",
    };

    // One <tr name="SelectTR"> per employee.
    foreach (Match row in Regex.Matches(msg, @"<tr[^<>]*?name=""SelectTR""[^<>]*?>.*?</tr>", Opts))
    {
        // Inner text of every <td> in the row.
        MatchCollection cells = Regex.Matches(row.Value, "(?<=<td[^<>]*?>).*?(?=</td>)", Opts);
        int ix = int.Parse(cells[1].Value.Trim());

        if (!dicSznyEmployees.TryGetValue(ix, out Dictionary<string, string> record))
        {
            record = new Dictionary<string, string>();
            dicSznyEmployees.Add(ix, record);
        }
        if (queued.Add(ix))
        {
            queueSznyEmployeeInfo.Enqueue(ix);
        }

        // Indexer assignment, so a repeated index overwrites instead of throwing.
        for (int c = 0; c < columns.Length; c++)
        {
            record[columns[c]] = cells[c + 2].Value.Trim().Replace(" ", "");
        }
    }

    return true;
}
這樣所有的人員資訊一次性採集到靜態變數字典中了,剩下的一個工號可以慢慢獲取了。
既然是這樣,老實地分析Post數據,按照格式Post數據吧。
分析完Post的數據後,突發奇想,我是不是可以通過相同的__ViewState和__EVENTVALIDATION POST數據呢?說幹就幹。
寫程式碼跳轉到員工列表頁面,然後POST數據設置一頁顯示所有數據。
所有的POST的參數,保存到一個靜態變數中。
發現POST批量提交的時候,前3次正常,以後就直接未登錄。
果斷放棄,換思路。
那如果這樣不行 可不可以把所有的數據放到一個頁面上,然後每次獲取一次頁面,然後根據順序號POST數據呢。
上面已經把所有的列表數據都採集完了,順序號也固定了,然後在POST數據的時候,發現有的人員和工號不對應。
這時候去分析為什麼數據會出現不對應的情況呢?發現正則表達式寫的還有問題。獲取頁面的Input的時候,屬性有可能使用雙引號,也有可能使用單引號。
正則表達式由原來的
<input[^<>]*?name="(?<name>[^"]*?)"[^<>]*?value="(?<val>[^"]*?)"[^<>]*?/?>|<input[^<>]*?name="(?<name>[^"]*?)"[^<>]*?"[^<>]*?/?>|<select[^<>]*?name="(?<name>[^"]*?)"[^<>]*?"[^<>]*?/?>
修改為
<input[^<>]*?name=["'](?<name>[^"']*?)["'][^<>]*?value=["'](?<val>[^'"]*?)["'][^<>]*?/?>|<input[^<>]*?name=["'](?<name>[^'"]*?)["'][^<>]*?["'][^<>]*?/?>|<select[^<>]*?name=["'](?<name>[^"']*?)["'][^<>]*?["'][^<>]*?/?>
由於網站非同步提交,也就是以前WEBForm採用的ScriptManager,提交的時候返回的HTML不是整個Document,沒有注意,以為沒有返回__ViewState。所以採用GET的時候獲取的__ViewState繼續執行獲取工號的操作。發現獲取的工號都是錯誤,人員與工號對不上
麻爪了,不知道該咋辦了。猶豫了一下後,上Fiddler吧,一點點的看提交的參數是否有區別。發現正常網站在Get到頁面後,通過調整每頁x條數據後,提交的ViewState與原來的不一致。尋尋覓覓 覓覓尋尋 最後發現非同步返回的HTML中,最後有ViewState….
由於返回的數據順序,每次也不一樣,也是造成人員、工號不一致的原因。
提交後正常了,但是1000多條的員工資訊,每次提交都是2000多個參數。看著冗長的POST數據,無語了。這樣提交 先不說網站本身就慢。我提交這麼多網站會不會更慢,我的系統是不是也會更慢。
怎麼辦?
是不是有可能把分頁設置成每頁只有一條數據,然後每次翻頁,採集數據。簡單試試吧
先修改獲取列表頁面數據,把數據設置成一條每頁,此時不再採集列表中的資訊。而是記錄總共多少頁,放入隊列中,供定時任務去分頁採集數據。列表資訊通過後面的分頁數據採集。
由於網站是內部系統,為了不影響系統的正常運行,每次只採集一條資訊,等待這條資訊採集完成後,再採集下一頁資訊。
採集列表
/// <summary>
/// Starts an asynchronous GET of the employee list page for one employee index;
/// <c>RspSznyEmployeeList</c> handles the response.
/// </summary>
/// <param name="ix">Employee index; sent later as the grid page number to jump to.</param>
/// <param name="SuccessCallback">Passed through to the response handler.</param>
/// <param name="FailCallback">Passed through to the response handler.</param>
public static void ReqSznyEmployeeList(int ix, Action<string> SuccessCallback, Action<string> FailCallback)
{
    // The literal had no scheme ("//127.0.0.1/..."), which WebRequest.Create cannot turn
    // into an HttpWebRequest. NOTE(review): "http" is assumed - confirm against the site.
    string url = "http://127.0.0.1/HumanResources/EmployeeManage/EmployeeInfoList.aspx";

    HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
    req.Method = "GET";
    req.UserAgent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36";
    req.Accept = "*/*";
    req.KeepAlive = true;
    req.ServicePoint.ConnectionLimit = int.MaxValue;
    req.ServicePoint.Expect100Continue = false;
    req.CookieContainer = sznyCookie;
    req.Credentials = System.Net.CredentialCache.DefaultCredentials;

    // State layout read back by RspSznyEmployeeList: request, url, index, callbacks.
    req.BeginGetResponse(new AsyncCallback(RspSznyEmployeeList), new object[] { req, url, ix, SuccessCallback, FailCallback });
}

/// <summary>
/// Completion handler for <c>ReqSznyEmployeeList</c>. Posts back asking for one row per
/// page at page <c>ix</c>, stores that row in <c>dicSznyEmployees[ix]</c>, then builds the
/// form for the detail postback and hands it to <c>ReqSznyEmployeeInfo</c>.
/// On any exception the index is put back on the work queue and the message is logged.
/// </summary>
/// <param name="result">Async result whose state is the array built by <c>ReqSznyEmployeeList</c>.</param>
private static void RspSznyEmployeeList(IAsyncResult result)
{
    const RegexOptions Opts = RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture;
    // Form fields whose attributes use either quote style.
    const string InputsAnyQuote = @"<input[^<>]*?name=[""'](?<name>[^""']*?)[""'][^<>]*?value=[""'](?<val>[^'""]*?)[""'][^<>]*?/?>|<input[^<>]*?name=[""'](?<name>[^'""]*?)[""'][^<>]*?[""'][^<>]*?/?>|<select[^<>]*?name=[""'](?<name>[^""']*?)[""'][^<>]*?[""'][^<>]*?/?>";

    // Copies every scraped field into target unless its name is in skip.
    void MergeFields(MatchCollection fields, HashSet<string> skip, Dictionary<string, string> target)
    {
        foreach (Match f in fields)
        {
            string field = f.Groups["name"].Value.Trim();
            if (skip.Contains(field))
            {
                continue;
            }
            target[field] = f.Groups["val"].Value.Trim();
        }
    }

    object[] parms = (object[])result.AsyncState;
    HttpWebRequest req = (HttpWebRequest)parms[0];
    string url = parms[1].ToString();
    int ix = (int)parms[2];
    Action<string> SuccessCallback = parms[3] as Action<string>;
    Action<string> FailCallback = parms[4] as Action<string>;

    try
    {
        string msg;
        using (HttpWebResponse rsp = (HttpWebResponse)req.EndGetResponse(result))
        using (StreamReader reader = new StreamReader(rsp.GetResponseStream()))
        {
            msg = reader.ReadToEnd();
        }

        // First __doPostBack target on the page; sent as the grid's GridViewID field.
        Match firstTarget = Regex.Match(msg, @"__doPostBack\('(?<name>[^']*?)'", Opts);
        string name = firstTarget.Success ? firstTarget.Groups["name"].Value.Trim() : "";

        // Fixed postback target (the grid's "_SearchGo" control) instead of scraping it.
        string smname = "MdGridView_t_unitemployees_dwyg$_SearchGo";

        Dictionary<string, string> dicParms = new Dictionary<string, string>();
        dicParms.Add("ScriptManager1", $"UpdatePanel1|{smname}");
        dicParms.Add("__EVENTTARGET", smname);
        dicParms.Add("__EVENTARGUMENT", "");
        dicParms.Add("__VIEWSTATE", "");
        dicParms.Add("__EVENTVALIDATION", "");
        dicParms.Add("MdGridView_t_unitemployees_dwyg_GridViewID", $"{name}");
        dicParms.Add("MdGridView_t_unitemployees_dwyg_iCurrentPage", ix.ToString());
        dicParms.Add("MdGridView_t_unitemployees_dwyg_iTotalPage", "1");
        dicParms.Add("MdGridView_t_unitemployees_dwyg_iTotalCount", "1");
        dicParms.Add("MdGridView_t_unitemployees_dwyg_iPageSize", "1");
        dicParms.Add("MdGridView_t_unitemployees_dwyg_iPageCount", "1");
        dicParms.Add("MdGridView_t_unitemployees_dwyg_iCurrentNum", "0");
        dicParms.Add("XM", "ZXMCHECK");

        // Fields on the page that must not overwrite, or be added to, the values prepared above.
        HashSet<string> skipOnRequest = new HashSet<string>
        {
            "ScriptManager1",
            "XM",
            "MdGridView_t_unitemployees_dwyg_iCurrentNum",
            "Button_Query",
            "__EVENTTARGET",
            "__EVENTARGUMENT",
            "Button_SelQuery",
            "Button_view",
            "Button_edit",
            "Button_out",
            "ImageButton_Tx",
            "ImageButton_xx1",
            "Button_qd",
            "__ASYNCPOST",
            "MdGridView_t_unitemployees_dwyg__PageSetText",
        };
        MergeFields(Regex.Matches(msg, InputsAnyQuote, Opts), skipOnRequest, dicParms);

        // One row per page, and jump to page ix.
        dicParms["MdGridView_t_unitemployees_dwyg$_PageSetText"] = "1";
        dicParms["MdGridView_t_unitemployees_dwyg_iPageCount"] = "1";
        dicParms["MdGridView_t_unitemployees_dwyg_iPageSize"] = "1";
        dicParms["MdGridView_t_unitemployees_dwyg$_SearchTextBox"] = $"{ix}";
        // With one row per page the page count equals the record count.
        dicParms["MdGridView_t_unitemployees_dwyg_iTotalPage"] = dicParms["MdGridView_t_unitemployees_dwyg_iTotalCount"];

        msg = Post(url, dicParms, SuccessCallback, FailCallback);

        // Cells 2..14 of the row map to these keys in order.
        string[] columns =
        {
            "UserName", "PersonID", "Birthday", "Sex", "HomePhone", "TelPhone", "Mail",
            "Address", "MinZu", "AddressJiGuan", "ZhengZhiMianmao", "Paiqianshijian", "Remark",
        };

        // Each <tr name="SelectTR"> is a data row; every row found is stored under ix.
        foreach (Match row in Regex.Matches(msg, @"<tr[^<>]*?name=""SelectTR""[^<>]*?>.*?</tr>", Opts))
        {
            // Inner text of every <td> in the row.
            MatchCollection cells = Regex.Matches(row.Value, "(?<=<td[^<>]*?>).*?(?=</td>)", Opts);

            if (!dicSznyEmployees.TryGetValue(ix, out Dictionary<string, string> record))
            {
                record = new Dictionary<string, string>();
                dicSznyEmployees.Add(ix, record);
            }
            for (int c = 0; c < columns.Length; c++)
            {
                record[columns[c]] = cells[c + 2].Value.Trim().Replace(" ", "");
            }
        }

        // Build the detail postback from scratch, starting with the refreshed hidden
        // fields that the partial-postback response carries as "name|value" segments.
        dicParms.Clear();
        foreach (Match hidden in Regex.Matches(msg, @"(?<name>__VIEWSTATE)\|(?<v>[^\|]+)|(?<name>__EVENTVALIDATION)\|(?<v>[^\|]+)", Opts))
        {
            // Indexer rather than Add: a repeated match used to throw and send the
            // index down the retry path.
            dicParms[hidden.Groups["name"].Value.Trim()] = hidden.Groups["v"].Value.Trim();
        }
        dicParms["HiddenField_param"] = "";
        dicParms["__EVENTTARGET"] = "";
        dicParms["__EVENTARGUMENT"] = "";
        dicParms["MdGridView_t_unitemployees_dwyg_GridViewID"] = $"{name}";
        dicParms["MdGridView_t_unitemployees_dwyg_iCurrentPage"] = ix.ToString();
        dicParms["MdGridView_t_unitemployees_dwyg_iTotalPage"] = "1";
        dicParms["MdGridView_t_unitemployees_dwyg_iTotalCount"] = "1";
        dicParms["MdGridView_t_unitemployees_dwyg_iPageSize"] = "1";
        dicParms["MdGridView_t_unitemployees_dwyg_iPageCount"] = "1";
        dicParms["MdGridView_t_unitemployees_dwyg_iCurrentNum"] = "0";
        dicParms["XM"] = "ZXMCHECK";

        HashSet<string> skipOnResponse = new HashSet<string>
        {
            "XM",
            "__EVENTTARGET",
            "__EVENTARGUMENT",
            "Button_Query",
            "Button_SelQuery",
            "Button_edit",
            "Button_out",
            "ImageButton_Tx",
            "ImageButton_xx1",
            "Button_qd",
        };
        MergeFields(Regex.Matches(msg, InputsAnyQuote, Opts), skipOnResponse, dicParms);

        ReqSznyEmployeeInfo(ix, dicParms, SuccessCallback, FailCallback);
    }
    catch (Exception ex)
    {
        // Put the index back so it is retried, and record why it failed.
        Business.queueSznyEmployeeInfo.Enqueue(ix);
        Business.queueMsg.Enqueue($"{DateTime.Now.ToString("yyy-MM-dd HH:mm:ss")}{ex.Message}");
    }
}
獲取工號
/// <summary>
/// Posts the prepared form to the employee list page and starts reading the response
/// asynchronously; <c>RspSznyEmployeeInfo</c> extracts the employee number from it.
/// </summary>
/// <param name="ix">Employee index the request belongs to.</param>
/// <param name="dicParms">
/// Form fields. Entries whose key starts with "header" are left out of the body. When the
/// key "ScriptManager1" is present the request is sent with the ASP.NET AJAX headers.
/// </param>
/// <param name="SuccessCallback">Passed through to the response handler.</param>
/// <param name="FailCallback">Passed through to the response handler.</param>
public static void ReqSznyEmployeeInfo(int ix, Dictionary<string, string> dicParms, Action<string> SuccessCallback, Action<string> FailCallback)
{
    StringBuilder data = new StringBuilder();
    foreach (var kv in dicParms)
    {
        // Ordinal comparison: the prefix is a fixed identifier, not linguistic text.
        if (kv.Key.StartsWith("header", StringComparison.Ordinal))
        {
            continue;
        }
        data.Append($"&{Common.UrlEncode(kv.Key, Encoding.UTF8)}={Common.UrlEncode(kv.Value, Encoding.UTF8)}");
    }
    if (data.Length > 0)
    {
        data.Remove(0, 1); // drop the leading '&'
    }

    // The literal had no scheme ("//127.0.0.1/..."), which WebRequest.Create cannot turn
    // into an HttpWebRequest. NOTE(review): "http" is assumed - confirm against the site.
    HttpWebRequest req = (HttpWebRequest)WebRequest.Create("http://127.0.0.1/HumanResources/EmployeeManage/EmployeeInfoList.aspx");
    req.Method = "POST";
    req.KeepAlive = true;
    req.CookieContainer = sznyCookie;
    req.ContentType = "application/x-www-form-urlencoded";
    req.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9";
    if (dicParms.ContainsKey("ScriptManager1"))
    {
        // Partial (UpdatePanel) postback.
        req.Headers.Add("X-MicrosoftAjax", "Delta=true");
        req.Headers.Add("X-Requested-With", "XMLHttpRequest");
        req.ContentType = "application/x-www-form-urlencoded; charset=UTF-8";
        req.Accept = "*/*";
    }
    req.Headers.Add("Cache-Control", "max-age=0");
    req.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36";
    req.ServicePoint.ConnectionLimit = int.MaxValue;
    req.ServicePoint.Expect100Continue = false;
    req.AllowAutoRedirect = true;
    req.Credentials = System.Net.CredentialCache.DefaultCredentials;

    byte[] buffer = Encoding.UTF8.GetBytes(data.ToString());
    using (Stream reqStream = req.GetRequestStream())
    {
        reqStream.Write(buffer, 0, buffer.Length);
    }

    // State layout read back by RspSznyEmployeeInfo: request, index, form, callbacks.
    req.BeginGetResponse(new AsyncCallback(RspSznyEmployeeInfo), new object[] { req, ix, dicParms, SuccessCallback, FailCallback });
}

/// <summary>
/// Completion handler for <c>ReqSznyEmployeeInfo</c>. Reads the value of the
/// <c>TextBox_YG_Code_str</c> input from the response, stores it as "Code" in
/// <c>dicSznyEmployees[ix]</c> ("無" when the input is not found) and enqueues the index
/// on <c>queueSuccessEmployeeInfo</c>. On any exception the index is put back on the work
/// queue and the message is logged.
/// </summary>
/// <param name="result">Async result whose state is the array built by <c>ReqSznyEmployeeInfo</c>.</param>
private static void RspSznyEmployeeInfo(IAsyncResult result)
{
    object[] parms = (object[])result.AsyncState;
    HttpWebRequest req = (HttpWebRequest)parms[0];
    int ix = (int)parms[1];
    // parms[2] (the posted form) and parms[3]/parms[4] (the callbacks) are not used here.

    try
    {
        string msg;
        using (HttpWebResponse rsp = (HttpWebResponse)req.EndGetResponse(result))
        using (StreamReader reader = new StreamReader(rsp.GetResponseStream()))
        {
            msg = reader.ReadToEnd();
        }

        // The input may list name before value or value before name, in either quote style.
        Match m = Regex.Match(msg, @"<input[^<>]*?name\s*?=\s*?[""']TextBox_YG_Code_str[""'][^<>]*?value\s*?=\s*?[""'](?<code>[^""']*?)[""']|<input[^<>]*?value\s*?=\s*?[""'](?<code>[^""']*?)[""'][^<>]*?name\s*?=\s*?[""']TextBox_YG_Code_str[""']", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture);
        string code = m.Success ? m.Groups["code"].Value.Trim() : "無";

        // Throws KeyNotFoundException when the list step stored no row for ix; the
        // catch below then puts the index back for another attempt.
        dicSznyEmployees[ix]["Code"] = code;
        queueSuccessEmployeeInfo.Enqueue(ix);
    }
    catch (Exception ex)
    {
        Business.queueSznyEmployeeInfo.Enqueue(ix);
        Business.queueMsg.Enqueue($"{DateTime.Now.ToString("yyy-MM-dd HH:mm:ss")}{ex.Message}");
    }
}
入庫
採集到的資訊,通過定時任務保存到資料庫。
// Background writer: drains queueSuccessEmployeeInfo in batches of up to 50 records and
// saves each batch with DbAccess.AddTran. LongRunning, because the loop never ends and
// spends most of its time sleeping.
Task.Factory.StartNew(() =>
{
    while (true)
    {
        if (Business.queueSuccessEmployeeInfo.IsEmpty)
        {
            Thread.Sleep(1000); // nothing to save yet
            continue;
        }

        List<Dictionary<string, string>> lst = new List<Dictionary<string, string>>();

        // Stop at 50 records or when the queue runs dry. The TryDequeue result is
        // checked: the original ignored it, so a failed dequeue looked up index 0 and
        // the resulting exception ended this task for good.
        while (lst.Count < 50 && Business.queueSuccessEmployeeInfo.TryDequeue(out int ix))
        {
            if (Business.dicSznyEmployees.TryGetValue(ix, out Dictionary<string, string> record))
            {
                lst.Add(record);
            }
            else
            {
                Business.queueMsg.Enqueue($"{DateTime.Now:yyyy-MM-dd HH:mm:ss} no scraped record for employee index {ix}; skipped");
            }
        }

        if (lst.Count > 0)
        {
            DbAccess.AddTran(lst, "SznyEmployee", new List<string>() { "UserName", "PersonID" });
        }
        Thread.Sleep(1);
    }
}, TaskCreationOptions.LongRunning);
總結
採集的時候,為了能利用已經採集到的資訊,而不是重複採集,在採集的時候對資料庫數據進行判斷是否存在。純粹是為了提高效率,WebForm的網站真是太慢,太慢了
以前寫非同步純粹是為了提高執行緒效率,在.NET中感覺不到快樂。
終於搞定了,數據已經成功入庫了。
.NET的沒落也是有原因的,網站的速度的確是慢,.net押寶.net core的新體驗了。
我討厭採集WEBForm網站,寫了這麼久的爬蟲,祈禱永遠不要再碰到WEBFORM了。
秀一下結果 慶祝一下吧