自己做Fiddler,实现http网站的抓取

---恢复内容开始---

由于工作需要,要做一个小程序来实现网站监控:当浏览器访问到目标网站时,自动抓取页面数据。由于 https 存在证书验证,需要用别的方式来处理,这部分暂时还没研究,但必须保证 https 网站仍能正常访问。

官方的 Demo 见 http://fiddler.wikidot.com/fiddlercore-demo ,我在这个基础上进行了修改。

这边我来介绍下我的Demo。

首先要去下载FiddlerCore4.Dll,百度上很多。

先上代码:

主界面代码

using System;
using System.Collections.Generic;
using System.IO;
using System.Reflection;
using System.Threading;
using Fiddler;
using System.Net;
using System.Text.RegularExpressions;
using System.Text;
using System.Diagnostics;
using System.Runtime.InteropServices;

namespace FiddlerDemo
{
    /// <summary>
    /// Console entry point: installs a native console control handler so the
    /// Fiddler shutdown code runs when the console is terminated, then starts
    /// the interactive monitoring session.
    /// </summary>
    class Program
    {
        public static Proxy oProxy;

        #region Run the shutdown code when the console is closed abnormally

        // Console control event codes documented for the Win32 HandlerRoutine callback.
        private const int CtrlCEvent = 0;
        private const int CtrlBreakEvent = 1;
        private const int CtrlCloseEvent = 2;
        private const int CtrlLogoffEvent = 5;
        private const int CtrlShutdownEvent = 6;

        /// <summary>Signature of the callback passed to <c>SetConsoleCtrlHandler</c>.</summary>
        /// <param name="CtrlType">Code of the console control event that was raised.</param>
        /// <returns>The value handed back to the operating system.</returns>
        public delegate bool ControlCtrlDelegate(int CtrlType);

        [DllImport("kernel32.dll")]
        private static extern bool SetConsoleCtrlHandler(ControlCtrlDelegate HandlerRoutine, bool Add);

        // Held in a static field so the delegate stays referenced for as long
        // as native code may call back into it.
        private static ControlCtrlDelegate cancelHandler = new ControlCtrlDelegate(HandlerRoutine);

        /// <summary>
        /// Console control callback. Runs <see cref="DoQuit"/> for every event
        /// that ends the process (Ctrl+C, Ctrl+Break, window close, logoff,
        /// shutdown), not only for Ctrl+C and window close.
        /// </summary>
        /// <param name="CtrlType">Code of the console control event.</param>
        /// <returns>
        /// Always <c>false</c>, so the event is reported as not handled here.
        /// </returns>
        public static bool HandlerRoutine(int CtrlType)
        {
            switch (CtrlType)
            {
                case CtrlCEvent:
                case CtrlBreakEvent:
                case CtrlCloseEvent:
                case CtrlLogoffEvent:
                case CtrlShutdownEvent:
                    DoQuit();
                    break;
            }
            return false;
        }
        #endregion

        /// <summary>
        /// Registers the console control handler and runs the monitoring
        /// session until <c>StartSession</c> returns.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            SetConsoleCtrlHandler(cancelHandler, true);
            FiddlerHelp fh = new FiddlerHelp();
            fh.StartSession();
        }

        /// <summary>Handler with the <c>Console.CancelKeyPress</c> signature; shuts Fiddler down.</summary>
        static void Console_CancelKeyPress(object sender, ConsoleCancelEventArgs e)
        {
            DoQuit();
        }

        /// <summary>
        /// Disposes <see cref="oProxy"/> when it is set, shuts FiddlerCore down
        /// and then sleeps for 500 ms before returning.
        /// </summary>
        public static void DoQuit()
        {
            WriteCommandResponse("Shutting down...");
            if (null != oProxy)
            {
                oProxy.Dispose();
            }
            Fiddler.FiddlerApplication.Shutdown();
            Thread.Sleep(500);
        }

        /// <summary>Writes <paramref name="s"/> to the console in yellow, then restores the previous colour.</summary>
        /// <param name="s">Text to print.</param>
        public static void WriteCommandResponse(string s)
        {
            ConsoleColor oldColor = Console.ForegroundColor;
            Console.ForegroundColor = ConsoleColor.Yellow;
            Console.WriteLine(s);
            Console.ForegroundColor = oldColor;
        }
    }
}

处理类:

using System;
using System.Collections.Generic;
using System.Configuration;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Threading.Tasks;
using System.Xml;
using Fiddler;


namespace FiddlerDemo
{
    /// <summary>
    /// One monitoring rule read from WatchHtml.xml.
    /// </summary>
    public class HtmlConfig
    {
        /// <summary>Text looked for in a response's full URL; the rule applies when the URL contains it.</summary>
        public string sWeb { get; set; }

        /// <summary>Regular expression run against the response body of a matching URL.</summary>
        public string sRegex { get; set; }
    }

    /// <summary>
    /// Starts FiddlerCore, prints the part of each watched response that
    /// matches the configured regular expression, and runs a small
    /// interactive command loop on the console.
    /// </summary>
    public class FiddlerHelp
    {
        public static Proxy oProxy;

        /// <summary>Rules loaded by <see cref="LoadHtmlConfig"/>.</summary>
        public static List<HtmlConfig> dicHtml = new List<HtmlConfig> { };

        /// <summary>
        /// Loads the rules, wires the FiddlerCore event handlers, starts the
        /// proxy on port 8877 and blocks in the command loop until the user
        /// presses Q.
        /// </summary>
        public void StartSession()
        {
            LoadHtmlConfig();

            // Set the display name of this application.
            Fiddler.FiddlerApplication.SetAppDisplayName("FiddlerCoreDemoApp");

            // Shared between the event handlers and the command loop; every
            // access goes through lock (oAllSessions).
            List<Fiddler.Session> oAllSessions = new List<Fiddler.Session>();

            Fiddler.FiddlerApplication.OnNotification += delegate(object sender, NotificationEventArgs oNEA)
            {
                Console.WriteLine("** NotifyUser: " + oNEA.NotifyString);
            };

            Fiddler.FiddlerApplication.BeforeRequest += delegate(Fiddler.Session oS)
            {
                // Buffering must be enabled so that the response can be
                // inspected/modified in the BeforeResponse handler instead of
                // being streamed to the client as it arrives.
                oS.bBufferResponse = true;
                lock (oAllSessions)
                {
                    oAllSessions.Add(oS);
                }
            };

            // The OnReadResponseBuffer event would expose the raw response
            // buffer; it is intentionally not used here.
            Fiddler.FiddlerApplication.BeforeResponse += delegate(Fiddler.Session oS)
            {
                // NOTE(review): presumably removes content/transfer encoding so
                // the body can be read as text - confirm against FiddlerCore docs.
                oS.utilDecodeResponse();
                foreach (var item in dicHtml)
                {
                    if (oS.fullUrl.Contains(item.sWeb))
                    {
                        Console.WriteLine("{0}:HTTP {1} for {2}", oS.id, oS.responseCode, oS.fullUrl);
                        string sHtmlBody = oS.GetResponseBodyAsString();
                        if (!string.IsNullOrEmpty(sHtmlBody))
                        {
                            Console.Write("获取的内容为:" + MatchRegex(sHtmlBody, item.sRegex) + "\n");
                        }
                    }
                }
                // The session was already recorded in BeforeRequest; adding it
                // again here would store every session twice and double the
                // count shown in the console title.
            };

            Fiddler.FiddlerApplication.AfterSessionComplete += delegate(Fiddler.Session oS)
            {
                int iCount;
                lock (oAllSessions)
                {
                    iCount = oAllSessions.Count;
                    // Clear the list once it holds more than 1000 entries.
                    if (iCount > 1000)
                    {
                        oAllSessions.Clear();
                    }
                }
                Console.Title = ("Session list contains: " + iCount.ToString() + " sessions");
            };

            Console.CancelKeyPress += new ConsoleCancelEventHandler(Console_CancelKeyPress);
            FiddlerApplication.Prefs.SetBoolPref("fiddler.network.streaming.abortifclientaborts", true);

            Fiddler.CONFIG.IgnoreServerCertErrors = false;
            int iPort = 8877;
            // NOTE(review): the three flags are presumably (register as system
            // proxy, decrypt SSL, allow remote clients) - confirm against the
            // FiddlerCore Startup overload.
            Fiddler.FiddlerApplication.Startup(iPort, true, false, true);

            bool bDone = false;
            #region Interactive commands
            do
            {
                Console.WriteLine("\nEnter a command [C=Clear; L=List; G=Collect Garbage; W=write SAZ; R=read SAZ;\n\tS=Toggle Forgetful Streaming; T=Trust Root Certificate; Q=Quit]:>");
                ConsoleKeyInfo cki = Console.ReadKey();
                Console.WriteLine();
                switch (Char.ToLower(cki.KeyChar))
                {
                    case 'c':
                        lock (oAllSessions)
                        {
                            oAllSessions.Clear();
                        }
                        WriteCommandResponse("Clear...");
                        FiddlerApplication.Log.LogString("Cleared session list.");
                        break;

                    case 'd':
                        FiddlerApplication.Log.LogString("FiddlerApplication::Shutdown.");
                        FiddlerApplication.Shutdown();
                        break;

                    // 'l' (list sessions) is advertised in the prompt but not implemented.

                    case 'g':
                        Console.WriteLine("Working Set:\t" + Environment.WorkingSet.ToString("n0"));
                        Console.WriteLine("Begin GC...");
                        GC.Collect();
                        Console.WriteLine("GC Done.\nWorking Set:\t" + Environment.WorkingSet.ToString("n0"));
                        break;

                    case 'q':
                        bDone = true;
                        DoQuit();
                        break;

                    case 'r':
                        WriteCommandResponse("This demo was compiled without SAZ_SUPPORT defined");
                        break;

                    case 'w':
                        WriteCommandResponse("This demo was compiled without SAZ_SUPPORT defined");
                        break;

                    case 't':
                        try
                        {
                            WriteCommandResponse("Result: " + Fiddler.CertMaker.trustRootCert().ToString());
                        }
                        catch (Exception eX)
                        {
                            WriteCommandResponse("Failed: " + eX.ToString());
                        }
                        break;

                    // Forgetful streaming
                    case 's':
                        bool bForgetful = !FiddlerApplication.Prefs.GetBoolPref("fiddler.network.streaming.ForgetStreamedData", false);
                        FiddlerApplication.Prefs.SetBoolPref("fiddler.network.streaming.ForgetStreamedData", bForgetful);
                        Console.WriteLine(bForgetful ?
                            "FiddlerCore will immediately dump streaming response data." :
                            "FiddlerCore will keep a copy of streamed response data.");
                        break;
                }
            } while (!bDone);
            #endregion
        }

        /// <summary>
        /// Reads the watched sites and their extraction regular expressions
        /// from WatchHtml.xml and appends them to <see cref="dicHtml"/>.
        /// </summary>
        /// <exception cref="XmlException">
        /// The file is not well-formed XML or has no <c>Root</c> element.
        /// </exception>
        private void LoadHtmlConfig()
        {
            XmlDocument xmlDoc = new XmlDocument();

            // Prefer the copy two directories up (debug layout); otherwise use
            // the file in the current directory (build output).
            string sPath = File.Exists(@"..\..\WatchHtml.xml")
                ? @"..\..\WatchHtml.xml"
                : @"WatchHtml.xml";

            // The document must be loaded before it can be queried; without
            // this call SelectSingleNode always returns null.
            xmlDoc.Load(sPath);

            XmlNode xn = xmlDoc.SelectSingleNode("Root");
            if (xn == null)
            {
                throw new XmlException("No <Root> element found in " + sPath);
            }

            foreach (XmlNode item in xn.ChildNodes)
            {
                // Skip comments and other non-element nodes.
                XmlElement xe = item as XmlElement;
                if (xe == null)
                {
                    continue;
                }

                dicHtml.Add(new HtmlConfig
                {
                    sWeb = xe.GetAttribute("Web"),
                    sRegex = xe.GetAttribute("Regex")
                });
            }
        }

        /// <summary>Runs <see cref="DoQuit"/> when Console.CancelKeyPress is raised.</summary>
        public void Console_CancelKeyPress(object sender, ConsoleCancelEventArgs e)
        {
            DoQuit();
        }

        /// <summary>
        /// Quit: disposes <see cref="oProxy"/> when it is set, shuts
        /// FiddlerCore down and then sleeps for 500 ms.
        /// </summary>
        public void DoQuit()
        {
            if (null != oProxy)
            {
                oProxy.Dispose();
            }
            Fiddler.FiddlerApplication.Shutdown();
            Thread.Sleep(500);
        }

        /// <summary>Writes <paramref name="s"/> to the console in yellow and restores the previous colour.</summary>
        /// <param name="s">Text to print.</param>
        public static void WriteCommandResponse(string s)
        {
            ConsoleColor oldColor = Console.ForegroundColor;
            Console.ForegroundColor = ConsoleColor.Yellow;
            try
            {
                Console.WriteLine(s);
            }
            finally
            {
                // Restore the colour even if the write fails.
                Console.ForegroundColor = oldColor;
            }
        }

        /// <summary>
        /// Returns the first match of <paramref name="sRegex"/> in the page
        /// text, after CR/LF characters are replaced by spaces and the text
        /// is trimmed.
        /// </summary>
        /// <param name="sHtml">The HTML page that was captured.</param>
        /// <param name="sRegex">The regular expression to apply.</param>
        /// <returns>
        /// The matched text; an empty string when nothing matches;
        /// <c>null</c> when <paramref name="sHtml"/> is null/empty or the
        /// pattern is null or invalid.
        /// </returns>
        public static string MatchRegex(string sHtml, string sRegex)
        {
            if (string.IsNullOrEmpty(sHtml))
            {
                return null;
            }

            try
            {
                string sFlattened = sHtml.Replace('\r', ' ').Replace('\n', ' ').Trim();
                Match result = Regex.Match(sFlattened, sRegex, RegexOptions.IgnoreCase | RegexOptions.Multiline);
                return result.Success ? result.Value : string.Empty;
            }
            catch (ArgumentException)
            {
                // Null or malformed pattern (ArgumentNullException derives
                // from ArgumentException): report "no result" to the caller.
                return null;
            }
        }
    }
}

XML文件内容(原文此处为图片。根据上面的代码,WatchHtml.xml 的根节点为 Root,它的每个子节点用 Web 属性指定要监听的网址片段,用 Regex 属性指定提取数据的正则表达式。)

实现效果

 

 

 

 

 

 

---恢复内容结束---

posted @ 2017-06-15 12:02  在今朝  阅读(4456)  评论(0编辑  收藏  举报