本文是参照摩诘的Blog
今天遇到这样一个问题,从政府网站中,根据一个关键数据KeyData,提取相关数据。
这个问题可分为三部分解决:
1)取得政府网站交互的方法;
2)按照合适的方法,用HttpWebResponse取得相关数据;
3)分析取回来的数据。
第一部分:获取网站交互信息,采用工具ieHTTPHeadersSetup.exe
得到的数据如下:
GET /search.asp?key=2006002995&ys_type=hy&imageField2.x=32&imageField2.y=20 HTTP/1.1
Accept: image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, */*
Accept-Language: zh-cn
Accept-Encoding: gzip, deflate
User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)
Host: www.suzhou-logistics.com
Connection: Keep-Alive
可以看出,
url: http://www.suzhou-logistics.com/search.asp?
Data:key=2006002995&ys_type=hy&imageField2.x=32&imageField2.y=20
也可以直接作为url:http://www.suzhou-logistics.com/search.asp?key=2006002995&ys_type=hy&imageField2.x=32&imageField2.y=20
第二部分:根据第一部分的分析,通过HttpWebResponse取HTML
在此就给出一个通用的函数
 /// <summary>
 /// Sends <paramref name="postData"/> to <paramref name="url"/> as a form-encoded
 /// HTTP POST and returns the response body as text.
 /// </summary>
 /// <param name="url">Absolute URL of the page to request.</param>
 /// <param name="postData">
 /// Form body to send (for example "key=1&amp;type=x"). A null value is treated
 /// as an empty body.
 /// </param>
 /// <param name="encodeType">
 /// Name of the text encoding (for example "GB18030") used both to encode the
 /// request body and to decode the response.
 /// </param>
 /// <param name="err">
 /// Receives <see cref="string.Empty"/> on success, otherwise the message of the
 /// exception that stopped the request.
 /// </param>
 /// <returns>The response text, or <see cref="string.Empty"/> when an error occurred.</returns>
 public static string GetPage(string url, string postData, string encodeType, out string err)
 {
     HttpWebResponse response = null;
     try
     {
         // Resolving the encoding happens inside the try block so that an unknown
         // encoding name is reported through 'err' like every other failure.
         Encoding encoding = Encoding.GetEncoding(encodeType);
         byte[] data = encoding.GetBytes(postData == null ? string.Empty : postData);

         // Prepare the request.
         HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
         request.CookieContainer = new CookieContainer();
         request.AllowAutoRedirect = true;
         request.Method = "POST";
         request.ContentType = "application/x-www-form-urlencoded";
         request.ContentLength = data.Length;

         // Write the form body; the stream is closed even if the write fails.
         Stream outstream = request.GetRequestStream();
         try
         {
             outstream.Write(data, 0, data.Length);
         }
         finally
         {
             outstream.Close();
         }

         // Obtain the response and read the resulting page (HTML) in full.
         response = (HttpWebResponse)request.GetResponse();
         using (StreamReader sr = new StreamReader(response.GetResponseStream(), encoding))
         {
             string content = sr.ReadToEnd();
             err = string.Empty;
             return content;
         }
     }
     catch (Exception ex)
     {
         err = ex.Message;
         return string.Empty;
     }
     finally
     {
         // Release the connection on both the success and the failure path.
         if (response != null)
         {
             response.Close();
         }
     }
 }
第三部分:分析Html数据,有两个开源软件
SgmlReader与HtmlAgilityPack20,由于本人机器上只有vs2003,无法使用vs2005版本HtmlAgilityPack20。所以下面用SgmlReader来分析。SgmlReader可以将Html解析成格式完整的类似XML数据,可以采用Xpath进行查询,获取我们想要的数据。
取得完整的xml数据后的分析,根据post页面数据格式的不同而有区别。我取的这个页面,主要用了两个DataTable,一个保存一行基本数据,另一个保存多行的状态数据。
 /// <summary>
 /// Converts a fetched HTML page to well-formed XML with SgmlReader and extracts
 /// two tables from it by XPath: "QueryResult1" holds a single row of basic data
 /// and "QueryResult2" holds the repeating status rows.
 /// </summary>
 /// <param name="pageContent">Raw HTML text of the page.</param>
 /// <param name="xclpath">
 /// XPath selecting the header cells that become column names. When it is blank
 /// an empty <see cref="DataSet"/> (no tables) is returned.
 /// </param>
 /// <param name="xrpath">XPath selecting the data cells.</param>
 /// <param name="err">
 /// Receives <see cref="string.Empty"/> on success, otherwise the message of the
 /// exception that interrupted parsing.
 /// </param>
 /// <returns>
 /// The data set built so far; on error it may be empty or only partially filled.
 /// </returns>
 public static DataSet ParsePage(string pageContent, string xclpath, string xrpath, out string err)
 {
     // Layout of the target page: at most 17 basic-data columns, and the last
     // 5 header cells describe the status table.
     const int MaxBasicColumns = 17;
     const int StatusColumnCount = 5;
     // First cell of the status section; everything from here on is status data.
     const string StatusStartMarker = "正在预录入";

     err = string.Empty;
     DataSet ds = new DataSet();
     DataTable table = new DataTable("QueryResult1");
     DataTable table1 = new DataTable("QueryResult2");

     try
     {
         string wellFormedHTML = ConvertHtmlToXml(pageContent);

         if (xclpath.Trim().Length == 0)
         {
             return ds;
         }

         XPathDocument doc = new XPathDocument(new StringReader(wellFormedHTML));
         XPathNavigator nav = doc.CreateNavigator();

         // Pass 1: header cells define the columns of both tables.
         XPathNodeIterator nodes = nav.Select(xclpath);
         int statusHeaderStart = nodes.Count - StatusColumnCount;
         int i = 0;
         while (nodes.MoveNext())
         {
             string sNodeText = nodes.Current.Value;
             if (i < statusHeaderStart)
             {
                 // Headers past the 17th that are not status headers are ignored.
                 if (i < MaxBasicColumns)
                 {
                     AddUniqueColumn(table, sNodeText, i);
                 }
             }
             else
             {
                 AddUniqueColumn(table1, sNodeText, i);
             }
             i++;
         }

         ds.Tables.Add(table);
         ds.Tables.Add(table1);

         // Pass 2: data cells fill one basic row, then status rows of 5 cells each.
         bool bNext = false;
         nodes = nav.Select(xrpath);

         DataRow row = table.NewRow();
         table.Rows.Add(row);

         DataRow row1 = null;
         int j = 0;
         int k = 0;
         while (nodes.MoveNext())
         {
             string nodetext = nodes.Current.Value;

             // Skip cells whose text is itself a column header.
             if (table.Columns.Contains(nodetext) || table1.Columns.Contains(nodetext))
             {
                 continue;
             }

             if (!bNext && nodetext == StatusStartMarker)
             {
                 bNext = true;
             }

             if (!bNext)
             {
                 if (j < MaxBasicColumns)
                 {
                     row[j] = nodetext;
                     j++;
                 }
             }
             else
             {
                 // Start a new status row every StatusColumnCount cells.
                 if (k == 0)
                 {
                     row1 = table1.NewRow();
                     table1.Rows.Add(row1);
                 }

                 row1[k] = nodetext;
                 k = (k + 1) % StatusColumnCount;
             }
         }

         return ds;
     }
     catch (Exception exp)
     {
         err = exp.Message;
         return ds;
     }
 }

 /// <summary>Runs HTML through SgmlReader and returns it as indented, well-formed XML text.</summary>
 private static string ConvertHtmlToXml(string pageContent)
 {
     SgmlReader sgmlReader = new SgmlReader();
     sgmlReader.DocType = "HTML";
     sgmlReader.InputStream = new StringReader(pageContent);

     StringWriter strWriter = new StringWriter();
     XmlTextWriter xmlWriter = new XmlTextWriter(strWriter);
     xmlWriter.Formatting = Formatting.Indented;

     sgmlReader.Read();
     while (!sgmlReader.EOF)
     {
         xmlWriter.WriteNode(sgmlReader, true);
     }
     xmlWriter.Flush();
     xmlWriter.Close();

     return strWriter.ToString();
 }

 /// <summary>Adds a string column, appending the cell index to the name when that name is already taken.</summary>
 private static void AddUniqueColumn(DataTable target, string columnName, int index)
 {
     if (target.Columns.Contains(columnName))
     {
         columnName = columnName + index.ToString();
     }
     target.Columns.Add(columnName, typeof(string));
 }
有了上面的代码,就可以采用如下方法调用了:
 /// <summary>
 /// Click handler: fetches the query page, parses it into two tables and binds
 /// each non-empty table to its grid. Nothing is bound when fetching or parsing
 /// reports an error, or when the parse result does not contain exactly two tables.
 /// </summary>
 private void Button1_Click(object sender, System.EventArgs e)
 {
     string pageUrl = @"http://www.suzhou-logistics.com/search.asp?key=2006002995&ys_type=hy&imageField2.x=32&imageField2.y=20";

     string errorText = string.Empty;
     string html = WebForm1.GetPage(pageUrl, string.Empty, "GB18030", out errorText);
     if (errorText != string.Empty)
     {
         return;
     }

     // XPath of the header cells and of the data cells in the converted page.
     string headerPath = @"/html/table/tr/td/table/tr/td/font/strong";
     string cellPath = @"/html/table/tr/td/table/tr/td";

     DataSet result = WebForm1.ParsePage(html, headerPath, cellPath, out errorText);
     if (errorText != string.Empty || result.Tables.Count != 2)
     {
         return;
     }

     DataTable basicData = result.Tables[0];
     if (basicData.Rows.Count > 0)
     {
         DataGrid1.DataSource = basicData;
         DataGrid1.DataBind();
     }

     DataTable statusData = result.Tables[1];
     if (statusData.Rows.Count > 0)
     {
         DataGrid2.DataSource = statusData;
         DataGrid2.DataBind();
     }
 }
 
其实SgmlReader可以直接完成从URL抓取数据的功能,即将第二部分与第三部分合并。
 /// <summary>
 /// Lets SgmlReader load the HTML document at <paramref name="url"/> itself and
 /// returns it converted to XML text.
 /// </summary>
 /// <param name="baseUri">Not used by this implementation; the base URI is taken from Server.MapPath(".").</param>
 /// <param name="url">Location of the HTML document, assigned to the reader's Href.</param>
 /// <param name="log">Not used by this implementation.</param>
 /// <param name="upper">When true, names are folded to upper case.</param>
 /// <param name="formatted">When true, output is indented and the reader's whitespace handling is set to None.</param>
 /// <returns>
 /// The XML text on success; on failure, the full text of the exception
 /// (callers cannot distinguish the two by type).
 /// </returns>
 string SgmlReaderTest(Uri baseUri, string url, TextWriter log, bool upper, bool formatted)
 {
     SgmlReader r = null;
     try
     {
         r = new SgmlReader();
         r.SetBaseUri(Server.MapPath("."));
         r.DocType = "HTML";
         r.Href = url;
         if (upper)
         {
             r.CaseFolding = CaseFolding.ToUpper;
         }

         StringWriter sw = new StringWriter();
         XmlTextWriter w = new XmlTextWriter(sw);
         if (formatted)
         {
             w.Formatting = Formatting.Indented;
             r.WhitespaceHandling = WhitespaceHandling.None;
         }

         r.Read();
         while (!r.EOF)
         {
             w.WriteNode(r, true);
         }
         w.Flush();
         w.Close();
         return sw.ToString();
     }
     catch (Exception e)
     {
         return e.ToString();
     }
     finally
     {
         // The reader opens the document behind 'url'; release it on every path.
         if (r != null)
         {
             r.Close();
         }
     }
 }
                    
                 
         
             
                
            
         
         浙公网安备 33010602011771号
浙公网安备 33010602011771号