在WebForm中用WebRequest\WebClient\WebBrowser获取远程页面源码的三种方式（downmoon）

一个小需求，获取远程页面的源码，主要用于抓数据。原来用的好好的，最近突然不能获取页面源码了，但是仍然可以用浏览器正常浏览。（文后附源码下载。^_^）

　　经过分析，原来用的代码如下：

// Method-body fragment: requests the page at Url with HttpWebRequest and returns its
// text decoded with `encoding`. Returns string.Empty when the status is not 200 OK and
// null when the request fails with a WebException.
// No User-Agent / Accept headers are set in this version.
string result = string.Empty;

try
{
    HttpWebRequest httpWebRequest = (HttpWebRequest)WebRequest.Create(Url);

    // Left disabled as in the original.
    //httpWebRequest.Timeout = 20;

    httpWebRequest.KeepAlive = false;

    // `using` closes the response and the reader on every path, including the
    // non-OK path where the original never closed the response.
    using (HttpWebResponse httpWebResponse = (HttpWebResponse)httpWebRequest.GetResponse())
    {
        if (httpWebResponse.StatusCode != HttpStatusCode.OK)
        {
            return result;
        }

        using (StreamReader sreader = new StreamReader(httpWebResponse.GetResponseStream(), encoding))
        {
            result = sreader.ReadToEnd();
            return result;
        }
    }
}
catch (WebException)
{
    // Request-level failure: callers receive null.
    return null;
}

查了下资料，原来需要加参数。
　　　　　　#region Key request headers - without them no content is returned
                // UserAgent takes only the header VALUE; the "User-Agent" header name is
                // supplied by HttpWebRequest, so it must not be repeated inside the string.
                httpWebRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705;)";
                httpWebRequest.Accept = "*/*";
                httpWebRequest.KeepAlive = true;
                httpWebRequest.Headers.Add("Accept-Language", "zh-cn,en-us;q=0.5");
                #endregion

修正后的代码如下：

Code

/// <summary>
/// Placeholder for re-dialing the network connection. The redial loop is disabled
/// (kept below for reference), so calling this method currently does nothing.
/// </summary>
public static void ReDial()
{
    // Disabled redial loop:
    ////int res = 1;
    ////while (res != 0)
    ////{
    ////    CSDNWebTest.RASDisplay ras = new RASDisplay();
    ////    ras.Disconnect();
    ////    res = ras.Connect("asdl");
    ////    System.Threading.Thread.Sleep(TimeSpan.FromSeconds(10));
    ////}
}

问题是解决了，后来再想了想，可以用WebClient先把页面download到本地临时文件，再读取文本内容。

代码如下：

/// <summary>
/// Downloads the page at <paramref name="url"/> to a temporary file with WebClient and
/// returns the text of that file (decoded with the system default encoding).
/// </summary>
/// <param name="url">Page address; "http://" is prepended when it has no http/https scheme.</param>
/// <returns>
/// The page text, or null when <paramref name="url"/> is null, empty or "about:blank",
/// or when the downloaded file cannot be read.
/// </returns>
private string GetPageByWebClient(string url)
{
    if (string.IsNullOrEmpty(url) || url.Equals("about:blank"))
    {
        return null;
    }

    if (!url.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
        && !url.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
    {
        url = "http://" + url;
    }

    // Use the temp directory instead of the root of C:\.
    string tempFile = Path.Combine(Path.GetTempPath(), RandomKey(1111, 9999) + ".txt");

    try
    {
        // Download the page URL itself. DownloadOneFileByURLWithWebClient requests
        // url + fileName, which would append the temp file name to the page address.
        using (System.Net.WebClient client = new System.Net.WebClient())
        {
            client.DownloadFile(url, tempFile);
        }

        try
        {
            using (StreamReader reader = new StreamReader(tempFile, System.Text.Encoding.Default))
            {
                return reader.ReadToEnd();
            }
        }
        catch (IOException)
        {
            // Best effort, as in the original: an unreadable file yields null.
            return null;
        }
    }
    finally
    {
        // The file is only a staging area; do not leave it behind.
        if (File.Exists(tempFile))
        {
            File.Delete(tempFile);
        }
    }
}

/// <summary>
/// Builds a key made of the current local time formatted as "yyyyMMdd-HHmmss-fff-"
/// followed by the number returned by getRandomID(b, e).
/// </summary>
/// <param name="b">First argument passed to getRandomID.</param>
/// <param name="e">Second argument passed to getRandomID.</param>
/// <returns>The timestamp prefix concatenated with the random number.</returns>
private string RandomKey(int b, int e)
{
    string timestampPrefix = DateTime.Now.ToString("yyyyMMdd-HHmmss-fff-");
    int randomSuffix = this.getRandomID(b, e);
    return timestampPrefix + randomSuffix;
}

// One generator shared by all calls. Creating a new time-seeded Random per call can
// hand back the same value to calls made in quick succession.
private static readonly Random s_sharedRandom = new Random();

/// <summary>
/// Returns a random integer in the range [minValue, maxValue).
/// </summary>
/// <param name="minValue">Inclusive lower bound.</param>
/// <param name="maxValue">Exclusive upper bound; must not be less than <paramref name="minValue"/>.</param>
/// <returns>A value v with minValue &lt;= v &lt; maxValue (minValue itself when both bounds are equal).</returns>
/// <exception cref="ArgumentOutOfRangeException">minValue is greater than maxValue.</exception>
private int getRandomID(int minValue, int maxValue)
{
    // Random instances are not safe for concurrent use, so serialize access.
    lock (s_sharedRandom)
    {
        return s_sharedRandom.Next(minValue, maxValue);
    }
}

/// <summary>
/// Gets a newly generated GUID in its default string form; every read produces a new value.
/// </summary>
private string GuidString
{
    get
    {
        Guid fresh = Guid.NewGuid();
        return fresh.ToString();
    }
}

　　　　/// <summary>
/// Downloads one file with WebClient (intended only for small files such as pictures).
/// The remote address is <paramref name="url"/> + <paramref name="fileName"/>; the file
/// is saved as <paramref name="fileName"/> inside <paramref name="localPath"/>.
/// </summary>
/// <param name="fileName">File name; appended to the URL and used as the local file name.</param>
/// <param name="url">Base address the file name is appended to.</param>
/// <param name="localPath">Local target directory; created when it does not exist.</param>
public static void DownloadOneFileByURLWithWebClient(string fileName, string url, string localPath)
{
    // Path.Combine inserts the directory separator when localPath lacks a trailing one.
    string targetFile = Path.Combine(localPath, fileName);

    if (!Directory.Exists(localPath))
    {
        Directory.CreateDirectory(localPath);
    }

    if (File.Exists(targetFile))
    {
        File.Delete(targetFile);
    }

    // WebClient is disposable; release it as soon as the download finishes.
    using (System.Net.WebClient wc = new System.Net.WebClient())
    {
        wc.DownloadFile(url + fileName, targetFile);
    }
}

结果不能获取源码。错误如下：

再想想，还有WebBrowser控件可以用啊。在WinForm下只要在主线程（Main方法）前加[STAThread]即可。

/// <summary>
/// Loads the address typed into txtUrl in an off-screen WebBrowser control and shows the
/// page's HTML in lbResult. Does nothing when the address is empty, "about:blank" or
/// not a well-formed absolute URI, or when the page does not finish loading in time.
/// </summary>
/// <remarks>
/// NOTE(review): the attribute below is kept from the original; STAThread is normally
/// applied to the application's entry point (Main) - confirm it is present there too.
/// </remarks>
[STAThread]
public void GetURLContentByWebBrowser()
{
    string url = txtUrl.Text.Trim();

    if (String.IsNullOrEmpty(url) || url.Equals("about:blank"))
    {
        return;
    }

    if (!url.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
        && !url.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
    {
        url = "http://" + url;
    }

    // Validate without exceptions; the original swallowed UriFormatException here.
    Uri target;
    if (!Uri.TryCreate(url, UriKind.Absolute, out target))
    {
        return;
    }

    using (WebBrowser wb = new WebBrowser())
    {
        wb.Navigate(target);

        // Navigate returns before the document is loaded, so DocumentText is not ready
        // yet. Pump messages until the control reports Complete, with an upper bound so
        // a page that never finishes cannot hang the caller.
        DateTime deadline = DateTime.UtcNow.AddSeconds(30);
        while (wb.ReadyState != WebBrowserReadyState.Complete)
        {
            if (DateTime.UtcNow > deadline)
            {
                return;
            }

            System.Windows.Forms.Application.DoEvents();
        }

        lbResult.Text = wb.DocumentText;
    }

    // No catch/rethrow wrapper: exceptions propagate with their original stack trace
    // (the former `throw ex;` reset it).
}

在WebForm中就麻烦些了，运行时会出现错误：“线程不在单线程单元中，故无法实例化 ActiveX 控件‘8856f961-340a-11d0-a96b-00c04fd705a2’”。

代码如下：

/// <summary>
/// Loads <paramref name="url"/> in an off-screen WebBrowser control, waits for the
/// document to complete and returns its HTML, decoded with the document's own encoding.
/// </summary>
/// <param name="url">Page address; "http://" is prepended when it has no http/https scheme.</param>
/// <returns>
/// The page HTML, or null when <paramref name="url"/> is null, empty or "about:blank",
/// when the page does not finish loading within the time limit, or when the document
/// stream cannot be read.
/// </returns>
private string GetPageStringbyWebBrowser(string url)
{
    if (string.IsNullOrEmpty(url) || url.Equals("about:blank"))
    {
        return null;
    }

    if (!url.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
        && !url.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
    {
        url = "http://" + url;
    }

    // `using` disposes the control on every path, including when Navigate throws.
    using (WebBrowser myWB = new WebBrowser())
    {
        myWB.ScrollBarsEnabled = false;
        myWB.Navigate(url);

        // Pump messages until the document is complete; the deadline keeps a page that
        // never finishes loading from blocking the request forever.
        DateTime deadline = DateTime.UtcNow.AddSeconds(30);
        while (myWB.ReadyState != WebBrowserReadyState.Complete)
        {
            if (DateTime.UtcNow > deadline)
            {
                return null;
            }

            System.Windows.Forms.Application.DoEvents();
        }

        try
        {
            using (System.IO.StreamReader getReader = new System.IO.StreamReader(
                myWB.DocumentStream,
                System.Text.Encoding.GetEncoding(myWB.Document.Encoding)))
            {
                return getReader.ReadToEnd();
            }
        }
        catch (Exception)
        {
            // Best effort, as in the original: any failure while reading yields null.
            return null;
        }
    }
}

后来搜索N小时(N>=5)后，终于找到可行解决方案,在WebPage页面头部加入AspCompat="true"

即<%@ Page Language="C#" AspCompat="true" ****** %>

MSDN给出的解释是：
在 ASP .NET 网页的 <%@Page> 标记中包含兼容性属性 aspcompat=true，如 <%@Page aspcompat=true Language=VB%>。使用此属性将强制网页以 STA 模式执行，从而确保您的组件可以继续正确运行。如果试图使用 STA 组件但没有指定此标记，运行时将会发生异常情况。

将此属性的值设置为 true 时，将允许网页调用 COM+ 1.0 组件，该组件需要访问非管理的 ASP 内置对象。可以通过 ObjectContext 对象进行访问。

如果将此标记的值设为 true，性能会稍微有些下降。建议只在确实需要时才这样做。

终于可以了！　不知道有没有更好的方法？？

附：源码下载。

邀月注：

如果不能测试，请注意是否在域（AD）环境下。如果是，请注意设置代理和防火墙。
请参考：
http://dev.csdn.net/article/83914.shtm

或http://blog.csdn.net/downmoon/archive/2006/04/14/663337.aspx

或http://www.cnblogs.com/downmoon/archive/2007/12/29/1019701.html

邀月注：本文版权由邀月和博客园共同所有，转载请注明出处。

posted @ 2010-01-22 02:59 cwchyt 阅读(1021) 评论(0) 收藏举报

刷新页面返回顶部

c# 学习

专注学习c#

在webForm中WebRequest\WebClient\WebBrowser获取远程页面源码的三种方式（downmoon)

公告