// 数据采集 [即与 WEB 相关的功能函数]

// --

using System;
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Net;
using System.Text.RegularExpressions;

namespace ToolLibrary
{
    /// <summary>
    /// 网络爬虫[数据采集] [即与 WEB 相关的功能函数]
    /// [wzrong 2008-11-06] 
    /// QQ:120152169
    /// Email:w_zrong@163.com
    /// </summary>
    public class WebCrawler
    {

        #region 根据网站地址(URL)获取整站的 HTML

        /// <summary>
        /// 根据网站地址(URL)获取整站的 HTML
        /// </summary>
        /// <param name="urlPath">网站地址(URL)</param>
        /// <returns>整站的 HTML</returns>
        public static string GetHtmlContentsByUrl(string urlPath)
        {
            string returnStr = string.Empty;
            try
            {
                WebClient client = new WebClient(); //向URL标识的资源发送数据和从URL标识的资源接收数据

                returnStr = client.DownloadString(urlPath);//以字符串的形式下载资源

                client.Dispose();
            }
            catch 
            {
                returnStr = "";
            }

 

            return returnStr;
        }

        #endregion

        #region 根据正则表达式(手动配置表达式)获取指定信息 返回 ArrayList

        /// <summary>
        /// 根据正则表达式(手动配置表达式)获取指定信息 返回 ArrayList
        /// </summary>
        /// <param name="htmlSource">HTML源码</param>
        /// <param name="strRegex">正则表达式(手动配置表达式)</param>
        /// <param name="isRightToLeft">是否从右向左匹配?true:false</param>
        /// <returns>指定信息集合</returns>
        public static ArrayList GetHtmlArrayByRegex(string htmlSource, string strRegex, bool isRightToLeft)
        {
            ArrayList array = new ArrayList();
            Regex rex;

            string html = htmlSource.Replace("\r\n", "").Replace("\r", "").Replace("\t", "");

            if (isRightToLeft)
            {
                rex = new Regex(strRegex, RegexOptions.IgnoreCase | RegexOptions.Compiled | RegexOptions.RightToLeft);
            }
            else
            {
                rex = new Regex(strRegex, RegexOptions.IgnoreCase | RegexOptions.Compiled);
            }

            MatchCollection mc = rex.Matches(html); //迭代匹配 (在指定的字符串中搜索正则表达式的所有匹配)

            foreach (Match m in mc)
            {
                string matchStr = m.Groups[1].ToString().Trim(); //获取由正则表达式匹配的组的集合
                array.Add(matchStr);
            }
            return array;
        }

        #endregion

        #region 根据正则表达式(起始标志,结束标志)获取指定的信息 返回 ArrayList

        /// <summary>
        /// 根据正则表达式(起始标志,结束标志)获取指定的信息 返回 ArrayList
        /// </summary>
        /// <param name="htmlSource">HTML源码</param>
        /// <param name="startRex">起始标志</param>
        /// <param name="endRex">结束标志</param>
        /// <param name="isRightToLeft">是否从右向左匹配?true:false</param>
        /// <returns>指定信息集合</returns>
        public static ArrayList GetHtmlArrayByRegex(string htmlSource, string startRex, string endRex, bool isRightToLeft)
        {
            string returnRex = startRex + "(.*?)" + endRex;
            return GetHtmlArrayByRegex(htmlSource, returnRex, false);
        }

        #endregion

        #region 根据正则表达式(起始标志,结束标志)获取指定的信息 返回字符串 String

        /// <summary>
        /// 根据正则表达式(起始标志,结束标志)获取指定的信息 返回字符串 String
        /// </summary>
        /// <param name="htmlSource">HTML源码</param>
        /// <param name="startRex">起始标志</param>
        /// <param name="endRex">结束标志</param>
        /// <param name="isRightToLeft">是否从右向左匹配?true:false</param>
        /// <returns>指定信息的字符串</returns>
        public static String GetHtmlStrByRegex(string htmlSource, string startRex, string endRex, bool isRightToLeft)
        {
            //string returnStr = string.Empty;
            StringBuilder sb = new StringBuilder();

            string regexStr = startRex + "(.*?)" + endRex;
           
            ArrayList array = new ArrayList();
            array = GetHtmlArrayByRegex(htmlSource, regexStr, false);

            for (int i = 0; i < array.Count; i++)
            {
                //returnStr = array[i].ToString();
                sb.Append(array[i].ToString());

            }
            //return returnStr;
            return sb.ToString();
        }

        #endregion

        #region 得到分页的连接地址

        /// <summary>
        /// 得到分页连接地址
        /// </summary>
        /// <param name="oldPageUrl">原连接地址</param>
        /// <param name="PageTags">分页标签</param>
        /// <returns>分页连接地址</returns>
        public static String GetPageUrl(string oldPageUrl, string PageTags)
        {
            string newPageUrl = string.Empty;

           
            return newPageUrl;
        }

        #endregion

        #region 得到网页图片的地址[多个则用分割符隔开累加]

        /// <summary>
        /// 得到网页图片地址 [多个则用分割符隔开累加]
        /// </summary>
        /// <param name="html">包含图片的 HTML 代码</param>
        /// <returns>图片地址</returns>
        public static String GetHtmlImgUrl(string html)
        {
            string returnStr = "";

            ArrayList array = new ArrayList();

            array = GetHtmlArrayByRegex(html, "src=\"", "\"", false);

            for (int i = 0; i < array.Count; i++)
            {
                if (i == 0)
                {
                    returnStr = array[i].ToString();
                }
                else
                {
                    returnStr = array[i].ToString() + Common.CommonConst.GAP_CHAR1 + returnStr;
                }
            }

            return returnStr;
        }

        #endregion

        #region 得到有效的连接地址 (对不包含域名的地址加上域名)

        /// <summary>
        /// 得到有效的连接地址 (对不包含域名的地址加上域名)
        /// </summary>
        /// <param name="oldUrl">原始地址</param>
        /// <param name="domainUrl">域名地址 如http://www.baidu.com/ </param>
        /// <returns>有效的连接地址</returns>
        public static String GetValidUrl(string oldUrl, string domainUrl)
        {
            string newUrl = oldUrl;

            string http = "http://";

            if (!oldUrl.Contains(http))
            {
                if (oldUrl.StartsWith("/") && domainUrl.EndsWith("/"))
                {
                    newUrl = domainUrl.Remove(domainUrl.Length - 1, 1) + oldUrl;
                }
                else if (!oldUrl.StartsWith("/") && domainUrl.EndsWith("/"))
                {
                    newUrl = domainUrl + "/" + oldUrl;
                }
                else
                {
                    newUrl = domainUrl + oldUrl;
                }
            }
            return newUrl;
        }

        #endregion

        #region 获取文件后缀和文件名称

        /// <summary>
        /// 获取文件后缀和文件名称
        /// 如果文件字符串连同路径传值,则返回文件名也包含路径
        /// </summary>
        /// <param name="fileStr">文件字符串[名称和后缀(可以包含路径) 如:txtName.txt]</param>
        /// <param name="splitChr">分割符 如.</param>
        /// <param name="fileName">文件名</param>
        /// <param name="suffix">后缀</param>
        public static void GetFileNameAndSuffix(string fileStr, char splitChr, out string fileName, out string suffix)
        {
            if (fileStr.Trim() == string.Empty)
            {
                fileName = suffix = "";
                return;
            }
            if (!fileStr.Contains(splitChr))
            {
                fileName = suffix = "";
                return;
            }

            int index = fileStr.LastIndexOf(splitChr);

            fileName = fileStr.Substring(0, index);

            suffix = fileStr.Substring(index + 1);
        }

        #endregion

        #region 获取网络图片命名名称和后缀 [如:命名名称.jpg]

        /// <summary>
        /// 获取网络图片命名名称和后缀 [如:命名名称.jpg]
        /// </summary>
        /// <param name="imgUrl">网络图片连接地址</param>
        /// <param name="isOverWriteName">是否从写图片名称?true:false</param>
        /// <returns>图片命名名称和后缀</returns>
        public static string GetImgNameAndSuffix(string imgUrl, bool isOverWriteName)
        {
            //例如:/images/bg7.jpg
            string imgName = "";

            if (imgUrl.Contains("/"))
            {
                imgName = imgUrl.Substring(imgUrl.LastIndexOf("/") + 1);
            }
            else
            {
                imgName = imgUrl;
            }

            //重写图片名称
            if (isOverWriteName)
            {
                string fileName, sufFix;
                GetFileNameAndSuffix(imgUrl, '.', out fileName, out sufFix);
                imgName = DateTime.Now.ToString("yyMMddhhmmss") + DateTime.Now.Millisecond.ToString() + "." + sufFix;
            }

            return imgName;
        }

        #endregion

        #region 从网络上下载图片到本地服务器

        /// <summary>
        /// 从网络上下载图片到本地服务器
        /// </summary>
        /// <param name="imgUrl">网络图片的连接地址 </param>
        /// <param name="imgSavePath">要接收数据的本地文件名称</param>
        /// <param name="domainUrl">域名地址 如: http://www.baidu.com </param>
        public static void DownLoadImgToLocal(string imgUrl, string imgSavePath)
        {
            try
            {
                WebClient client = new WebClient();
                client.DownloadFile(imgUrl, imgSavePath);
                client.Dispose();
            }
            catch {

            }
        }

        #endregion

        #region 重写显示图片的 HTML 代码 <img />

        /// <summary>
        /// 重写图片显示的HTML代码 返回格式:[img SRC="imgPath" ALT="imgTitle" /]
        /// </summary>
        /// <param name="imgOldHtml">原始IMG显示的HTML代码</param>
        /// <param name="imgNewSavePath">图片存放新地址/路径</param>
        /// <param name="imgTitle">图片标题</param>
        /// <returns>返回格式:src="imgPath" alt="imgTitle"</returns>
        public static string OverWriteImgUrlInHtml(string imgOldHtml, string imgNewSavePath, string imgTitle)
        {
            string returnStr = "";

            string imgPath = ""; //img本地存放路径

            string imgUrl = GetHtmlImgUrl(imgOldHtml); //img网络连接地址

            string imgName = GetImgNameAndSuffix(imgUrl, false); // img名称

            if (imgNewSavePath.EndsWith("/"))
            {
                imgPath = imgNewSavePath;
            }
            else
            {
                imgPath = imgNewSavePath + "/";
            }

            returnStr = "<img src=\"" + imgPath + imgName + "\" alt=\"" + imgTitle + "\">" + "<br/> ";
            return returnStr;
        }

        #endregion

        #region 将 HTML 中的图片地址替换成本地地址 并将其下载到本地服务器中 返回改写图片地址后的 HTML 文本

        /// <summary>
        /// 将 HTML 中的图片地址替换成本地地址
        /// 并将其下载到本地服务器中
        /// 返回改写图片地址后的 HTML 文本
        /// </summary>
        /// <param name="htmlSource">原始 HTML 文本串</param>
        /// <param name="domainUrl">图片所在服务器域名地址</param>
        /// <returns></returns>
        public static string ReplaceImgDirInHtml(string htmlSource, string domainUrl)
        {
            string returnStr = htmlSource;
            ArrayList array = new ArrayList();

            array = GetHtmlArrayByRegex(htmlSource, "<img ", ">", false);

            returnStr = returnStr.Replace("<img", "");

            for (int i = 0; i < array.Count; i++)
            {
                //带HTML的图片地址
                string imgOldHtml = array[i].ToString(); //src = "http://www.11kp.com/images/20070423/1234fg32.jpg"

                //存放图片的文件夹路径 如: /images/news/20081107/03/
                string imgSavePath = IOFunction.CreateFolder(Common.CommonConst.IMG_SAVE_DIR);

                //原始图片地址 如: http://www.11kp.com/images/20070423/1234fg32.jpg
                string imgOldUrl = GetHtmlImgUrl(imgOldHtml);

                //得到有效的连接地址
                string imgValidUrl = GetValidUrl(imgOldUrl, domainUrl);

                //得到图片名称
                string imgName = GetImgNameAndSuffix(imgValidUrl, true);

                //下载图片
                DownLoadImgToLocal(imgValidUrl, imgSavePath + imgName);

                string imgNewHtml = OverWriteImgUrlInHtml(imgOldHtml, imgSavePath, "十一宽频");

                //替换图片原连接地址为新连接地址
                returnStr = returnStr.Replace(imgOldHtml, imgNewHtml);

            }

            returnStr = returnStr.Replace(">>", ">");

            return returnStr;
        }

        #endregion

        /// <summary>
        /// 获取网页内容
        /// </summary>
        /// <param name="url">网页路径</param>
        /// <returns></returns>
        public static string getWebHtmlCotent(string url)
        {
            try
            {
                byte[] b_text = new System.Net.WebClient().DownloadData(url);

                return System.Text.Encoding.Default.GetString(b_text);
            }
            catch
            {
                return "";
            }
        }

        /// <summary>
        /// 获取网页内容
        /// </summary>
        /// <param name="url">网页路径</param>
        /// <param name="encode">编码方式</param>
        /// <returns></returns>
        public static string getWebHtmlCotent(string url, System.Text.Encoding encode)
        {
            try
            {
                byte[] b_text = new System.Net.WebClient().DownloadData(url);

                return encode.GetString(b_text);
            }
            catch
            {
                return "";
            }
        }

        /// <summary>
        /// 清除所有HTML标记
        /// </summary>
        /// <param name="HtmlContents"></param>
        /// <returns></returns>
        public static string getClearHtmlCode(string HtmlContents)
        {

            HtmlContents = HtmlContents.Replace(" ", "").Replace("\t", "").Replace("\r\n", "");

            //先清除js

            HtmlContents = Regex.Replace(HtmlContents, "<script*.?/script>", "", RegexOptions.IgnoreCase);

            HtmlContents = Regex.Replace(HtmlContents, "<.*?>", "", RegexOptions.IgnoreCase);

            HtmlContents = Regex.Replace(HtmlContents, "&nbsp;", "", RegexOptions.IgnoreCase);

            return HtmlContents;

        }

        /// <summary>
        /// 清除 HTML 标记中的图片
        /// </summary>
        /// <param name="HtmlContents"></param>
        /// <returns></returns>
        public static string doClearHtmlTagSaveImg(string HtmlContents)
        {
            string Contents = HtmlContents;

            Match m;

            Match m1;

            Regex r = new Regex("<img.*?>", RegexOptions.IgnoreCase | RegexOptions.Compiled);

            for (m = r.Match(Contents); m.Success; m = m.NextMatch())
            {
                string tempstr = m.Groups[0].ToString();

                string oldImgTag = tempstr;

                string newImgTag = "";

                Regex r1 = new Regex("src=\".*?\"", RegexOptions.IgnoreCase | RegexOptions.Compiled);

                for (m1 = r1.Match(tempstr); m1.Success; )
                {
                    newImgTag = m1.Groups[0].ToString();

                    break;
                }
                if (newImgTag != "")
                {
                    newImgTag = "&ltt;img " + newImgTag + "&rtt;";

                    HtmlContents = HtmlContents.Replace(oldImgTag, newImgTag) + "<br>";
                }
            }
            HtmlContents = Regex.Replace(HtmlContents, "\r\n", "", RegexOptions.IgnoreCase);

            HtmlContents = Regex.Replace(HtmlContents, "<br>", "&ltt;br /&rtt;", RegexOptions.IgnoreCase);

            HtmlContents = Regex.Replace(HtmlContents, "<br >", "&ltt;br /&rtt;", RegexOptions.IgnoreCase);

            HtmlContents = Regex.Replace(HtmlContents, "<br />", "&ltt;br /&rtt;", RegexOptions.IgnoreCase);

            HtmlContents = Regex.Replace(HtmlContents, "</p>", "&ltt;br /&rtt;", RegexOptions.IgnoreCase);

            HtmlContents = Regex.Replace(HtmlContents, "&nbsp;\r\n", "", RegexOptions.IgnoreCase);

            HtmlContents = Regex.Replace(HtmlContents, "<script*.?/script>", "", RegexOptions.IgnoreCase);

            HtmlContents = Regex.Replace(HtmlContents, "<.*?>", "", RegexOptions.IgnoreCase);

            HtmlContents = HtmlContents.Replace("&ltt;", "<").Replace("&rtt;", ">");

            HtmlContents = HtmlContents.Replace("    ", " ").Replace("   ", " ").Replace("  ", " ").Replace("  ", " ").Replace("\t", "");

            HtmlContents = HtmlContents.Replace("<br /><br /><br /><br />", "<br />").Replace("<br /><br /><br />", "<br />").Replace("<br /><br />", "<br />");

            HtmlContents = HtmlContents.Replace("<br /> <br />", "<br />").Replace("<br /> <br />", "<br />").Replace("<br /> <br />", "<br />");

            return HtmlContents;
        }

    }
}

 

// posted @ 2008-11-13 10:03  白胡子海盗  阅读(527)  评论(0)  编辑  收藏  举报