CollectionHelper-网页采集辅助类

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.IO;
using System.Net;

namespace Framework
{
    /// <summary>
    /// Helper methods for web-page scraping: downloading page source and extracting
    /// elements, links and image addresses from HTML text with regular expressions.
    /// </summary>
    /// <remarks>
    /// The extraction methods are regex heuristics, not an HTML parser; they work on
    /// reasonably well-formed markup and make no attempt to handle comments, scripts
    /// or malformed tags.
    /// </remarks>
    public static class CollectionHelper
    {
        // Fixed patterns are built once and shared; Regex instances are safe to use
        // concurrently for matching, and compiling them per call is wasteful.

        // <a ... href="url" ...>text</a>; \b after "a" keeps <abbr>/<area> from matching.
        private static readonly Regex AnchorRegex = new Regex(
            @"<a\b[^>]*?\shref\s*=\s*[""'](?<url>[^""']*?)[""'][^>]*>(?<text>[\s\S]*?)</a>",
            RegexOptions.Compiled | RegexOptions.IgnoreCase);

        // Optional http/https scheme, host and path, and one of the known page extensions,
        // followed by an optional simple query string.
        private static readonly Regex UrlRegex = new Regex(
            @"(https?://)?[\w-\.]*([\/]?[\w-])+[\w-]*\.(htm|html|shtm|shtml|aspx|asp|php|jsp)+[\w-\=\?]*",
            RegexOptions.Compiled | RegexOptions.IgnoreCase);

        // src attribute of an <img> tag: double-quoted, single-quoted or unquoted.
        // [^>]*? keeps the search inside the <img ...> tag itself.
        private static readonly Regex ImgSrcRegex = new Regex(
            @"<img\b[^>]*?\ssrc\s*=\s*(?:""(?<src>[^""]*)""|'(?<src>[^']*)'|(?<src>[^>\s]*))",
            RegexOptions.Compiled | RegexOptions.IgnoreCase);

        // Any opening or closing HTML tag.
        private static readonly Regex HtmlTagRegex = new Regex(
            "</?[a-z][^<>]*>",
            RegexOptions.Compiled | RegexOptions.IgnoreCase);

        /// <summary>
        /// Returns every element with the given tag name found in the text, selected by
        /// tag name only (attributes are not considered).
        /// </summary>
        /// <param name="orgStr">The HTML text to search.</param>
        /// <param name="domElem">The tag name, e.g. <c>div</c>. It is matched literally and case-insensitively.</param>
        /// <returns>
        /// The full markup (opening tag through closing tag) of each match, in document order.
        /// Matching is lazy, so an element that contains a nested element of the same name
        /// ends at the first closing tag. Returns an empty list when either argument is
        /// null or empty.
        /// </returns>
        public static List<string> GetDomElem(string orgStr, string domElem)
        {
            List<string> matchList = new List<string>();
            if (string.IsNullOrEmpty(orgStr) || string.IsNullOrEmpty(domElem))
            {
                return matchList;
            }

            // Escape the tag name so regex metacharacters in it are matched literally.
            // The look-ahead requires the tag name to end right there, so "a" does not match "<abbr>".
            // "*?" (rather than "+?") lets empty elements such as <p></p> match on their own.
            string pattern = string.Format(
                @"<{0}(?=[\s/>])[^>]*>[\s\S]*?</{0}\s*>",
                Regex.Escape(domElem));

            // The pattern changes per call, so RegexOptions.Compiled would only add cost.
            foreach (Match match in Regex.Matches(orgStr, pattern, RegexOptions.IgnoreCase))
            {
                matchList.Add(match.Value);
            }
            return matchList;
        }

        /// <summary>
        /// Returns every element (of any tag) that carries the given attribute with the
        /// given value, e.g. <c>class="aa"</c>.
        /// </summary>
        /// <param name="orgStr">The HTML text to search.</param>
        /// <param name="tagName">The attribute name, e.g. <c>class</c>. Matched literally and case-insensitively.</param>
        /// <param name="tagValue">The attribute value, e.g. <c>aa</c>. Matched literally and case-insensitively; null is treated as empty.</param>
        /// <returns>
        /// The full markup of each matching element, in document order. The pattern uses a
        /// balancing group to step over nested elements of the same tag name.
        /// Returns an empty list when <paramref name="orgStr"/> or <paramref name="tagName"/>
        /// is null or empty.
        /// </returns>
        public static List<string> GetDomElemByAttr(string orgStr, string tagName, string tagValue)
        {
            List<string> matchList = new List<string>();
            if (string.IsNullOrEmpty(orgStr) || string.IsNullOrEmpty(tagName))
            {
                return matchList;
            }

            // Both arguments are escaped so that values such as "a.b" match only themselves.
            // No lower-casing is needed: RegexOptions.IgnoreCase already covers it.
            string pattern = string.Format(
                @"<(?<HtmlTag>[\w]+)[^>]*\s{0}=(?<Quote>[""']?){1}(?(Quote)\k<Quote>)[""']?[^>]*>((?<Nested><\k<HtmlTag>[^>]*>)|</\k<HtmlTag>>(?<-Nested>)|[\s\S]*?)*</\k<HtmlTag>>",
                Regex.Escape(tagName),
                Regex.Escape(tagValue ?? string.Empty));

            foreach (Match match in Regex.Matches(orgStr, pattern, RegexOptions.IgnoreCase))
            {
                matchList.Add(match.Value);
            }
            return matchList;
        }

        /// <summary>
        /// Extracts the hyperlinks in the text as [link text, url] pairs.
        /// </summary>
        /// <param name="orgStr">The HTML text to search.</param>
        /// <returns>
        /// A dictionary keyed by the link text (with inner HTML tags removed) whose value is
        /// the content of the quoted <c>href</c> attribute. When several links share the
        /// same text, only the first one is kept. Returns an empty dictionary when
        /// <paramref name="orgStr"/> is null or empty.
        /// </returns>
        public static Dictionary<string, string> GetDomElem_A(string orgStr)
        {
            Dictionary<string, string> links = new Dictionary<string, string>();
            if (string.IsNullOrEmpty(orgStr))
            {
                return links;
            }

            foreach (Match match in AnchorRegex.Matches(orgStr))
            {
                // Use the captured groups directly: the href is taken verbatim, so links
                // without a page extension (e.g. "/news/") are no longer a failure case.
                string key = match.Groups["text"].Value.RemoveHtml();
                if (!links.ContainsKey(key))
                {
                    links.Add(key, match.Groups["url"].Value);
                }
            }
            return links;
        }


        /// <summary>
        /// Downloads the source of a web page.
        /// </summary>
        /// <param name="url">Address of the page; passed to the <see cref="Uri"/> constructor, which throws if it is null or malformed.</param>
        /// <param name="coding">Name of the text encoding used to decode the response.</param>
        /// <returns>The page source, or an error message as described on the <see cref="Uri"/> overload.</returns>
        public static string GetPageSourceByUrl(string url, string coding = "gb2312")
        {
            return GetPageSourceByUrl(new Uri(url), coding);
        }

        /// <summary>
        /// Downloads the source of a web page.
        /// </summary>
        /// <param name="url">Address of the page.</param>
        /// <param name="coding">Name of the text encoding used to decode the response.</param>
        /// <returns>
        /// The decoded response body. If a <see cref="NotSupportedException"/>,
        /// <see cref="InvalidOperationException"/> (which includes <see cref="WebException"/>)
        /// or <see cref="IOException"/> occurs, the exception message is returned instead;
        /// other exceptions propagate to the caller.
        /// </returns>
        public static string GetPageSourceByUrl(Uri url, string coding = "gb2312")
        {
            try
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);

                // using blocks release the connection even when reading or decoding fails.
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (Stream stream = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(stream, Encoding.GetEncoding(coding)))
                {
                    return reader.ReadToEnd();
                }
            }
            catch (NotSupportedException exception)
            {
                return exception.Message;
            }
            catch (InvalidOperationException exception)
            {
                return exception.Message;
            }
            catch (IOException exception)
            {
                return exception.Message;
            }
        }

        /// <summary>
        /// Extracts page URLs from the given text using a URL-matching regular expression.
        /// </summary>
        /// <param name="code">The text (typically list-page HTML) to search.</param>
        /// <returns>
        /// The matched URLs in order of appearance. Only addresses ending in one of the
        /// extensions htm, html, shtm, shtml, aspx, asp, php or jsp (optionally followed by
        /// a simple query string) are recognised; a leading <c>http://</c> or
        /// <c>https://</c> is included when present. Returns an empty list when
        /// <paramref name="code"/> is null or empty.
        /// </returns>
        public static List<string> GetUrlArray(string code)
        {
            List<string> urlList = new List<string>();
            if (string.IsNullOrEmpty(code))
            {
                return urlList;
            }

            foreach (Match match in UrlRegex.Matches(code))
            {
                urlList.Add(match.Value);
            }
            return urlList;
        }

        /// <summary>
        /// Extracts the image addresses (<c>src</c> attributes of <c>img</c> tags) from the content.
        /// </summary>
        /// <param name="content">The HTML text to search.</param>
        /// <returns>
        /// A dictionary with one entry per distinct address, where key and value are both the
        /// address exactly as written in the markup (its letter case is preserved).
        /// Returns an empty dictionary when <paramref name="content"/> is null or empty.
        /// </returns>
        public static Dictionary<string, string> GetImgUrlArray(string content)
        {
            Dictionary<string, string> imgList = new Dictionary<string, string>();
            if (string.IsNullOrEmpty(content))
            {
                return imgList;
            }

            // The regex is case-insensitive, so the content is searched as-is rather than
            // lower-cased (lower-casing would corrupt case-sensitive paths).
            foreach (Match match in ImgSrcRegex.Matches(content))
            {
                string src = match.Groups["src"].Value;
                if (!imgList.ContainsKey(src))
                {
                    imgList.Add(src, src);
                }
            }
            return imgList;
        }

        /// <summary>
        /// Converts a relative address to an absolute one.
        /// </summary>
        /// <param name="relativeAddress">The address to convert. If it already contains <c>://</c> it is returned unchanged.</param>
        /// <param name="absoluteAddress">The address of the current page, used as the base. Must contain <c>://</c>.</param>
        /// <returns>
        /// The resolved absolute address, or <see cref="string.Empty"/> when
        /// <paramref name="relativeAddress"/> is null or empty, when
        /// <paramref name="absoluteAddress"/> is null, empty or not a valid absolute URI,
        /// or when the two cannot be combined.
        /// </returns>
        public static string ConvertToAbsluteUrl(string relativeAddress, string absoluteAddress)
        {
            if (string.IsNullOrEmpty(relativeAddress))
            {
                return string.Empty;
            }
            if (relativeAddress.Contains("://"))
            {
                return relativeAddress;
            }
            if (string.IsNullOrEmpty(absoluteAddress) || !absoluteAddress.Contains("://"))
            {
                return string.Empty;
            }

            // TryCreate keeps malformed input on the same "return empty" path as the
            // other invalid-argument cases instead of throwing UriFormatException.
            Uri baseUri;
            if (!Uri.TryCreate(absoluteAddress, UriKind.Absolute, out baseUri))
            {
                return string.Empty;
            }
            Uri combined;
            if (!Uri.TryCreate(baseUri, relativeAddress, out combined))
            {
                return string.Empty;
            }
            return combined.ToString();
        }

        /// <summary>
        /// Removes all HTML tags from the string.
        /// </summary>
        /// <param name="input">The string whose tags should be removed.</param>
        /// <returns>
        /// The input with every opening and closing tag replaced by nothing, or
        /// <see cref="string.Empty"/> when <paramref name="input"/> is null or empty.
        /// </returns>
        public static string RemoveHtml(this string input)
        {
            if (string.IsNullOrEmpty(input))
            {
                return string.Empty;
            }
            return HtmlTagRegex.Replace(input, string.Empty);
        }
    }
}
posted @ 2011-12-15 13:12 清山博客 阅读(497) 评论(0) 收藏 举报
刷新页面 返回顶部
清山博客

慎言其余，则寡尤；多见阙殆，慎行其余，则寡悔。言寡尤，行寡悔。

CollectionHelper-网页采集辅助类

公告