C#获取博客园的标题

正则表达式参考资料：http://www.jb51.net/tools/zhengze.html

// Demo: pull every <li><a href="..." target="_blank">...</a></li> entry out of a sample HTML string.
// "url" captures the run of non-quote characters inside href; "title" captures the run of
// non-'<' characters that forms the link text.
Regex linkPattern = new Regex(@"<li><a href=""(?<url>[^""]*)"" target=""_blank"">(?<title>[^<]*)</a></li>");
            string sampleHtml = "<div>sadfasdfasd</div> <div class=\"video_1_left\"> <UL> <li><a href=\"/news/12718.html\" target=\"_blank\">标题sadfasdfasdfasdf</a></li> <li><a href=\"/news/12710.html\" target=\"_blank\">标题asdfasdfasdf</a></li> <li><a href=\"/news/12729.html\" target=\"_blank\">v2sdfasdf</a></li> <li><a href=\"/news/12728.html\" target=\"_blank\">标题sdfsadf</a></li> </UL> </div> <div class=\"video_1_right\"> <UL> <li><a href=\"/news/12705.html\" target=\"_blank\">标题xxxfasdfasdfx</a></li> <li><a href=\"/news/12737.html\" target=\"_blank\">标题xxxdfasdfasax</a></li> </UL> </div> <div>sadfasdfasd</div> ";
            MatchCollection links = linkPattern.Matches(sampleHtml);
            foreach (Match link in links)
            {
                // One output line per link: "<url><TAB><title>".
                string linkUrl = link.Groups["url"].Value;
                string linkTitle = link.Groups["title"].Value;
                Console.WriteLine(linkUrl + "\t" + linkTitle);
            }

指定表达式的组名：(?<Word>\w+)，这样就把\w+的组名指定为Word了

(?<name>exp)：匹配 exp，并把捕获到的文本保存到名称为 name 的组里。

设置webapi，部署在IIS上

/// <summary>
/// Web API action: asks <c>NewsManage.GrabNews</c> for the entries found on the
/// cnblogs home page and hands the result to <c>toJson</c> to build the response.
/// </summary>
/// <returns>The <see cref="HttpResponseMessage"/> produced by <c>toJson</c>.</returns>
public HttpResponseMessage GetGrabNews()
        {
            // The source page is fixed; the action takes no parameters.
            var grabbed = new NewsManage().GrabNews("http://www.cnblogs.com/");
            return toJson(grabbed);
        }

/// <summary>
/// Wraps <paramref name="obj"/> in an HTTP response. Strings and chars are sent via
/// <c>ToString()</c>; any other value is serialized with <c>JsonConvert.SerializeObject</c>.
/// </summary>
/// <param name="obj">The value to send back to the client.</param>
/// <returns>A response whose body is the resulting text, encoded as UTF-8.</returns>
public HttpResponseMessage toJson(Object obj)
        {
            bool alreadyText = obj is String || obj is Char;
            string payload = alreadyText ? obj.ToString() : JsonConvert.SerializeObject(obj);

            StringContent body = new StringContent(payload, Encoding.GetEncoding("UTF-8"), "application/json");
            // The media type given to the constructor is overwritten here, so the response
            // is labelled text/plain.
            // NOTE(review): presumably deliberate (the client forces the "json" dataType) - confirm.
            body.Headers.ContentType.MediaType = "text/plain";
            // Allow any origin to read the response (works around cross-domain restrictions).
            body.Headers.Add("Access-Control-Allow-Origin", "*");

            HttpResponseMessage response = new HttpResponseMessage();
            response.Content = body;
            return response;
        }

/// <summary>
/// Downloads the page at <paramref name="url"/>, decodes it as UTF-8 and passes the
/// markup to <c>AnalysisHtml</c> to extract the news entries.
/// </summary>
/// <param name="url">Absolute HTTP/HTTPS address of the page to download.</param>
/// <returns>The list returned by <c>AnalysisHtml</c> for the downloaded markup.</returns>
/// <exception cref="WebException">The request failed (propagated from <c>GetResponse</c>).</exception>
public List<T_News> GrabNews(string url)
        {
            HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
            // HTTP method tokens are case-sensitive; use the canonical upper-case verb.
            req.Method = "GET";
            // Fixed the "chartset" typo so the charset parameter is well-formed.
            req.ContentType = "text/html;charset=utf-8";

            StringBuilder sb = new StringBuilder();
            // Dispose the response, its stream and the reader deterministically.
            using (HttpWebResponse wr = (HttpWebResponse)req.GetResponse())
            using (Stream respStream = wr.GetResponseStream())
            using (StreamReader reader = new StreamReader(respStream, Encoding.UTF8))
            {
                // Lines are concatenated without their terminators, exactly as before,
                // so the markup handed to AnalysisHtml contains no line breaks.
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    sb.Append(line);
                }
            }
            return AnalysisHtml(sb.ToString());
        }

/// <summary>
/// Extracts every <c>&lt;a class="titlelnk" href="http://www.cnblogs.com/..." target="_blank"&gt;</c>
/// link from <paramref name="htmlContent"/>.
/// </summary>
/// <param name="htmlContent">The HTML markup to scan.</param>
/// <returns>
/// One <c>T_News</c> per matching link: <c>Title</c> holds the link text and <c>Content</c>
/// holds the part of the href that follows "http://www.cnblogs.com/". Empty when nothing matches.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="htmlContent"/> is null (thrown by <c>Regex.Matches</c>).</exception>
public List<T_News> AnalysisHtml(string htmlContent)
        {
            // Named groups:
            //   href  - run of non-quote characters after the site prefix inside href="..."
            //   title - run of non-'<' characters between the opening tag and </a>
            // \s* tolerates any amount of whitespace between attributes. The dots in the
            // host name are escaped so they only match a literal '.'.
            string strPattern = "<a\\s*class=\"titlelnk\"\\s*href=\"http://www\\.cnblogs\\.com/(?<href>[^\"]*)\"\\s*target=\"_blank\">(?<title>[^<]*)</a>";
            Regex regex = new Regex(strPattern, RegexOptions.IgnoreCase);

            List<T_News> newsList = new List<T_News>();
            // Matches() yields an empty collection when nothing matches, so no IsMatch pre-check is needed.
            foreach (Match match in regex.Matches(htmlContent))
            {
                newsList.Add(new T_News
                {
                    Title = match.Groups["title"].Value,
                    Content = match.Groups["href"].Value
                });
            }
            return newsList;
        }

客户端调用webapi接口

// Fetch the scraped news list from the Web API and rebuild the <ul class="list-ul"> from it.
$.get("http://10.100.22.54:8095/api/NewsManager/GetGrabNews", function (data) {
                var $ul = $(".list-ul");
                $ul.empty();
                $.each(data, function (key, value) {
                    var title = value.Title;
                    // gblen/gbsub are String helpers not defined in this snippet
                    // (presumably byte-length aware length/substring - confirm).
                    var label = title.gblen() > 43 ? title.gbsub(40) : title;

                    // Build the nodes with attr()/text() instead of concatenating HTML:
                    // attribute values are always quoted correctly and scraped text cannot
                    // inject markup. String() keeps the original concatenation semantics
                    // for missing values (e.g. "undefined").
                    var $btn = $('<div class="btn"></div>')
                        .attr("data-id", String(value.ID))
                        .text("删除");
                    var $con = $('<div class="con"></div>')
                        .attr("data-href", String(value.Content))
                        .text(label)
                        .append($btn);

                    // Proper nesting: <li><div class="con">...<div class="btn">...</div></div></li>
                    $('<li class="list-li"></li>').append($con).appendTo($ul);
                });
            }, "json");

posted @ 2016-07-11 19:31 wjl910 阅读(109) 评论(0) 收藏举报

刷新页面返回顶部

wjl910

C#获取博客园的标题

公告