C#获取博客园的标题

正则表达式参考资料:http://www.jb51.net/tools/zhengze.html

// Demo: list every link target and link text found in a fixed HTML sample.
// The named groups "url" and "title" capture the href value and the anchor text of each <li><a> item.
Regex linkPattern = new Regex(@"<li><a href=""(?<url>[^""]*)"" target=""_blank"">(?<title>[^<]*)</a></li>");
            string sampleMarkup = "<div>sadfasdfasd</div> <div class=\"video_1_left\"> <UL> <li><a href=\"/news/12718.html\" target=\"_blank\">标题sadfasdfasdfasdf</a></li> <li><a href=\"/news/12710.html\" target=\"_blank\">标题asdfasdfasdf</a></li> <li><a href=\"/news/12729.html\" target=\"_blank\">v2sdfasdf</a></li> <li><a href=\"/news/12728.html\" target=\"_blank\">标题sdfsadf</a></li> </UL> </div> <div class=\"video_1_right\"> <UL> <li><a href=\"/news/12705.html\" target=\"_blank\">标题xxxfasdfasdfx</a></li> <li><a href=\"/news/12737.html\" target=\"_blank\">标题xxxdfasdfasax</a></li> </UL> </div> <div>sadfasdfasd</div> ";
            MatchCollection links = linkPattern.Matches(sampleMarkup);
            for (int i = 0; i < links.Count; i++)
            {
                // Print "<url><TAB><title>" for each matched list item.
                GroupCollection captured = links[i].Groups;
                Console.WriteLine("{0}\t{1}", captured["url"].Value, captured["title"].Value);
            }

 

指定表达式的组名:(?<Word>\w+),这样就把\w+的组名指定为Word了

匹配exp,并捕获文本到名称为name的组里,(?<name>exp)

设置webapi,部署在IIS上

/// <summary>
/// Web API action: grabs the news list from the cnblogs home page and returns it through toJson.
/// </summary>
public HttpResponseMessage GetGrabNews()
        {
            // NewsManage.GrabNews downloads the page and returns the extracted entries.
            var grabbed = new NewsManage().GrabNews("http://www.cnblogs.com/");
            return toJson(grabbed);
        }

 

/// <summary>
/// Wraps a value in an HTTP response. String and Char values are sent via ToString();
/// every other value is serialized with JsonConvert.SerializeObject.
/// </summary>
/// <param name="obj">The value to send back to the caller.</param>
/// <returns>A response whose content carries the text and a permissive CORS header.</returns>
public HttpResponseMessage toJson(Object obj)
        {
            // Text values are already in their final form; everything else is serialized.
            bool isPlainText = obj is String || obj is Char;
            string payload = isPlainText ? obj.ToString() : JsonConvert.SerializeObject(obj);

            StringContent body = new StringContent(payload, Encoding.GetEncoding("UTF-8"), "application/json");
            // The media type is overwritten right after construction, so the response goes out as text/plain.
            body.Headers.ContentType.MediaType = "text/plain";
            // Allow cross-origin callers to read the response.
            body.Headers.Add("Access-Control-Allow-Origin", "*");

            HttpResponseMessage result = new HttpResponseMessage();
            result.Content = body;
            return result;
        }
/// <summary>
/// Downloads the page at <paramref name="url"/> and returns the entries that AnalysisHtml extracts from it.
/// </summary>
/// <param name="url">Absolute HTTP(S) address of the page to download.</param>
/// <returns>The list produced by AnalysisHtml for the downloaded markup.</returns>
public List<T_News> GrabNews(string url)
        {
            HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
            req.Method = WebRequestMethods.Http.Get;
            req.ContentType = "text/html;charset=utf-8";

            string html;
            // Dispose the response, its stream and the reader deterministically, even if reading throws.
            using (HttpWebResponse wr = (HttpWebResponse)req.GetResponse())
            using (Stream respStream = wr.GetResponseStream())
            using (StreamReader reader = new StreamReader(respStream, Encoding.GetEncoding("UTF-8")))
            {
                // Read the body in one go. Appending ReadLine() results drops every line break
                // and glues the end of one line to the start of the next.
                html = reader.ReadToEnd();
            }
            return AnalysisHtml(html);
        }
/// <summary>
/// Extracts post titles and links from markup containing <c>&lt;a class="titlelnk" …&gt;</c> anchors.
/// </summary>
/// <param name="htmlContent">HTML to scan; null or empty input yields an empty list.</param>
/// <returns>
/// One T_News per matched anchor: Title holds the link text and Content holds the part of the
/// href that follows "http://www.cnblogs.com/".
/// </returns>
public List<T_News> AnalysisHtml(string htmlContent)
        {
            List<T_News> newsList = new List<T_News>();
            if (string.IsNullOrEmpty(htmlContent))
            {
                return newsList;
            }

            // "href" and "title" are named groups:
            //   (?<href>[^""]*)  captures everything up to the closing quote of the href attribute,
            //   (?<title>[^<]*)  captures the link text up to the next '<' (the start of </a>).
            // \s* matches any run of whitespace, including none.
            string strPattern = "<a\\s*class=\"titlelnk\"\\s*href=\"http://www.cnblogs.com/(?<href>[^\"\"]*)\"\\s*target=\"_blank\">(?<title>[^<]*)</a>";
            Regex regex = new Regex(strPattern, RegexOptions.IgnoreCase);

            // Matches returns an empty collection when nothing matches, so no IsMatch pre-check is needed.
            foreach (Match match in regex.Matches(htmlContent))
            {
                // Read the captured values by group name.
                newsList.Add(new T_News
                {
                    Title = match.Groups["title"].Value,
                    Content = match.Groups["href"].Value
                });
            }
            return newsList;
        }

 

客户端调用webapi接口

// Fetch the grabbed news list from the Web API and rebuild the ".list-ul" list from it.
// The trailing "json" argument asks jQuery to parse the response body as JSON.
$.get("http://10.100.22.54:8095/api/NewsManager/GetGrabNews", function (data) {
                var $ul = $(".list-ul");
                $ul.empty();
                $.each(data, function (key, value) {
                    var title = value.Title;
                    // gblen/gbsub are String helpers defined outside this snippet; presumably they
                    // measure and truncate by display width — TODO confirm against their definition.
                    var shownTitle = title.gblen() > 43 ? title.gbsub(40) : title;
                    // Attribute values are quoted so a value containing a space or '>' cannot break the tag,
                    // and the closing tags mirror the opening order (<li><div> ... </div></li>).
                    $ul.append('<li class="list-li"><div class="con" data-href="' + value.Content + '">'
                        + shownTitle
                        + '<div class="btn" data-id="' + value.ID + '">删除</div>'
                        + '</div></li>');
                })
            },"json")

 

posted @ 2016-07-11 19:31  wjl910  阅读(109)  评论(0)    收藏  举报