正则表达式（附：提取一个网页上所有 a 标签的 href 属性和 innerHTML）

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.IO;

namespace 正则表达式1
{
    /// <summary>
    /// Console scratchpad holding a series of regular-expression experiments.
    /// Every experiment is commented out; uncomment one section at a time to run it.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Entry point. With all experiments disabled, it only waits for a key press.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            /*================================================ Experiment 1 =========================================*/
            // To anchor a match at the start and end of the input, don't forget "^" and "$".
            // In C#, prefix the pattern with "@" (verbatim string) so backslashes need no escaping.
            //Console.WriteLine(Regex.IsMatch("abcccccde","^abc*de$"));
            // Exercise 1: match a valid postal code (6 digits).
            //string str=Console.ReadLine();
            // A 6-character code made of the digits 0-9, anchored at both ends.
            //Console.WriteLine(Regex.IsMatch(str, "^[0-9]{6}$"));
            //Console.WriteLine(Regex.IsMatch(str, @"^\d{6}$"));
            // Exercise 2: check whether a string is an ID-card number (15 or 18 digits).
            //while (true)
            //{
            //    string str = Console.ReadLine();
            //    Console.WriteLine(Regex.IsMatch(str, @"^(\d{15}|\d{18})$"));// correct
            //    Console.WriteLine(Regex.IsMatch(str, @"^\d{15}|\d{18}$"));// wrong: without the group the alternation is "^\d{15}" OR "\d{18}$", so longer inputs also match
            //    Console.WriteLine(Regex.IsMatch(str, @"^(\d{15}|\d{18}|\d{17}[xX])$"));// correct, and also accepts 17 digits followed by x/X
            //}
            // Exercise 3: match domestic phone numbers; 010-955555, 010955555 and 9555555 should all be valid.
            //while (true)
            //{
            //    string str = Console.ReadLine();
            //    Console.WriteLine(Regex.IsMatch(str, @"^(\d{3,4})?\-?\d{5}$"));// marked correct by the author; NOTE(review): \d{5} requires exactly five trailing digits, so "010-955555" and "9555555" above do not match
            //}
            // Exercise 4: check whether a string is a valid e-mail address.
            //while (true)
            //{
            //    string str = Console.ReadLine();
            //    Console.WriteLine(Regex.IsMatch(str, @"^\w+@\w+\.\w+$"));// correct per the author
            //    Console.WriteLine(Regex.IsMatch(str, @"^\S+@\S+\.\S+$"));// correct per the author
            //    Console.WriteLine(Regex.IsMatch(str, @"^.+@.+\..+$"));// wrong: "." also matches whitespace; verify with "zha ff@12 3.com"
            //}
            // Exercise 5: match a well-formed IP address such as 192.168.0.1.
            //while (true)
            //{
            //    string str = Console.ReadLine();
            //    Console.WriteLine(Regex.IsMatch(str, @"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$"));// correct per the author; checks the shape only, not that each part is <= 255
            //}
            // Exercise 6: check for a valid date format such as 2008-08-24 (four digits-two digits-two digits).
            //while (true)
            //{
            //    string str = Console.ReadLine();
            //    Console.WriteLine(Regex.IsMatch(str, @"^\d{4}\-\d{2}\-\d{2}$"));// correct per the author
            //}
            // Exercise 7: match a valid URL such as http://www.baidu.com/a.htm or ftp://127.0.0.1/1.txt (character sequence, "://", character sequence).
            //while (true)
            //{
            //    string str = Console.ReadLine();
            //    Console.WriteLine(Regex.IsMatch(str, @"^[a-z]{1,5}://.+$"));// correct per the author; note in particular that \w cannot match ","
            //}
            /*================================================= Experiment 2 ===============================================*/
            // Exercise 1: extracting substrings.
            // The parentheses in (\w+) are required: (expression) marks a group to capture.
            //Match match = Regex.Match("老王的英文名字是Wrong", @"(\w+)的英文名字是(\w+)");
            //// Did the match succeed?
            //if (match.Success)
            //{
            //    // Capture-group indices start at 1 (Groups[0] is the whole match)
            //    string cnName = match.Groups[1].Value;
            //    string enName = match.Groups[2].Value;
            //    Console.WriteLine(cnName);
            //    Console.WriteLine(enName);
            //}
            // Exercise 2: extract from "June 26,1951" (the code below pulls out the month name).
            // The pattern must be written exactly; a small mistake can make it match nothing.
            //Match match = Regex.Match("June 26,1951", @"^([a-zA-Z]+)\s+\d{1,2},\s*\d{1,4}");// "\s+" must not be omitted, otherwise the correct value is not matched
            //if (match.Success)
            //{
            //    Console.WriteLine(match.Groups.Count);
            //    string Month = match.Groups[1].Value;
            //    Console.WriteLine(Month);
            //}
            // Exercise 3: extract the user name and the server name from an e-mail address, e.g. zhang@163.com -> zhang and 163.com.
            //string str=Console.ReadLine();
            //Match match = Regex.Match(str, @"(\w+)@(\w+\.\w+)");// reference answer: (.+)@(.+\..+)
            //if (match.Success)
            //{
            //    string UserName = match.Groups[1].Value;
            //    string ServerName = match.Groups[2].Value;
            //    Console.WriteLine(UserName);
            //    Console.WriteLine(ServerName);
            //}
            //else
            //    Console.WriteLine("匹配错误！");
            // Exercise 4: "192.168.0.1[port=21,type=ftp]" means the server at 192.168.0.1 offers the ftp service on port 21;
            // when type is omitted it defaults to http. Extract the IP, the port and the service.
            //string str = Console.ReadLine();
            //Match match = Regex.Match(str, @"([0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3})\[port=(\d+)(,type=)?([a-zA-Z]+)?]?");
            //if (match.Success)
            //{
            //    Console.WriteLine(match.Groups.Count);
            //    string IP = match.Groups[1].Value;
            //    string port = match.Groups[2].Value;
            //    string type = "http";
            //    if (match.Groups[4].Value.Length!=0)
            //    {
            //        type =match.Groups[4].Value ;
            //    }
            //    Console.WriteLine("IP:"+IP);
            //    Console.WriteLine("port:"+port);
            //    Console.WriteLine("type:"+type);
            //}
            //else
            //    Console.WriteLine("匹配错误！");
            /*================================================= Experiment 3 ===============================================*/
            // Greedy vs. non-greedy (lazy) matching.
            // Usually there is no need to think about greedy vs. lazy up front; deal with it when a test exposes a
            // greedy-match problem (the author's rationale: greedy matching is more efficient).
            //Match match = Regex.Match("大家好，我是S.H.E。我病了，呜呜。天下难容我啊。",@"我是(.+)。");
            //if (match.Success)
            //{
            //    string value = match.Groups[1].Value;
            //    Console.WriteLine(value);// prints: "S.H.E。我病了，呜呜。天下难容我啊"
            //    // because + and * are greedy: they consume as much as possible, stopping only where taking more would make the rest of the pattern fail
            //}
            //// Fix (lazy mode): put ? after + or * to make it non-greedy, so the following part of the pattern matches as early as possible
            //Match match1 = Regex.Match("大家好，我是S.H.E。我病了，呜呜。天下难容我啊。", @"我是(.+?)。");
            //if (match1.Success)
            //{
            //    string value = match1.Groups[1].Value;
            //    Console.WriteLine(value);// prints: "S.H.E" (the trailing "。" is outside the capture group)
            //    // +? is lazy: it stops at the first position where the rest of the pattern can match
            //}
            /*================================================= Experiment 4 ===============================================*/
            // Multiple matches (Regex.Matches)
            // One way to think about it: group first, then match.
            // Match against "你是jerry，我是tom，他是韩某某，哈哈"
            //string str = "你是jerry，我是tom，他是韩某某，哈哈";
            //MatchCollection mc= Regex.Matches(str, @"是(\w+)，");
            //// MatchCollection indices start at 0
            //for (int i = 0; i < mc.Count; i++)
            //{
            //    // Take each Match out of the MatchCollection, then print it
            //    Match match = mc[i];
            //    // First print the whole matched text for testing, e.g. "是tom，"; match.Value is the full match
            //    Console.WriteLine(match.Value);
            //    // Print the captured value, e.g. "tom"
            //    Console.WriteLine(match.Groups[1].Value);
            //}
            /*================================================= Experiment 5 ===============================================*/
            // A worked example using multiple matches
            // Save a web page's content, then extract every hyperlink's href and link text
            //string str = File.ReadAllText(@"c:\abc.htm");
            //MatchCollection mc = Regex.Matches(str, "<a.+?href=\"(.+?)\".*>(.+)</a>");
            //// MatchCollection indices start at 0
            //string str1="";
            //for (int i = 0; i < mc.Count; i++)
            //{
            //    // Take each Match out of the MatchCollection
            //    Match match = mc[i];

            //    str1 += "HREF:" + match.Groups[1].Value + "     " + match.Groups[2].Value + "\n";
            //    // Write the extracted results to a file; NOTE(review): this call is inside the loop, so the file is rewritten on every iteration
            //    File.WriteAllText("c:\\111.txt",str1);

            //}

            /*================================================= Experiment 6 ===============================================*/
            // Comparing String.Replace with Regex.Replace
            // String.Replace
            //string str = "我是张三，你是李四，他是王五";
            //string str1=str.Replace("是", "系");// note: Replace does not modify the original string; the result is returned and stored in str1
            //Console.WriteLine(str1);
            // String.Replace only handles simple literal substitutions, not large-scale pattern-based replacement
            // Regex.Replace
            // Collapse runs of whitespace into a single space
            //string str = "I'm from    in    china,and                   you?";
            //string str1 = Regex.Replace(str, "\\s+", " ");
            //Console.WriteLine(str1);//"I'm from in china,and you?"
            // If the pattern has groups, the replacement string can reference them with $number
            // $number starts at 1
            //string str = "name=zhangshan age=15 id=10";
            //Console.WriteLine(Regex.Replace(str, "(\\w+)=(\\w+)", "$2是$1"));//zhangshan是name 15是age 10是id
            // Convert dates in a text from yyyy/mm/dd to yyyy-mm-dd
            //string str = "我的出生日期是2010/5/21,你的出生日期是2012/02/3,他的呢?";
            //Console.WriteLine(Regex.Replace(str, @"(\d{4})/(\d{1,2})/(\d{1,2})", "$1-$2-$3"));//我的出生日期是2010-5-21,你的出生日期是2012-02-3,他的呢?
            // Turn hyperlinks found in a text into <a> tags
            //string str = "我找到一个下载音乐的网址http://www.hao123.com，还有一个百度网址http://www.baidu.com，挺好玩的";
            //Console.WriteLine(Regex.Replace(str, @"(http(s)?://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?)", "<a href=\"$1\">$1</a>"));// URL pattern http(s)?://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)? attributed by the author to W3C; per the author it sometimes misbehaves when Chinese text directly adjoins the URL

            // Forum UBB code translation
            // [URL=http://www.baidu.com]百度网[/URL] is replaced with an <a> tag
            // [IMG]http://www.sina.com/1.jpg [/IMG] is replaced with an <img> tag
            //string str = "百度网[URL=http://www.baidu.com]百度网[/URL]，图片是[IMG]http://www.sina.com/1.jpg[/IMG]，支持国产";
            //str=Regex.Replace(str, @"\[URL=(.+)\](.+)\[/URL\]", "<a href=\"$1\">$2</a>");
            //str = Regex.Replace(str, @"\[IMG\](.+)\[/IMG\]", "<img src=\"$1\" />");
            //Console.WriteLine(str);

            // Sensitive-word handling
            //string fileappach = Environment.CurrentDirectory;// e.g. D:\c\.net高级编程\正则表达式1\bin\Debug
            //fileappach = Regex.Replace(fileappach, @"bin\\Debug", "");
            //string[] lines = File.ReadAllLines(fileappach + "TextFile.txt");
            //string str = "*****主席热情参观了湖南涉外，说涉外真他妈牛逼。";
            //string str1 = Regex.Replace(str, @"[^\u4E00-\u9FA5]", "");// [^\u4E00-\u9FA5] matches non-Chinese characters, which are stripped here
            // Implementation using Contains
            //foreach (string s in lines)
            //{
            //    string[] line = s.Split('|');
            //    string sym = line[0];
            //    string mingan = line[1];
            //    if (sym == "f" && str1.Contains(mingan))// author's note: Contains is too slow; prefer a regular expression
            //    {
            //        Console.WriteLine("你发送的词语含有敏感词："+mingan);
            //        Console.ReadKey();
            //        return;
            //    }
            //    else if (sym == "m")
            //    {
            //        str = str.Replace(mingan, "**");
            //    }
            //}
            // Implementation using regular expressions
            //string sym = "";
            //string word = "";
            //foreach (string s in lines)
            //{
            //    string[] line = s.Split('|');
            //    string l1 = line[0];
            //    string l2 = line[1];
            //    if (l1 == "f")
            //    {
            //        word += l2 + '|';
            //    }
            //    else
            //        sym += l2 + '|';

            //}
            //word = Regex.Replace(word, @"\|$", "");
            //sym = Regex.Replace(sym, @"\|$", "");
            //if (Regex.IsMatch(str1, word))
            //{
            //    Console.WriteLine("含有敏感词，请重新输入");
            //    Console.ReadKey();
            //    return;
            //}
            //str = Regex.Replace(str,sym,"**");

            //Console.WriteLine(str+"            发送成功！");

            // Worked example
            // Extract job information (job title, company name, location) from the 51job site
            //string str = File.ReadAllText(@"c:\search.htm",Encoding.Default);
            //// The regex below depends on each page's markup. To build a feature like this, first inspect the page's HTML with a tool to find the pattern you need, then fit a regex to it (list entries like these usually follow a regular structure)
            //MatchCollection mc = Regex.Matches(str, "<td class=\"td1\"><a\\shref=\"(.+?)\".+>(.+?)</a>.+\\s+<td class=\"td2\"><a.+?>(.+)</a>.+\\s+<td class=\"td3\"><span\\s.+?>(.+?)</span>", RegexOptions.Multiline);
            //// MatchCollection indices start at 0
            //string str1 = "";
            //for (int i = 0; i < mc.Count; i++)
            //{
            //    // Take each Match out of the MatchCollection
            //    Match match = mc[i];

            //    str1 += "HREF:" + match.Groups[1].Value + "     " + match.Groups[2].Value + "      " + match.Groups[3].Value + "      " + match.Groups[4].Value + "\n";
            //    // Write the extracted results to a file; NOTE(review): this call is inside the loop, so the file is rewritten on every iteration
            //    File.WriteAllText("c:\\222.txt", str1);
            //}
            //if (mc.Count <= 0)
            //{
            //    Console.WriteLine("匹配失败");
            //}

            // Closing note from the author: before learning regular expressions, some string processing seemed absurdly hard; with regular expressions it becomes very convenient.

            // All experiments above are disabled, so the program just waits for a key press before exiting.
            Console.ReadKey();
        }
    }
}

posted @ 2012-03-15 12:52 话里阅读(1327) 评论(0) 收藏举报

刷新页面返回顶部

正则表达式 附取一个网页上所有a标签的href属性和innerHTML

公告

正则表达式附取一个网页上所有a标签的href属性和innerHTML