根据当前页面url匹配出页面内链接地址的完整路径
该类的功能相当于浏览器对当前页面内链接地址的解析功能。
主要方法是 字符串的操作,正则表达式的匹配和替换。
比如说当前页为:http://www.test.com/list/page.aspx?id=12,其页面内链接(BranchUrl)及解析结果(Result)为:
| BranchUrl | Result |
| /default.aspx?id=14 | http://www.test.com/default.aspx?id=14 |
| ../details.aspx?id=4 | http://www.test.com/details.aspx?id=4 |
| dete.aspx | http://www.test.com/list/dete.aspx |
该类C#代码:
using System;
using System;
using System.Collections;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
public class Utility
{
    /// <summary>
    /// Matches a link that already starts with one of the supported schemes and
    /// therefore needs no resolution. Built once instead of on every call.
    /// </summary>
    private static readonly Regex AbsoluteUrlPattern =
        new Regex(@"^(http|https|ftp|rtsp|mms)://", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    /// <summary>
    /// Resolves a link found inside a page against the address of that page,
    /// roughly the way a browser resolves an href.
    /// </summary>
    /// <remarks>
    /// Resolution rules, in order:
    /// 1. A link starting with http/https/ftp/rtsp/mms:// is returned as is (trimmed).
    /// 2. A protocol-relative link ("//host/path") inherits the scheme of the base.
    /// 3. A root-relative link ("/test.aspx") is appended to the scheme and host of the base.
    /// 4. Any other link is appended to the directory of the base; leading "./" segments
    ///    are dropped and each leading "../" climbs one directory, never above the host.
    /// The query string and fragment of the base never take part in the resolution.
    /// A last base segment without an extension (".../list") is treated as a directory.
    /// Dot segments in the middle of the link ("a/../b") are not normalised.
    /// </remarks>
    /// <param name="BaseUrl">Address of the current page, e.g. http://www.test.com/list/page.aspx </param>
    /// <param name="BranchUrl">Link found in the page, e.g. ../test.aspx</param>
    /// <returns>The resolved address of the link.</returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="BranchUrl"/> is null, or <paramref name="BaseUrl"/> is null while
    /// the link is not already absolute.
    /// </exception>
    public static string StickUrl(string BaseUrl, string BranchUrl)
    {
        if (BranchUrl == null)
        {
            throw new ArgumentNullException("BranchUrl");
        }

        // Trim once, up front, so that the classification below and the value
        // that is finally concatenated are always the same string.
        BranchUrl = BranchUrl.Trim();
        if (AbsoluteUrlPattern.IsMatch(BranchUrl))
        {
            return BranchUrl;
        }

        if (BaseUrl == null)
        {
            throw new ArgumentNullException("BaseUrl");
        }

        BaseUrl = StripQueryAndFragment(BaseUrl.Trim().Replace("\\", "/"));
        BranchUrl = BranchUrl.Replace("\\", "/");

        // "//host/path": keep the link's own host and borrow only the scheme of the base.
        if (BranchUrl.StartsWith("//", StringComparison.Ordinal))
        {
            int baseHostStart = GetHostStart(BaseUrl);
            if (baseHostStart > 0)
            {
                return BaseUrl.Substring(0, baseHostStart - 2) + BranchUrl;
            }
            return BranchUrl;
        }

        // "/test.aspx": relative to the site root.
        if (BranchUrl.StartsWith("/", StringComparison.Ordinal))
        {
            return GetLastUrl(BaseUrl, BranchUrl);
        }

        string siteRoot = GetSiteRoot(BaseUrl);
        string directory = GetDirectory(BaseUrl);

        // Consume every leading dot segment, not just the first one.
        while (true)
        {
            if (BranchUrl.StartsWith("../", StringComparison.Ordinal))
            {
                BranchUrl = BranchUrl.Substring(3);
                // Climb one level, but never above the host: surplus "../" are ignored.
                if (directory.Length > siteRoot.Length)
                {
                    directory = directory.Substring(0, directory.LastIndexOf('/'));
                }
            }
            else if (BranchUrl.StartsWith("./", StringComparison.Ordinal))
            {
                BranchUrl = BranchUrl.Substring(2);
            }
            else
            {
                break;
            }
        }

        return directory + "/" + BranchUrl;
    }

    /// <summary>
    /// Resolves a root-relative link such as "/test.aspx" against the scheme and host of the base.
    /// </summary>
    /// <param name="BaseUrl">Address of the current page, e.g. http://www.test.com/list/page.aspx</param>
    /// <param name="BranchUrl">Root-relative link, e.g. "/test.aspx"</param>
    /// <returns>Scheme and host of the base followed by the link, e.g. http://www.test.com/test.aspx</returns>
    private static string GetLastUrl(string BaseUrl, string BranchUrl)
    {
        return GetSiteRoot(BaseUrl) + "/" + BranchUrl.TrimStart('/');
    }

    /// <summary>
    /// Returns the index at which the host starts: just after "scheme://",
    /// or 0 when the address carries no scheme.
    /// </summary>
    private static int GetHostStart(string url)
    {
        // Only a "://" that contains the very first slash counts as the scheme
        // separator; a "//" further down the path must not be mistaken for it.
        int firstSlash = url.IndexOf('/');
        bool hasScheme = firstSlash > 0
            && url[firstSlash - 1] == ':'
            && firstSlash + 1 < url.Length
            && url[firstSlash + 1] == '/';
        return hasScheme ? firstSlash + 2 : 0;
    }

    /// <summary>
    /// Returns the scheme and host of the address without a trailing slash,
    /// e.g. "http://www.test.com" for "http://www.test.com/list/page.aspx".
    /// </summary>
    private static string GetSiteRoot(string url)
    {
        int pathStart = url.IndexOf('/', GetHostStart(url));
        return pathStart < 0 ? url : url.Substring(0, pathStart);
    }

    /// <summary>
    /// Returns the directory of the address without a trailing slash. The last path
    /// segment is dropped only when it looks like a file name ("name.ext"); a segment
    /// without an extension is treated as a directory. The host is never inspected,
    /// so "www.test.com" is not mistaken for a file name.
    /// </summary>
    private static string GetDirectory(string url)
    {
        int pathStart = url.IndexOf('/', GetHostStart(url));
        if (pathStart < 0)
        {
            // Host only, no path at all.
            return url;
        }

        int lastSlash = url.LastIndexOf('/');
        string lastSegment = url.Substring(lastSlash + 1);
        if (lastSegment.Length == 0)
        {
            // The address ends with "/": it already names a directory.
            return url.Substring(0, lastSlash);
        }

        // File name: at least one character before the first dot and one after it.
        int firstDot = lastSegment.IndexOf('.');
        bool looksLikeFile = firstDot > 0 && firstDot < lastSegment.Length - 1;
        return looksLikeFile ? url.Substring(0, lastSlash) : url;
    }

    /// <summary>
    /// Removes the query string and the fragment, which may themselves contain
    /// "/" or "." and must not be read as path segments.
    /// </summary>
    private static string StripQueryAndFragment(string url)
    {
        int cut = url.IndexOfAny(new char[] { '?', '#' });
        return cut < 0 ? url : url.Substring(0, cut);
    }
}
主要功能借鉴于:dotNETCMSv1.0sp5 CMS。源码下载地址:http://www.51aspx.com/CV/dotNETCMS10sp5
在数据采集的时候,在文章列表页中匹配出文章内容页的完整路径。

浙公网安备 33010602011771号