using System;
using System.Text.RegularExpressions;
/// <summary>
/// NoFollow contains the functionality to add rel=nofollow to untrusted links
/// </summary>
public static class NoFollow
{
    // The white list of host names (in lower case). Links whose host is listed
    // here are trusted and are returned unmodified.
    private static readonly string[] whitelist =
        { "seoasp", "www.seoegghead.com", "www.cristiandarie.ro" };

    // Matches one opening <a ...> tag.
    //  - The lookahead requires whitespace, '/' or '>' right after "<a", so
    //    <abbr>, <address>, <area> and custom elements such as <a-x> are not
    //    mistaken for links.
    //  - Quoted attribute values are consumed as a unit, so a '>' inside a
    //    quoted value does not end the tag early.
    //  - Character classes (unlike '.') also match line breaks, so tags that
    //    span several lines are processed too.
    private static readonly Regex anchorTag = new Regex(
        @"<a(?=[\s/>])(?:[^>'""]|""[^""]*""|'[^']*')*>",
        RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);

    // Matches one attribute inside a tag: a name, optionally followed by
    // = and a double-quoted, single-quoted or unquoted value. Because a quoted
    // value is consumed together with its attribute, text such as "rel=" that
    // merely appears inside a URL is never reported as an attribute of its own.
    private static readonly Regex attribute = new Regex(
        @"(?<name>[^\s=/>'""]+)(?:\s*=\s*(?:""(?<value>[^""]*)""|'(?<value>[^']*)'|(?<value>[^\s>'""]+)))?");

    // Separators between the tokens of a rel attribute value.
    private static readonly char[] whitespace = { ' ', '\t', '\r', '\n', '\f' };

    /// <summary>
    /// Finds every opening &lt;a&gt; tag in <paramref name="input"/> and adds the
    /// <c>nofollow</c> link type to those that point to an external http(s)
    /// host which is not in the white list.
    /// </summary>
    /// <param name="input">HTML fragment to process; may be null or empty.</param>
    /// <returns>
    /// The processed HTML. A null or empty <paramref name="input"/> is returned
    /// unchanged.
    /// </returns>
    public static string FixLinks(string input)
    {
        if (string.IsNullOrEmpty(input))
        {
            return input;
        }
        return anchorTag.Replace(input, new MatchEvaluator(fixLink));
    }

    // Receives a Regex match that contains an opening link tag such as
    // <a href="http://too.much.spam/"> and adds rel=nofollow if needed.
    // Returns the tag text to put back into the document.
    private static string fixLink(Match linkMatch)
    {
        string tag = linkMatch.Value;

        // Walk the attributes left to right, starting right after "<a".
        // Only the first href and the first rel are remembered; later
        // duplicates are skipped.
        string href = null;
        Match rel = null;
        for (Match m = attribute.Match(tag, 2); m.Success; m = m.NextMatch())
        {
            string name = m.Groups["name"].Value;
            if (href == null
                && string.Equals(name, "href", StringComparison.OrdinalIgnoreCase))
            {
                href = m.Groups["value"].Value;
            }
            else if (rel == null
                && string.Equals(name, "rel", StringComparison.OrdinalIgnoreCase))
            {
                rel = m;
            }
        }

        // No target, a local target, or a white-listed host: leave the tag alone.
        if (href == null || !isUntrustedUrl(href))
        {
            return tag;
        }

        // If the rel attribute already lists nofollow, return the tag as it is.
        string relValue = rel == null ? string.Empty : rel.Groups["value"].Value;
        foreach (string token in
            relValue.Split(whitespace, StringSplitOptions.RemoveEmptyEntries))
        {
            if (string.Equals(token, "nofollow", StringComparison.OrdinalIgnoreCase))
            {
                return tag;
            }
        }

        // No rel attribute at all: add one right after the tag name.
        if (rel == null)
        {
            return tag.Insert(2, @" rel=""nofollow""");
        }

        // Otherwise append nofollow to the existing value, so that tokens such
        // as noopener or noreferrer survive. The attribute is rewritten in
        // quoted form because the new value contains a space.
        string trimmed = relValue.Trim(whitespace);
        string newValue = trimmed.Length == 0 ? "nofollow" : trimmed + " nofollow";
        // A single-quoted original value may contain a double quote; pick the
        // quote character that cannot clash with the value.
        char quote = newValue.IndexOf('"') >= 0 ? '\'' : '"';
        return tag.Substring(0, rel.Index)
            + rel.Groups["name"].Value + "=" + quote + newValue + quote
            + tag.Substring(rel.Index + rel.Length);
    }

    // Decides whether an href value points to an external host that is not in
    // the white list. Anything that is not an http:, https: or scheme-relative
    // (//host) URL is treated as a local link and therefore trusted.
    // NOTE(review): HTML character references in the href (for example
    // &#104;ttp://...) are not decoded here - confirm whether callers decode
    // or sanitize the markup before it reaches this class.
    private static bool isUntrustedUrl(string href)
    {
        // Assumption: browsers ignore surrounding whitespace and read '\' as '/'
        // in http(s) URLs, so the value is normalized the same way before it is
        // classified. The normalized text is only used for the check.
        string url = href.Trim().Replace('\\', '/');

        if (url.StartsWith("//", StringComparison.Ordinal))
        {
            // Scheme-relative URL: any scheme will do to extract the host.
            url = "http:" + url;
        }
        else if (!url.StartsWith("http:", StringComparison.OrdinalIgnoreCase)
            && !url.StartsWith("https:", StringComparison.OrdinalIgnoreCase))
        {
            return false;
        }

        // A URL that cannot be parsed has no host that could be white-listed,
        // so it is treated as untrusted instead of throwing.
        Uri uri;
        if (!Uri.TryCreate(url, UriKind.Absolute, out uri))
        {
            return true;
        }

        return Array.IndexOf(whitelist, uri.Host.ToLowerInvariant()) < 0;
    }
}