将Html代码转换为Text
在抓取 HTML 页面时,通常需要过滤掉其中的 HTML 标签,只保留源代码中的纯文本(Text)。使用正则表达式可以解决这个问题:
VB.NET
''' -----------------------------------------------------------------------------
''' <summary>
''' Removes all HTML tags from the supplied markup and returns the remaining text.
''' </summary>
''' <param name="HTML">The HTML source to strip.</param>
''' <returns>
''' A copy of <paramref name="HTML"/> in which every substring matching
''' <c>&lt;[^&gt;]*&gt;</c> has been replaced with an empty string.
''' </returns>
''' <remarks>
''' A tag is matched as "&lt;", any run of characters other than "&gt;", then "&gt;".
''' Consequently a "&gt;" inside an attribute value ends the match early, and
''' character entities such as &amp;amp; are left as they are.
''' Passing Nothing makes Regex.Replace throw an ArgumentNullException.
''' </remarks>
''' <history>
''' [Administrator] 2004-9-25 Created
''' </history>
''' -----------------------------------------------------------------------------
Public Function ParseTags(ByVal HTML As String) As String
    ' Replace is a Shared member of Regex, so it is invoked on the type itself;
    ' no Regex instance has to be declared or created for this call.
    Return System.Text.RegularExpressions.Regex.Replace(HTML, "<[^>]*>", "")
End Function
C#
/// <summary>
/// Strips every HTML tag from the supplied markup and returns the remaining text.
/// </summary>
/// <param name="HTML">The HTML source to strip.</param>
/// <returns>
/// A copy of <paramref name="HTML"/> in which every substring matching
/// <c>&lt;[^&gt;]*&gt;</c> has been replaced with an empty string.
/// </returns>
/// <remarks>
/// A tag is matched as "&lt;", any run of characters other than "&gt;", then "&gt;".
/// A "&gt;" inside an attribute value therefore ends the match early, and
/// character entities such as &amp;amp; are left as they are.
/// </remarks>
/// <exception cref="System.ArgumentNullException">
/// Thrown by the underlying regex call when <paramref name="HTML"/> is null.
/// </exception>
public string ParseTags(string HTML)
{
    // "<", then anything that is not ">", then the closing ">".
    const string tagPattern = "<[^>]*>";

    string withoutTags =
        System.Text.RegularExpressions.Regex.Replace(HTML, tagPattern, string.Empty);

    return withoutTags;
}
VB.NET
''' -----------------------------------------------------------------------------
''' <summary>
''' Removes all HTML tags from the supplied markup and returns the remaining text.
''' </summary>
''' <param name="HTML">The HTML source to strip.</param>
''' <returns>
''' A copy of <paramref name="HTML"/> in which every substring matching
''' <c>&lt;[^&gt;]*&gt;</c> has been replaced with an empty string.
''' </returns>
''' <remarks>
''' A tag is matched as "&lt;", any run of characters other than "&gt;", then "&gt;".
''' Consequently a "&gt;" inside an attribute value ends the match early, and
''' character entities such as &amp;amp; are left as they are.
''' Passing Nothing makes Regex.Replace throw an ArgumentNullException.
''' </remarks>
''' <history>
''' [Administrator] 2004-9-25 Created
''' </history>
''' -----------------------------------------------------------------------------
Public Function ParseTags(ByVal HTML As String) As String
    ' Replace is a Shared member of Regex, so it is invoked on the type itself;
    ' no Regex instance has to be declared or created for this call.
    Return System.Text.RegularExpressions.Regex.Replace(HTML, "<[^>]*>", "")
End Function
C#
/// <summary>
/// Strips every HTML tag from the supplied markup and returns the remaining text.
/// </summary>
/// <param name="HTML">The HTML source to strip.</param>
/// <returns>
/// A copy of <paramref name="HTML"/> in which every substring matching
/// <c>&lt;[^&gt;]*&gt;</c> has been replaced with an empty string.
/// </returns>
/// <remarks>
/// A tag is matched as "&lt;", any run of characters other than "&gt;", then "&gt;".
/// A "&gt;" inside an attribute value therefore ends the match early, and
/// character entities such as &amp;amp; are left as they are.
/// </remarks>
/// <exception cref="System.ArgumentNullException">
/// Thrown by the underlying regex call when <paramref name="HTML"/> is null.
/// </exception>
public string ParseTags(string HTML)
{
    // "<", then anything that is not ">", then the closing ">".
    const string tagPattern = "<[^>]*>";

    string withoutTags =
        System.Text.RegularExpressions.Regex.Replace(HTML, tagPattern, string.Empty);

    return withoutTags;
}



浙公网安备 33010602011771号