htmlagilitypack - Eliminando todas las etiquetas html con Html Agility Pack
html parser c# (5)
¿Por qué no devolver simplemente htmldoc.DocumentNode.InnerText
en lugar de eliminar todos los nodos que no son de texto? Eso debería darte lo que quieres.
Tengo una cadena html como esta:
<html><body><p>foo <a href='http://www.example.com'>bar</a> baz</p></body></html>
Deseo eliminar todas las etiquetas html para que la cadena resultante se convierta en:
foo bar baz
A partir de otra publicación aquí en SO, se me ocurrió esta función (que utiliza Html Agility Pack):
' Attempts to strip HTML tags by removing every html/body/p/a element from the
' parsed document and returning what is left of the document's content.
' This is the version posted in the question: for the sample input it is
' reported to return "bazbarfoo" instead of "foo bar baz".
Public Shared Function stripTags(ByVal html As String) As String
' NOTE(review): plain is assigned here but never read afterwards in this function.
Dim plain As String = String.Empty
Dim htmldoc As New HtmlAgilityPack.HtmlDocument
htmldoc.LoadHtml(html)
' Select every element matched by the XPath union of the four tag names.
Dim invalidNodes As HtmlAgilityPack.HtmlNodeCollection = htmldoc.DocumentNode.SelectNodes("//html|//body|//p|//a")
' NOTE(review): this tests htmldoc, which was constructed just above and so cannot
' be Nothing here; the guard was presumably meant for invalidNodes - confirm.
If Not htmldoc Is Nothing Then
' The nodes are visited in the order the collection enumerates them.
For Each node In invalidNodes
' Detach the node from its parent; the True argument presumably asks the library
' to keep the node's own children in the tree - verify against the library docs.
node.ParentNode.RemoveChild(node, True)
Next
End If
Return htmldoc.DocumentNode.WriteContentTo
End Function
Lamentablemente esto no devuelve lo que espero, sino que da:
bazbarfoo
Por favor, ¿dónde me equivoco? ¿Es este el mejor enfoque?
¡Saludos y feliz codificación!
ACTUALIZACIÓN: por la respuesta a continuación se me ocurrió esta función, podría ser útil para otros:
''' <summary>
''' Returns the text content of an HTML string with all tags removed.
''' </summary>
''' <param name="html">The HTML markup to strip. May be Nothing or empty.</param>
''' <returns>
''' The document's inner text. A blank line is inserted after every closing
''' paragraph tag and a line break replaces every br tag, so the visual
''' structure survives the stripping. Returns an empty string for Nothing or
''' empty input.
''' </returns>
Public Shared Function stripTags(ByVal html As String) As String
    If String.IsNullOrEmpty(html) Then
        Return String.Empty
    End If

    ' Build the separator by concatenation: New String(...) takes a Char, so
    ' passing Environment.NewLine to it would repeat at most the first
    ' character of the newline sequence (or fail under Option Strict On).
    Dim paragraphBreak As String = Environment.NewLine & Environment.NewLine

    ' Turn block/line breaks into real newlines before parsing, because the
    ' tags themselves are dropped from InnerText.
    Dim prepared As String = html.Replace("</p>", "</p>" & paragraphBreak)
    For Each lineBreakTag As String In New String() {"<br/>", "<br />", "<br>"}
        prepared = prepared.Replace(lineBreakTag, Environment.NewLine)
    Next

    Dim htmldoc As New HtmlAgilityPack.HtmlDocument
    htmldoc.LoadHtml(prepared)
    Return htmldoc.DocumentNode.InnerText
End Function
Edita unas pocas líneas, como se muestra a continuación, y obtendrás lo que quieres:
''' <summary>
''' Removes every element matched by <paramref name="xPath"/> from
''' <paramref name="html"/> and returns the remaining markup.
''' </summary>
''' <param name="html">The HTML markup to process.</param>
''' <param name="xPath">
''' XPath expression selecting the elements to remove. When Nothing or empty,
''' the markup is only parsed and written back.
''' </param>
''' <returns>The content of the document after the matched elements were removed.</returns>
Private Shared Function StripHtml(html As String, xPath As String) As String
    Dim htmlDoc As New HtmlAgilityPack.HtmlDocument()
    htmlDoc.LoadHtml(html)

    If Not String.IsNullOrEmpty(xPath) Then
        Dim invalidNodes As HtmlNodeCollection = htmlDoc.DocumentNode.SelectNodes(xPath)

        ' SelectNodes may hand back Nothing instead of an empty collection when
        ' the expression matches no node, so guard before touching Count.
        If invalidNodes IsNot Nothing Then
            ' Walk the collection from last to first. Removing the nodes with a
            ' forward For Each was observed to scramble the text order
            ' (result "bazbarfoo"); going backwards yields "foo bar baz".
            For i As Integer = invalidNodes.Count - 1 To 0 Step -1
                Dim current As HtmlNode = invalidNodes.Item(i)
                current.ParentNode.RemoveChild(current, True)
            Next
        End If
    End If

    Return htmlDoc.DocumentNode.WriteContentTo()
End Function
Elimina las etiquetas y propiedades que no se encuentran en la lista blanca.
''' <summary>
''' Removes the tags and attributes that are not on the whitelist. The content
''' of a removed tag is kept; only the tag itself is dropped.
''' </summary>
Public NotInheritable Class HtmlSanitizer

    ''' <summary>Name given to every element that is scheduled for removal.</summary>
    Private Const PlaceholderName As String = "removeableNode"

    ''' <summary>
    ''' Allowed tag names mapped to their allowed attribute names.
    ''' Nothing means the tag is allowed but none of its attributes are.
    ''' </summary>
    Private Shared ReadOnly Whitelist As IDictionary(Of String, String())

    ' Utility class: not meant to be instantiated.
    Private Sub New()
    End Sub

    Shared Sub New()
        ' NOTE(review): "href" values are not checked here, so a "javascript:"
        ' URL would pass through unchanged - confirm whether callers rely on
        ' this class for protection against script injection.
        Whitelist = New Dictionary(Of String, String())() From { _
            {"a", New String() {"href"}}, _
            {"strong", Nothing}, _
            {"em", Nothing}, _
            {"blockquote", Nothing}, _
            {"b", Nothing}, _
            {"p", Nothing}, _
            {"ul", Nothing}, _
            {"ol", Nothing}, _
            {"li", Nothing}, _
            {"div", New String() {"align"}}, _
            {"strike", Nothing}, _
            {"u", Nothing}, _
            {"sub", Nothing}, _
            {"sup", Nothing}, _
            {"table", Nothing}, _
            {"tr", Nothing}, _
            {"td", Nothing}, _
            {"th", Nothing} _
        }
    End Sub

    ''' <summary>
    ''' Sanitizes an HTML string against the whitelist.
    ''' </summary>
    ''' <param name="input">The HTML markup to sanitize. May be Nothing.</param>
    ''' <returns>
    ''' The markup with non-whitelisted tags and attributes removed, or an empty
    ''' string when <paramref name="input"/> is Nothing, empty or whitespace.
    ''' </returns>
    Public Shared Function Sanitize(input As String) As String
        If String.IsNullOrWhiteSpace(input) Then
            Return String.Empty
        End If

        Dim htmlDocument = New HtmlDocument()
        htmlDocument.LoadHtml(input)

        ' Per-call set instead of a shared list: nothing accumulates between
        ' calls and concurrent callers do not share mutable state.
        Dim removableNames As New HashSet(Of String)()
        SanitizeNode(htmlDocument.DocumentNode, removableNames)

        Dim xPath As String = CreateXPath(removableNames)
        Return StripHtml(htmlDocument.DocumentNode.WriteTo().Trim(), xPath)
    End Function

    ''' <summary>Sanitizes every child of <paramref name="parentNode"/>, last child first.</summary>
    Private Shared Sub SanitizeChildren(parentNode As HtmlNode, removableNames As ISet(Of String))
        For i As Integer = parentNode.ChildNodes.Count - 1 To 0 Step -1
            SanitizeNode(parentNode.ChildNodes(i), removableNames)
        Next
    End Sub

    ''' <summary>
    ''' Marks a non-whitelisted element for removal by renaming it, or strips the
    ''' disallowed attributes of a whitelisted one, then recurses into its children.
    ''' </summary>
    Private Shared Sub SanitizeNode(node As HtmlNode, removableNames As ISet(Of String))
        If node.NodeType = HtmlNodeType.Element Then
            Dim allowedAttributes As String() = Nothing

            If Not Whitelist.TryGetValue(node.Name, allowedAttributes) Then
                node.Name = PlaceholderName
                ' Record the name as the node reports it after the assignment,
                ' so the XPath built later matches what the library stores.
                removableNames.Add(node.Name)
            ElseIf node.HasAttributes Then
                ' Iterate backwards because attributes are removed while looping.
                For i As Integer = node.Attributes.Count - 1 To 0 Step -1
                    Dim currentAttribute As HtmlAttribute = node.Attributes(i)
                    If allowedAttributes Is Nothing OrElse Not allowedAttributes.Contains(currentAttribute.Name) Then
                        node.Attributes.Remove(currentAttribute)
                    End If
                Next
            End If
        End If

        If node.HasChildNodes Then
            SanitizeChildren(node, removableNames)
        End If
    End Sub

    ''' <summary>
    ''' Removes every element matched by <paramref name="xPath"/> from
    ''' <paramref name="html"/>, keeping the children of the removed elements.
    ''' </summary>
    Private Shared Function StripHtml(html As String, xPath As String) As String
        Dim htmlDoc As New HtmlDocument()
        htmlDoc.LoadHtml(html)

        If Not String.IsNullOrEmpty(xPath) Then
            Dim invalidNodes As HtmlNodeCollection = htmlDoc.DocumentNode.SelectNodes(xPath)

            ' SelectNodes may return Nothing when no node matches.
            If invalidNodes IsNot Nothing Then
                ' Remove from last to first: removing in forward order was
                ' observed to reverse the text ("bazbarfoo" instead of "foo bar baz").
                For i As Integer = invalidNodes.Count - 1 To 0 Step -1
                    Dim current As HtmlNode = invalidNodes.Item(i)
                    current.ParentNode.RemoveChild(current, True)
                Next
            End If
        End If

        Return htmlDoc.DocumentNode.WriteContentTo()
    End Function

    ''' <summary>
    ''' Builds an XPath union ("//a|//b") selecting every element whose name is
    ''' in <paramref name="nodeNames"/>; returns an empty string for no names.
    ''' </summary>
    Private Shared Function CreateXPath(nodeNames As IEnumerable(Of String)) As String
        Return String.Join("|", nodeNames.Select(Function(name) "//" & name))
    End Function
End Class
Parece que asumes que For Each recorre el documento de principio a fin... Si quieres asegurarte de ello, usa un bucle For normal. Ni siquiera puedes estar seguro de que los nodos se recojan en el orden que esperas con el selector XPath, aunque es posible que tengas razón en esta ocasión...
Saludos, Brunis
Puede utilizar el siguiente código.
/// <summary>
/// Removes everything that looks like a markup tag from <paramref name="source"/>.
/// </summary>
/// <param name="source">The text to strip. May be <c>null</c>.</param>
/// <returns>
/// The input with every <c>&lt;...&gt;</c> span deleted, or an empty string when
/// <paramref name="source"/> is <c>null</c> or empty.
/// </returns>
/// <remarks>
/// This is a plain text match, not an HTML parser: a tag ends at the first
/// <c>&gt;</c> found, even if that character sits inside an attribute value.
/// </remarks>
public string RemoveHTMLTags(string source)
{
    if (string.IsNullOrEmpty(source))
    {
        return string.Empty;
    }

    // "[^>]*" rather than ".*?": '.' does not match '\n' by default, so a tag
    // broken across several lines would otherwise be left in the output.
    const string TagPattern = "<[^>]*>";
    return Regex.Replace(source, TagPattern, string.Empty);
}