sqlserver - ¿Cuál es la mejor manera de quitar las etiquetas HTML de una cadena en SQL Server?
substring string sql server (7)
¿Qué le parece usar XQuery en una sola línea?:
-- Walks every node of @xml (an xml variable declared by the caller) and returns
-- only its text() nodes. The result is of type xml, not a string.
select @xml.query('for $x in //. return ($x)//text()')
Esto recorre todos los elementos y devuelve solamente los nodos text().
Para evitar texto entre elementos concatenados sin espacios, use:
-- Same traversal as the plain text() query, but each node contributes its first
-- text() node followed by a single space, so adjacent pieces do not run together.
-- The result is of type xml, not a string.
SELECT @xml.query('for $x in //. return concat((($x)//text())[1]," ")')
Son muy útiles cuando se quieren construir frases de búsqueda, quitar el HTML, etc.
Solo tenga en cuenta que esto devuelve el tipo xml; por lo tanto, use CAST o CONVERT para convertirlo a texto donde corresponda. La versión xml de este resultado es inútil, ya que no es un XML bien formado.
Tengo datos en SQL Server 2005 que contienen etiquetas html y me gustaría quitar todo eso, dejando solo el texto entre las etiquetas. Lo ideal sería reemplazar también cosas como &lt; por <, &amp; por &, etc.
¿Hay una manera fácil de hacer esto o alguien ya tiene un código t-sql de muestra?
No tengo la posibilidad de agregar procedimientos almacenados extendidos ni nada similar, por lo que preferiría un enfoque de T-SQL puro (preferiblemente uno compatible con SQL Server 2000).
Solo quiero recuperar los datos con html eliminado, no actualizarlos, por lo que idealmente se escribiría como una función definida por el usuario, para facilitar su reutilización.
Entonces, por ejemplo, convertir esto:
<B>Some useful text</B>
<A onclick="return openInfo(this)"
href="http://there.com/3ce984e88d0531bac5349"
target=globalhelp>
<IMG title="Source Description" height=15 alt="Source Description"
src="/ri/new_info.gif" width=15 align=top border=0>
</A>&gt; <b>more text</b></TD></TR>
a esto:
Some useful text > more text
Aquí hay una versión actualizada de esta función que incorpora la respuesta de RedFilter (original de Pinal) con las adiciones de LazyCoders, las correcciones de errores tipográficos de goodeye y mi propia adición para manejar etiquetas <STYLE> en línea dentro del HTML.
-- Strips HTML markup from @HTMLText and returns the remaining plain text.
--
-- Parameters:
--   @HTMLText  HTML fragment to clean. NULL yields NULL.
-- Returns:
--   The text with <br> variants turned into CR+LF, <STYLE>...</STYLE> blocks and
--   all other <...> tags removed, the entities &lt; &gt; &nbsp; &amp; decoded,
--   and leading/trailing spaces trimmed.
--
-- Order of operations matters:
--   * Tags are stripped BEFORE entities are decoded, so a literal "&lt;" in the
--     text is not mistaken for the start of a tag once it has become "<".
--   * "&amp;" is decoded LAST, so each entity is decoded exactly once
--     ("&amp;lt;" becomes the text "&lt;", not "<").
-- Matching of tag names follows the collation in effect (case-insensitive on a
-- default installation).
ALTER FUNCTION [dbo].[udf_StripHTML]
(
    @HTMLText varchar(MAX)
)
RETURNS varchar(MAX)
AS
BEGIN
    DECLARE @Start int
    DECLARE @End int

    -- Replace the line-break tag variants with a newline before tags are removed.
    SET @HTMLText = REPLACE(@HTMLText, '<br>', CHAR(13) + CHAR(10))
    SET @HTMLText = REPLACE(@HTMLText, '<br/>', CHAR(13) + CHAR(10))
    SET @HTMLText = REPLACE(@HTMLText, '<br />', CHAR(13) + CHAR(10))

    -- Remove everything between <STYLE ...> and </STYLE>, tags included.
    -- The closing tag is searched from the opening tag onwards; an unclosed
    -- <STYLE is left for the generic tag loop instead of deleting unrelated text.
    SET @Start = CHARINDEX('<STYLE', @HTMLText)
    SET @End = CHARINDEX('</STYLE>', @HTMLText, @Start)
    WHILE (@Start > 0 AND @End > 0) BEGIN
        -- 8 = length of '</STYLE>'
        SET @HTMLText = STUFF(@HTMLText, @Start, (@End - @Start) + 8, '')
        SET @Start = CHARINDEX('<STYLE', @HTMLText)
        SET @End = CHARINDEX('</STYLE>', @HTMLText, @Start)
    END

    -- Remove every remaining <whatever> tag. Nothing before @Start contains '<',
    -- so the next search can resume from @Start instead of rescanning the string.
    SET @Start = CHARINDEX('<', @HTMLText)
    SET @End = CHARINDEX('>', @HTMLText, @Start)
    WHILE (@Start > 0 AND @End > 0) BEGIN
        SET @HTMLText = STUFF(@HTMLText, @Start, (@End - @Start) + 1, '')
        SET @Start = CHARINDEX('<', @HTMLText, @Start)
        SET @End = CHARINDEX('>', @HTMLText, @Start)
    END

    -- Decode the HTML entities; '&amp;' must stay last (see header).
    SET @HTMLText = REPLACE(@HTMLText, '&lt;', '<')
    SET @HTMLText = REPLACE(@HTMLText, '&gt;', '>')
    SET @HTMLText = REPLACE(@HTMLText, '&nbsp;', ' ')
    SET @HTMLText = REPLACE(@HTMLText, '&amp;', '&')

    RETURN LTRIM(RTRIM(@HTMLText))
END
Derivado de la respuesta de @Goner Doug, con algunas cosas actualizadas:
- usando REPLACE cuando sea posible
- conversión de entidades predefinidas como &eacute;
(Elegí las que necesitaba :-)
- Alguna conversión de las etiquetas de lista <ul> y <li>
-- Strips HTML markup from @HTMLText and returns the remaining plain text.
--
-- Parameters:
--   @HTMLText  HTML fragment to clean. NULL yields NULL.
-- Returns:
--   The text with <br> variants and </li> turned into CR+LF, <li> turned into
--   "- ", <STYLE>...</STYLE> blocks and all other <...> tags removed, a selected
--   set of named entities decoded, and leading/trailing spaces trimmed.
--
-- Notes:
--   * Tags are stripped BEFORE entities are decoded, so "&lt;" / "&lsaquo;" in the
--     text are not mistaken for the start of a tag once they have become "<".
--   * "&amp;" is decoded LAST so that each entity is decoded exactly once.
--   * Entity names are case-sensitive (&Eacute; vs &eacute;), hence the explicit
--     case-sensitive collation on every entity REPLACE.
ALTER FUNCTION [dbo].[udf_StripHTML]
--by Patrick Honorez --- www.idevlop.com
--inspired by http://stackoverflow.com/questions/457701/best-way-to-strip-html-tags-from-a-string-in-sql-server/39253602#39253602
(
    @HTMLText varchar(MAX)
)
RETURNS varchar(MAX)
AS
BEGIN
    DECLARE @Start int
    DECLARE @End int

    -- Structural tags that should leave a trace in the plain text.
    SET @HTMLText = REPLACE(@HTMLText, '<br>', CHAR(13) + CHAR(10))
    SET @HTMLText = REPLACE(@HTMLText, '<br/>', CHAR(13) + CHAR(10))
    SET @HTMLText = REPLACE(@HTMLText, '<br />', CHAR(13) + CHAR(10))
    SET @HTMLText = REPLACE(@HTMLText, '<li>', '- ')
    SET @HTMLText = REPLACE(@HTMLText, '</li>', CHAR(13) + CHAR(10))

    -- Remove everything between <STYLE ...> and </STYLE>, tags included.
    -- The closing tag is searched from the opening tag onwards; an unclosed
    -- <STYLE is left for the generic tag loop instead of deleting unrelated text.
    SET @Start = CHARINDEX('<STYLE', @HTMLText)
    SET @End = CHARINDEX('</STYLE>', @HTMLText, @Start)
    WHILE (@Start > 0 AND @End > 0) BEGIN
        -- 8 = length of '</STYLE>'
        SET @HTMLText = STUFF(@HTMLText, @Start, (@End - @Start) + 8, '')
        SET @Start = CHARINDEX('<STYLE', @HTMLText)
        SET @End = CHARINDEX('</STYLE>', @HTMLText, @Start)
    END

    -- Remove every remaining <whatever> tag. Nothing before @Start contains '<',
    -- so the next search can resume from @Start.
    SET @Start = CHARINDEX('<', @HTMLText)
    SET @End = CHARINDEX('>', @HTMLText, @Start)
    WHILE (@Start > 0 AND @End > 0) BEGIN
        SET @HTMLText = STUFF(@HTMLText, @Start, (@End - @Start) + 1, '')
        SET @Start = CHARINDEX('<', @HTMLText, @Start)
        SET @End = CHARINDEX('>', @HTMLText, @Start)
    END

    -- Decode named entities (case-sensitive match).
    SET @HTMLText = REPLACE(@HTMLText, '&rsquo;' collate Latin1_General_CS_AS, '''' collate Latin1_General_CS_AS)
    SET @HTMLText = REPLACE(@HTMLText, '&quot;' collate Latin1_General_CS_AS, '"' collate Latin1_General_CS_AS)
    SET @HTMLText = REPLACE(@HTMLText, '&euro;' collate Latin1_General_CS_AS, '€' collate Latin1_General_CS_AS)
    SET @HTMLText = REPLACE(@HTMLText, '&lt;' collate Latin1_General_CS_AS, '<' collate Latin1_General_CS_AS)
    SET @HTMLText = REPLACE(@HTMLText, '&gt;' collate Latin1_General_CS_AS, '>' collate Latin1_General_CS_AS)
    SET @HTMLText = REPLACE(@HTMLText, '&oelig;' collate Latin1_General_CS_AS, 'oe' collate Latin1_General_CS_AS)
    SET @HTMLText = REPLACE(@HTMLText, '&nbsp;' collate Latin1_General_CS_AS, ' ' collate Latin1_General_CS_AS)
    SET @HTMLText = REPLACE(@HTMLText, '&copy;' collate Latin1_General_CS_AS, '©' collate Latin1_General_CS_AS)
    SET @HTMLText = REPLACE(@HTMLText, '&laquo;' collate Latin1_General_CS_AS, '«' collate Latin1_General_CS_AS)
    SET @HTMLText = REPLACE(@HTMLText, '&reg;' collate Latin1_General_CS_AS, '®' collate Latin1_General_CS_AS)
    SET @HTMLText = REPLACE(@HTMLText, '&plusmn;' collate Latin1_General_CS_AS, '±' collate Latin1_General_CS_AS)
    SET @HTMLText = REPLACE(@HTMLText, '&sup2;' collate Latin1_General_CS_AS, '²' collate Latin1_General_CS_AS)
    SET @HTMLText = REPLACE(@HTMLText, '&sup3;' collate Latin1_General_CS_AS, '³' collate Latin1_General_CS_AS)
    SET @HTMLText = REPLACE(@HTMLText, '&micro;' collate Latin1_General_CS_AS, 'µ' collate Latin1_General_CS_AS)
    SET @HTMLText = REPLACE(@HTMLText, '&middot;' collate Latin1_General_CS_AS, '·' collate Latin1_General_CS_AS)
    SET @HTMLText = REPLACE(@HTMLText, '&ordm;' collate Latin1_General_CS_AS, 'º' collate Latin1_General_CS_AS)
    SET @HTMLText = REPLACE(@HTMLText, '&raquo;' collate Latin1_General_CS_AS, '»' collate Latin1_General_CS_AS)
    SET @HTMLText = REPLACE(@HTMLText, '&frac14;' collate Latin1_General_CS_AS, '¼' collate Latin1_General_CS_AS)
    SET @HTMLText = REPLACE(@HTMLText, '&frac12;' collate Latin1_General_CS_AS, '½' collate Latin1_General_CS_AS)
    SET @HTMLText = REPLACE(@HTMLText, '&frac34;' collate Latin1_General_CS_AS, '¾' collate Latin1_General_CS_AS)
    -- The entity is spelled &AElig; (capital A and E, trailing semicolon).
    SET @HTMLText = REPLACE(@HTMLText, '&AElig;' collate Latin1_General_CS_AS, 'Æ' collate Latin1_General_CS_AS)
    SET @HTMLText = REPLACE(@HTMLText, '&Ccedil;' collate Latin1_General_CS_AS, 'Ç' collate Latin1_General_CS_AS)
    SET @HTMLText = REPLACE(@HTMLText, '&Egrave;' collate Latin1_General_CS_AS, 'È' collate Latin1_General_CS_AS)
    SET @HTMLText = REPLACE(@HTMLText, '&Eacute;' collate Latin1_General_CS_AS, 'É' collate Latin1_General_CS_AS)
    SET @HTMLText = REPLACE(@HTMLText, '&Ecirc;' collate Latin1_General_CS_AS, 'Ê' collate Latin1_General_CS_AS)
    SET @HTMLText = REPLACE(@HTMLText, '&Ouml;' collate Latin1_General_CS_AS, 'Ö' collate Latin1_General_CS_AS)
    SET @HTMLText = REPLACE(@HTMLText, '&agrave;' collate Latin1_General_CS_AS, 'à' collate Latin1_General_CS_AS)
    SET @HTMLText = REPLACE(@HTMLText, '&acirc;' collate Latin1_General_CS_AS, 'â' collate Latin1_General_CS_AS)
    SET @HTMLText = REPLACE(@HTMLText, '&auml;' collate Latin1_General_CS_AS, 'ä' collate Latin1_General_CS_AS)
    SET @HTMLText = REPLACE(@HTMLText, '&aelig;' collate Latin1_General_CS_AS, 'æ' collate Latin1_General_CS_AS)
    SET @HTMLText = REPLACE(@HTMLText, '&ccedil;' collate Latin1_General_CS_AS, 'ç' collate Latin1_General_CS_AS)
    SET @HTMLText = REPLACE(@HTMLText, '&egrave;' collate Latin1_General_CS_AS, 'è' collate Latin1_General_CS_AS)
    SET @HTMLText = REPLACE(@HTMLText, '&eacute;' collate Latin1_General_CS_AS, 'é' collate Latin1_General_CS_AS)
    SET @HTMLText = REPLACE(@HTMLText, '&ecirc;' collate Latin1_General_CS_AS, 'ê' collate Latin1_General_CS_AS)
    SET @HTMLText = REPLACE(@HTMLText, '&euml;' collate Latin1_General_CS_AS, 'ë' collate Latin1_General_CS_AS)
    SET @HTMLText = REPLACE(@HTMLText, '&icirc;' collate Latin1_General_CS_AS, 'î' collate Latin1_General_CS_AS)
    SET @HTMLText = REPLACE(@HTMLText, '&ocirc;' collate Latin1_General_CS_AS, 'ô' collate Latin1_General_CS_AS)
    SET @HTMLText = REPLACE(@HTMLText, '&ouml;' collate Latin1_General_CS_AS, 'ö' collate Latin1_General_CS_AS)
    SET @HTMLText = REPLACE(@HTMLText, '&divide;' collate Latin1_General_CS_AS, '÷' collate Latin1_General_CS_AS)
    SET @HTMLText = REPLACE(@HTMLText, '&oslash;' collate Latin1_General_CS_AS, 'ø' collate Latin1_General_CS_AS)
    SET @HTMLText = REPLACE(@HTMLText, '&ugrave;' collate Latin1_General_CS_AS, 'ù' collate Latin1_General_CS_AS)
    SET @HTMLText = REPLACE(@HTMLText, '&uacute;' collate Latin1_General_CS_AS, 'ú' collate Latin1_General_CS_AS)
    SET @HTMLText = REPLACE(@HTMLText, '&ucirc;' collate Latin1_General_CS_AS, 'û' collate Latin1_General_CS_AS)
    SET @HTMLText = REPLACE(@HTMLText, '&uuml;' collate Latin1_General_CS_AS, 'ü' collate Latin1_General_CS_AS)
    SET @HTMLText = REPLACE(@HTMLText, '&lsaquo;' collate Latin1_General_CS_AS, '<' collate Latin1_General_CS_AS)
    SET @HTMLText = REPLACE(@HTMLText, '&rsaquo;' collate Latin1_General_CS_AS, '>' collate Latin1_General_CS_AS)
    -- '&amp;' last: decoding it earlier would create new entities to decode.
    SET @HTMLText = REPLACE(@HTMLText, '&amp;' collate Latin1_General_CS_AS, '&' collate Latin1_General_CS_AS)

    RETURN LTRIM(RTRIM(@HTMLText))
END
Esta no es una solución completamente nueva, sino una corrección de la solución de afwebservant:
-- Removes every <...> tag from @HTMLText except <BR> and </BR>, which are kept
-- verbatim, and returns the result with leading/trailing spaces trimmed.
-- NULL input yields NULL. Only the exact 4-character form <BR> is recognised;
-- <br/> and <br /> are removed like any other tag.
--note comments to see the corrections
CREATE FUNCTION [dbo].[StripHTML] (@HTMLText VARCHAR(MAX))
RETURNS VARCHAR(MAX)
AS
BEGIN
    DECLARE @Start INT
    DECLARE @End INT
    DECLARE @Length INT

    SET @Start = CHARINDEX('<', @HTMLText)
    SET @End = CHARINDEX('>', @HTMLText, @Start)
    SET @Length = (@End - @Start) + 1

    WHILE @Start > 0 AND @End > 0 AND @Length > 0
    BEGIN
        IF (UPPER(SUBSTRING(@HTMLText, @Start, 4)) <> '<BR>') AND (UPPER(SUBSTRING(@HTMLText, @Start, 5)) <> '</BR>')
        BEGIN
            SET @HTMLText = STUFF(@HTMLText, @Start, @Length, '')
        END
        -- this ELSE and SET is important: a kept tag removes nothing, so the
        -- search position below must not be shifted back
        ELSE
            SET @Length = 0;

        -- minus @Length here below is important: the string shrank by @Length
        -- characters, so the scan resumes where the removed tag used to start
        SET @Start = CHARINDEX('<', @HTMLText, @End - @Length)
        SET @End = CHARINDEX('>', @HTMLText, @Start)
        -- instead of -1 it should be +1 (inclusive span from '<' to '>')
        SET @Length = (@End - @Start) + 1
    END

    RETURN RTRIM(LTRIM(@HTMLText))
END
Hay un UDF que hará lo que se describe aquí:
Función definida por el usuario para eliminar HTML
-- Removes every <...> tag from @HTMLText and returns what is left with
-- leading/trailing spaces trimmed. NULL input yields NULL.
-- A '<' with no '>' after it ends the scan; that '<' and the text following it
-- are kept as they are.
CREATE FUNCTION [dbo].[udf_StripHTML] (@HTMLText VARCHAR(MAX))
RETURNS VARCHAR(MAX) AS
BEGIN
    DECLARE @Start INT
    DECLARE @End INT

    SET @Start = CHARINDEX('<', @HTMLText)
    SET @End = CHARINDEX('>', @HTMLText, @Start)

    WHILE @Start > 0 AND @End > 0
    BEGIN
        -- Delete the inclusive span from '<' to the first '>' after it.
        SET @HTMLText = STUFF(@HTMLText, @Start, (@End - @Start) + 1, '')
        -- Nothing before @Start contains '<', so resume from there instead of
        -- rescanning the whole string.
        SET @Start = CHARINDEX('<', @HTMLText, @Start)
        SET @End = CHARINDEX('>', @HTMLText, @Start)
    END

    RETURN LTRIM(RTRIM(@HTMLText))
END
GO
Editar: tenga en cuenta que esto es para SQL Server 2005, pero si cambia la palabra clave MAX a algo así como 4000, también funcionará en SQL Server 2000.
Prueba esto. Es una versión modificada de la publicada por RedFilter... este SQL elimina todas las etiquetas excepto BR, B y P, junto con los atributos que las acompañan:
-- Removes HTML tags from @HTMLText but keeps line-break, paragraph and bold
-- tags (with their attributes), then trims leading/trailing spaces.
-- NULL input yields NULL.
--
-- A tag is kept when it starts with '<BR', '<P', '<B' or '</B' (compared in
-- upper case). NOTE(review): this is a prefix test, so other tags sharing those
-- prefixes (e.g. <BODY>, <PRE>) are kept too, and a closing </P> is removed.
CREATE FUNCTION [dbo].[StripHtml] (@HTMLText VARCHAR(MAX))
RETURNS VARCHAR(MAX)
AS
BEGIN
    DECLARE @Start INT
    DECLARE @End INT
    DECLARE @Length INT
    DECLARE @SearchFrom INT

    SET @Start = CHARINDEX('<', @HTMLText)
    SET @End = CHARINDEX('>', @HTMLText, @Start)
    SET @Length = (@End - @Start) + 1

    WHILE @Start > 0 AND @End > 0 AND @Length > 0
    BEGIN
        IF (UPPER(SUBSTRING(@HTMLText, @Start, 3)) <> '<BR') AND (UPPER(SUBSTRING(@HTMLText, @Start, 2)) <> '<P') AND (UPPER(SUBSTRING(@HTMLText, @Start, 2)) <> '<B') AND (UPPER(SUBSTRING(@HTMLText, @Start, 3)) <> '</B')
        BEGIN
            SET @HTMLText = STUFF(@HTMLText, @Start, @Length, '')
            -- The string shrank: whatever followed the tag now sits at @Start.
            SET @SearchFrom = @Start
        END
        ELSE
        BEGIN
            -- Tag kept: continue right after its closing '>'.
            SET @SearchFrom = @End + 1
        END

        SET @Start = CHARINDEX('<', @HTMLText, @SearchFrom)
        SET @End = CHARINDEX('>', @HTMLText, @Start)
        -- Inclusive span from '<' to '>'.
        SET @Length = (@End - @Start) + 1
    END

    RETURN RTRIM(LTRIM(@HTMLText))
END
Si su HTML está bien formado, creo que esta es una mejor solución:
-- Strips markup from well-formed (X)HTML by parsing it as XML and returning the
-- concatenated text content of the document.
--
-- Parameters:
--   @text  markup to clean; it must parse as XML once '&' has been removed,
--          otherwise the conversion to xml raises an error.
-- Returns:
--   The text content as varchar(max).
--
-- Every '&' is deleted before parsing so that HTML-only entities such as &nbsp;
-- do not make the XML parser fail; as a consequence the rest of the entity
-- ("nbsp;") stays in the output as plain text.
create function dbo.StripHTML( @text varchar(max) ) returns varchar(max) as
begin
    declare @textXML xml

    set @textXML = cast(replace(@text, '&', '') as xml)

    -- value('.') on the document yields the string value of all its text nodes.
    return @textXML.value('.', 'varchar(max)')
end
go
select dbo.StripHTML('This <i>is</i> an <b>html</b> test')