• Khades (8/31/2012)


    The character that was giving me problems is â (0xE2)

    Here's a revised version of the function I posted above which in addition to checking for normal ASCII characters will remove any other extended characters either in their native form (â) or encoded form (&#x). There are some examples to run at the bottom of the code.

    CREATE FUNCTION [dbo].[svfHTMLEncodeXML]

    (

    @OriginalText VARCHAR(8000)

    )

    RETURNS VARCHAR(8000)

    BEGIN

    DECLARE

    @CleanedText VARCHAR(8000)

    ,@pos INT

    ,@chunk VARCHAR(10)

    SELECT @CleanedText = COALESCE(@CleanedText + '','')

    + CAST(CleanText.CharVal AS VARCHAR(8000))

    FROM

    (

    SELECT

    Result.CharVal

    FROM

    (

    SELECT

    N AS RowNum

    ,(CASE

    WHEN ASCII(SUBSTRING(@OriginalText,t.N,1)) BETWEEN 32 AND 127

    THEN CAST(SUBSTRING(@OriginalText,t.N,1) AS VARCHAR(50))

    WHEN ASCII(SUBSTRING(@OriginalText,t.N,1)) < 32

    THEN NULL

    WHEN ASCII(SUBSTRING(@OriginalText,t.N,1)) >= 127

    THEN NULL

    END)

    AS CharVal

    FROM

    dbo.Tally AS t

    WHERE

    t.N <= LEN(@OriginalText)

    ) AS Result

    WHERE

    Result.CharVal IS NOT NULL

    ) CleanText

    SET @pos = PATINDEX('%[&#x]%',@CleanedText)

    WHILE @pos <> 0

    BEGIN

    SET @chunk = SUBSTRING(@CleanedText,@pos,6)

    SET @CleanedText = REPLACE(@CleanedText,@chunk,'')

    SET @pos = PATINDEX('%[&#x]%',@CleanedText)

    SET @chunk = NULL

    END

    SET @pos = PATINDEX('%[0x]%',@CleanedText)

    WHILE @pos <> 0

    BEGIN

    SET @chunk = SUBSTRING(@CleanedText,@pos,6)

    SET @CleanedText = REPLACE(@CleanedText,@chunk,'')

    SET @pos = PATINDEX('%[0x]%',@CleanedText)

    SET @chunk = NULL

    END

    /* This optional section removes tabs and excess spaces */

    SET @CleanedText = REPLACE(@CleanedText,CHAR(9),' ')-- convert tabs to spaces

    SET @CleanedText =

    REPLACE(

    REPLACE(

    REPLACE(

    @CleanedText

    ,' ',' '+CHAR(7))

    ,CHAR(7)+' ','')

    ,CHAR(7),'')

    SET @CleanedText = REPLACE(@CleanedText,'> <','><')

    RETURN @CleanedText

    /*

    SELECT dbo.svfHTMLEncodeXML('O&xxxx;Brien') -- sub '#x0D' for 'xxxx' to test

    SELECT dbo.svfHTMLEncodeXML('OÇBrien')

    SELECT dbo.svfHTMLEncodeXML('O¢Brien')

    SELECT dbo.svfHTMLEncodeXML('O&xxxx;Brien') -- sub '#x0F' for 'xxxx' to test

    SELECT dbo.svfHTMLEncodeXML('OâBrien')

    */

    /* --this is text with an invisible line break character in it

    SELECT dbo.svfHTMLEncodeXML('O

    Brien')

    */

    /* -- notice the â (0xE2) in the 2nd position which the function will remove

    SELECT CONVERT(XML,dbo.svfHTMLEncodeXML(

    '<âRoot type="demographics">

    <PersonalData>

    <PersonName type="primary">

    <GivenName>John</GivenName>

    <MiddleName>A</MiddleName>

    <FamilyName>Doe</FamilyName>

    </PersonName>

    </PersonalData>

    </Root>'))

    */

    END