Khades (8/31/2012)
The character that was giving me problems is â (0xE2)
Here's a revised version of the function I posted above which in addition to checking for normal ASCII characters will remove any other extended characters either in their native form (â) or encoded form (&#x). There are some examples to run at the bottom of the code.
CREATE FUNCTION [dbo].[svfHTMLEncodeXML]
(
@OriginalText VARCHAR(8000)
)
RETURNS VARCHAR(8000)
BEGIN
DECLARE
@CleanedText VARCHAR(8000)
,@pos INT
,@chunk VARCHAR(10)
SELECT @CleanedText = COALESCE(@CleanedText + '','')
+ CAST(CleanText.CharVal AS VARCHAR(8000))
FROM
(
SELECT
Result.CharVal
FROM
(
SELECT
N AS RowNum
,(CASE
WHEN ASCII(SUBSTRING(@OriginalText,t.N,1)) BETWEEN 32 AND 127
THEN CAST(SUBSTRING(@OriginalText,t.N,1) AS VARCHAR(50))
WHEN ASCII(SUBSTRING(@OriginalText,t.N,1)) < 32
THEN NULL
WHEN ASCII(SUBSTRING(@OriginalText,t.N,1)) >= 127
THEN NULL
END)
AS CharVal
FROM
dbo.Tally AS t
WHERE
t.N <= LEN(@OriginalText)
) AS Result
WHERE
Result.CharVal IS NOT NULL
) CleanText
SET @pos = PATINDEX('%[&#x]%',@CleanedText)
WHILE @pos <> 0
BEGIN
SET @chunk = SUBSTRING(@CleanedText,@pos,6)
SET @CleanedText = REPLACE(@CleanedText,@chunk,'')
SET @pos = PATINDEX('%[&#x]%',@CleanedText)
SET @chunk = NULL
END
SET @pos = PATINDEX('%[0x]%',@CleanedText)
WHILE @pos <> 0
BEGIN
SET @chunk = SUBSTRING(@CleanedText,@pos,6)
SET @CleanedText = REPLACE(@CleanedText,@chunk,'')
SET @pos = PATINDEX('%[0x]%',@CleanedText)
SET @chunk = NULL
END
/* This optional section removes tabs and excess spaces */
SET @CleanedText = REPLACE(@CleanedText,CHAR(9),' ')-- convert tabs to spaces
SET @CleanedText =
REPLACE(
REPLACE(
REPLACE(
@CleanedText
,' ',' '+CHAR(7))
,CHAR(7)+' ','')
,CHAR(7),'')
SET @CleanedText = REPLACE(@CleanedText,'> <','><')
RETURN @CleanedText
/*
SELECT dbo.svfHTMLEncodeXML('O&xxxx;Brien') -- sub '#x0D' for 'xxxx' to test
SELECT dbo.svfHTMLEncodeXML('OÇBrien')
SELECT dbo.svfHTMLEncodeXML('O¢Brien')
SELECT dbo.svfHTMLEncodeXML('O&xxxx;Brien') -- sub '#x0F' for 'xxxx' to test
SELECT dbo.svfHTMLEncodeXML('OâBrien')
*/
/* --this is text with an invisible line break character in it
SELECT dbo.svfHTMLEncodeXML('O
Brien')
*/
/* -- notice the â (0xE2) in the 2nd position which the function will remove
SELECT CONVERT(XML,dbo.svfHTMLEncodeXML(
'<âRoot type="demographics">
<PersonalData>
<PersonName type="primary">
<GivenName>John</GivenName>
<MiddleName>A</MiddleName>
<FamilyName>Doe</FamilyName>
</PersonName>
</PersonalData>
</Root>'))
*/
END