July 23, 2013 at 9:27 am
Hi,
I'm importing some XML files into SQL which have an XSD schema which I am validating against. The "issue" I've found is that some of the data in the files is contained within CDATA tags but the corresponding XML element is defined as an xsd:token (from W3Schools - "The token data type also contains characters, but the XML processor will remove line feeds, carriage returns, tabs, leading and trailing spaces, and multiple spaces."). What I noticed was that when I ran a simple XQuery in SQL against the Typed XML variable it was applying the xsd:token rules of removing multiple spaces to the CDATA values. Before I go back to the supplier of the file I just wanted to double check what the correct result should be. See below snippet of code...
CREATEXML SCHEMA COLLECTION dbo.MyTestSchema
AS
N'
<xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema">
<xsd:element name="ROOT">
<xsd:complexType>
<xsd:complexContent>
<xsd:restriction base="xsd:anyType">
<xsd:sequence>
<xsd:element ref="Test" minOccurs="0" maxOccurs="unbounded"/>
</xsd:sequence>
</xsd:restriction>
</xsd:complexContent>
</xsd:complexType>
</xsd:element>
<xsd:element name="Test" type="TestType"/>
<xsd:simpleType name="TestType">
<xsd:restriction base="xsd:token">
</xsd:restriction>
</xsd:simpleType>
</xsd:schema>'
GO
DECLARE@XMLData varchar(MAX) =
'<ROOT>
<Test><![CDATA[0spaces]]></Test>
<Test><![CDATA[1 space]]></Test>
<Test><![CDATA[2 spaces]]></Test>
<Test><![CDATA[3 spaces]]></Test>
</ROOT>'
DECLARE@XML xml = @XMLData
DECLARE@MyTestXML xml(CONTENT dbo.MyTestSchema) = @XMLData
;WITHWithoutSchema AS
(
SELECT
[Test] = NULLIF(T2.n.value('.', 'varchar(10)'), '')
FROM@XML.nodes('/ROOT') AS T1(n)
CROSSAPPLY T1.n.nodes('Test') AS T2(n)
),
WithSchema AS
(
SELECT
[Test] = NULLIF(T2.n.value('.', 'varchar(10)'), '')
FROM@MyTestXML.nodes('/ROOT') AS T1(n)
CROSSAPPLY T1.n.nodes('Test') AS T2(n)
)
SELECT[WithoutSchema] = N.Test, [WithSchema] = Y.Test
FROMWithoutSchema N
INNERJOIN WithSchema Y
ONREPLACE(N.Test, ' ', '') = REPLACE(Y.Test, ' ', '')
GO
DROPXML SCHEMA COLLECTION dbo.MyTestSchema
GO
...the results are...
WithoutSchema WithSchema
------------- ----------
0spaces 0spaces
1 space 1 space
2 spaces 2 spaces
3 spaces 3 spaces
...as you can see, using a non-typed xml variable preserves the spaces in the CDATA text, but using the typed variable (which uses xsd:token) is stripping them out. Should that be happening, I thought xsd was only applied to non-CDATA values? The spaces have a meaning in the data we are loading so if that is correct behaviour I need to raise with the supplier.
Many Thanks!
July 23, 2013 at 10:22 am
The parsing procedure I generally use (Parse XML to Table[/url]) does NOT preserve the spaces, so I made a slightly modified version that gives you the choice. I really don't know if that is normal XML behavior, but this should let you get around that issue.
CREATE PROCEDURE [dbo].[ParseXMLtoTableWithCDATA]
@strXML AS XML
,@rootnode NVARCHAR(255)
,@preserveCDATA BIT
AS
BEGIN
/*
EXEC dbo.ParseXMLtoTableWithCDATA
N'<ROOT>
<Test><![CDATA[0spaces]]></Test>
<Test><![CDATA[1 space]]></Test>
<Test><![CDATA[2 spaces]]></Test>
<Test><![CDATA[3 spaces]]></Test>
</ROOT>'
,'ROOT'
,1
EXEC dbo.ParseXMLtoTableWithCDATA
N'<ROOT>
<Test><![CDATA[0spaces]]></Test>
<Test><![CDATA[1 space]]></Test>
<Test><![CDATA[2 spaces]]></Test>
<Test><![CDATA[3 spaces]]></Test>
</ROOT>'
,'ROOT'
,0
EXEC dbo.ParseXMLtoTableWithCDATA
'<items>
<item id="0001" type="Donut">
<name><![CDATA[Cake]]></name>
<ppu><![CDATA[0.55]]></ppu>
<batter id="1001"><![CDATA[Regular]]></batter>
<batter id="1002"><![CDATA[Chocolate]]></batter>
<batter id="1003"><![CDATA[Blueberry]]></batter>
<topping id="5001"><![CDATA[None]]></topping>
<topping id="5002"><![CDATA[Glazed Donut]]></topping>
<topping id="5005"><![CDATA[Sugar Cookie]]></topping>
<topping id="5006"><![CDATA[Sprinkles]]></topping>
<topping id="5003"><![CDATA[Chocolate Cup Cake]]></topping>
<topping id="5004"><![CDATA[Maple Syrup]]></topping>
</item>
</items>'
,'items'
,0
EXEC dbo.ParseXMLtoTableWithCDATA
'<items>
<item id="0001" type="Donut">
<name><![CDATA[Cake]]></name>
<ppu><![CDATA[0.55]]></ppu>
<batter id="1001"><![CDATA[Regular]]></batter>
<batter id="1002"><![CDATA[Chocolate]]></batter>
<batter id="1003"><![CDATA[Blueberry]]></batter>
<topping id="5001"><![CDATA[None]]></topping>
<topping id="5002"><![CDATA[Glazed Donut]]></topping>
<topping id="5005"><![CDATA[Sugar Cookie]]></topping>
<topping id="5006"><![CDATA[Sprinkles]]></topping>
<topping id="5003"><![CDATA[Chocolate Cup Cake]]></topping>
<topping id="5004"><![CDATA[Maple Syrup]]></topping>
</item>
</items>'
,'items'
,1
*/
SET NOCOUNT ON
DECLARE
@strText AS NVARCHAR(MAX)
,@idoc INT
,@id INT
,@parentid INT
IF OBJECT_ID('tempdb..#ChildList') IS NOT NULL
DROP TABLE #ChildList
CREATE TABLE #ChildList (
[RowNum] INT IDENTITY(1,1) NOT NULL,
[parentid] INT NULL,
[id] INT NULL,
PRIMARY KEY (RowNum))
IF OBJECT_ID('tempdb..#NodeList') IS NOT NULL
DROP TABLE #NodeList
CREATE TABLE #NodeList (
[RowNum] INT NOT NULL,
[id] INT NULL,
[parentid] INT NULL,
[nodetype] INT NULL,
[localname] NVARCHAR(MAX) NULL,
[text] NVARCHAR(MAX) NULL,
PRIMARY KEY (RowNum))
SET @id = 1
SET @parentid = NULL
IF @preserveCDATA <> 0
SET @preserveCDATA = 1
/* Get rid of tabs and extra spaces */
SET @strText = CAST(@strXML AS NVARCHAR(MAX))
IF @preserveCDATA = 0
BEGIN
SET @strText =
REPLACE(
REPLACE(
REPLACE(
REPLACE(
@strText
,' ',' '+CHAR(7))
,CHAR(7)+' ','')
,CHAR(7),'')
,CHAR(9),' ')
END
SET @strXML = CONVERT(XML,@strText)
/* Validate the XML */
EXEC sp_xml_preparedocument @idoc OUTPUT, @strXML
/* Parse the XML data */
;WITH cteChildren (parentid, id)
AS (
SELECT
CAST(p1.parentid AS INT) AS parentid
,CAST(p1.id AS INT) AS id
FROM
OPENXML (@idoc,@rootnode,2) AS p1
UNION ALL
SELECT
CAST(p2.parentid AS INT) AS parentid
,CAST(p2.id AS INT) AS id
FROM
OPENXML (@idoc,@rootnode,2) AS p2
INNER JOIN
cteChildren AS cte
ON cte.id = p2.ParentID
WHERE
p2.parentid = @parentid
)
INSERT INTO #ChildList
SELECT
parentid
,id
FROM cteChildren
INSERT INTO #NodeList
SELECT
#ChildList.RowNum
,xmllist.id
,xmllist.parentid
,xmllist.nodetype
,xmllist.localname
,CAST(xmllist.[text] AS NVARCHAR(MAX)) AS [text]
FROM #ChildList
INNER JOIN
OPENXML (@idoc,@rootnode,2) AS xmllist
ON #ChildList.id = xmllist.id
WHERE
#ChildList.RowNum > 0
/* Display the results */
IF OBJECT_ID('dbo.XML_Nodes') IS NOT NULL
DROP TABLE dbo.XML_Nodes
;WITH RecursiveNodes(RowNum,id,parentid,nodepath,localname,[text],nodetype)
AS (
SELECT
#NodeList.RowNum
,#NodeList.id
,#NodeList.parentid
,CAST('/' + REPLACE(REPLACE(REPLACE(REPLACE(#NodeList.localname,'&',''),'?',''),' ',''),'.','') AS NVARCHAR(255)) AS nodepath
,#NodeList.localname
,CAST(#NodeList.[text] AS NVARCHAR(MAX)) AS [text]
,0 AS nodetype
FROM #ChildList
INNER JOIN
#NodeList
ON #ChildList.id = #NodeList.id
WHERE
#NodeList.parentid IS NULL
AND #ChildList.RowNum > 0
AND #NodeList.RowNum > 0
UNION ALL
SELECT
n.RowNum
,n.id
,n.parentid
,CAST(r.nodepath + '/'+ REPLACE(REPLACE(REPLACE(REPLACE(n.localname,'&',''),'?',''),' ',''),'.','') AS NVARCHAR(255)) AS nodepath
,n.localname
,n.[text]
,n.nodetype
FROM #NodeList AS n
INNER JOIN
RecursiveNodes AS r
ON n.parentid = r.id
WHERE
n.RowNum > 0
AND r.RowNum > 0
AND n.parentid >= 0
)
SELECT
ROW_NUMBER() OVER (ORDER BY Result.RowNum) AS RowNum
,Result.id
,Result.parentid
,Result.nodepath
,Result.nodetype
,Result.nodename
,Result.property
,Result.value
,Result.nodecontents
INTO dbo.XML_Nodes
FROM
(
SELECT
rn.RowNum
,rn.id
,rn.parentid
,rn.nodepath
,(CASE
WHEN rn.nodetype = 0 THEN 'Root'
WHEN rn.nodetype = 1 THEN 'Node'
WHEN rn.nodetype = 2 THEN 'Property'
ELSE 'Data'
END) AS nodetype
,(CASE
WHEN rn.nodetype = 0 THEN rn.localname
WHEN rn.nodetype = 1 THEN rn.localname
WHEN rn.nodetype = 2 THEN (SELECT TOP(1) localname FROM RecursiveNodes WHERE id = rn.parentid)
ELSE NULL
END) AS nodename
,(CASE
WHEN rn.nodetype = 2 THEN rn.localname
ELSE NULL
END) AS property
,(CASE
WHEN rn.nodetype = 2 THEN (SELECT TOP(1) [text] FROM RecursiveNodes WHERE parentid = rn.id)
ELSE NULL
END) AS value
,(CASE
WHEN rn.nodetype = 1 THEN (SELECT TOP(1) [text] FROM RecursiveNodes WHERE parentid = rn.id)
WHEN rn.nodetype = 2 THEN (SELECT TOP(1) [text] FROM RecursiveNodes WHERE parentid = rn.parentid and [text] is not null)
ELSE NULL
END) AS nodecontents
FROM
RecursiveNodes AS rn
WHERE
rn.localname <> '#text'
) AS Result
WHERE
Result.id >= 0
AND (Result.id = 0
OR property IS NOT NULL
OR value IS NOT NULL
OR nodecontents IS NOT NULL)
SELECT
RowNum
,id
,parentid
,nodepath
,nodetype
,nodename
,property
,value
,nodecontents
FROM
dbo.XML_Nodes
END
Viewing 2 posts - 1 through 2 (of 2 total)
You must be logged in to reply to this topic. Login to reply
This website stores cookies on your computer.
These cookies are used to improve your website experience and provide more personalized services to you, both on this website and through other media.
To find out more about the cookies we use, see our Privacy Policy