Sequence and Pivot based on pattern or islands

Question

Post reply

Sequence and Pivot based on pattern or islands

richardm 53436

SSC Rookie

Points: 36
More actions
March 10, 2016 at 4:15 am

#325615

Hi all
Apologies for the subject not sure how best to describe what I am hoping to achieve.
I have used ROW_NUMBER() OVER PARTITION on numerous occasions to sequence by a date field and was thinking that would be the starting block for the following:
I have created test data as an example and was hoping somebody could find a solution to both parts even if you don't need to use part1 to get to part2 (which is what I am really after).
Picture a Sales Order table at line level, where most items will generally be a single line (these I am not interested in and can be excluded), but there can be kits with a header and any number of components and the only thing linking them is that the components will directly follow their header when added to the Sales Order.
The Part field will denote the Header as H, Component as C and any normal item as nothing i.e. ''
So if we do WHERE Part IN ('H','C') we are left with a pattern / island of H's and C's where each H would be the boundary of a kit, like the following example:
I was thinking we would need to partition by Order, then the Part Islands to sequence the number of kits on each Order, ideally in order of the table ROW_NUMBER() but failing that the OrderLine.
Ideally though I would like to pivot all components to their actual header as follows:
Are the above 2 examples possible?
Below is the basic code to build the data
CREATE TABLE dbo.Table1
(
Order int,OrderLine int,ProductCode int,Part Char(1)
);
INSERT INTO Table1
VALUES
(1, 1,1000,'H'),(1, 2,98,'C'),(1, 3,99,'C'),(1, 7,2000,'H'),(1,8,70,'C'),
(1,9,71,'C'),(1,10,72,'C'),(1,15,3000,'H'),(1,16,50,'C'),(4,3,9000,'H'),
(4,4,10,'C'),(8,1,1000,'H'),(8,2,98,'C'),(8,3,99,'C'),(8,4,9000,'H'),
(8,5,10,'C'),(9,1,6000,'')
Thanks in advance

Viewing 10 posts - 1 through 10 (of 10 total)

You must be logged in to reply to this topic. Login to reply

J Livingston SQL SSC Guru Points: 51550 More actions · Answer 1

one idea....???

WITH cte as (

SELECT OrderNo,

OrderLine,

CASE WHEN part = 'H' THEN productcode ELSE NULL END as Headerproductcode,

CASE WHEN part = 'C' THEN productcode ELSE NULL END as Componentproductcode,

ROW_NUMBER() OVER(ORDER BY orderno, orderline) as rn

FROM table1

),

-- ref http://sqlmag.com/t-sql/last-non-null-puzzle for following

cte2 as (

SELECT OrderNo,

OrderLine,

CAST(SUBSTRING(MAX(CAST(rn AS BINARY(4)) + CAST(Headerproductcode AS BINARY(4))) OVER(ORDER BY rn ROWS UNBOUNDED PRECEDING), 5, 4) AS INT) AS Headerproductcode,

Componentproductcode,

rn

FROM cte

)

SELECT OrderNo,

OrderLine,

Headerproductcode ,

Componentproductcode

FROM cte2

WHERE(NOT(Componentproductcode IS NULL))

ORDER BY rn

________________________________________________________________
you can lead a user to data....but you cannot make them think
and remember....every day is a school day

sgmunson SSC Guru Points: 110651 More actions · Answer 2

I don't have a 2012 instance handy, or I could have tried using LEAD or LAG with some GROUP BYs, but here's a query that only uses 2008 syntax that produces the right results. It doesn't look like a good performer though...

CREATE TABLE #Table1 (

[Order] int NOT NULL,

OrderLine int NOT NULL,

ProductCode int,

Part char(1),

PRIMARY KEY CLUSTERED (

[Order] ASC,

OrderLine ASC

)

);

INSERT INTO #Table1 ([Order], OrderLine, ProductCode, Part)

VALUES (1, 1,1000,'H'),

(1, 2,98,'C'),

(1, 3,99,'C'),

(1, 7,2000,'H'),

(1,8,70,'C'),

(1,9,71,'C'),

(1,10,72,'C'),

(1,15,3000,'H'),

(1,16,50,'C'),

(4,3,9000,'H'),

(4,4,10,'C'),

(8,1,1000,'H'),

(8,2,98,'C'),

(8,3,99,'C'),

(8,4,9000,'H'),

(8,5,10,'C'),

(9,1,6000,'');

CREATE NONCLUSTERED INDEX IX_Table1_Part ON #Table1

(

Part ASC

);

WITH HEADERS AS (

SELECT *,

ROW_NUMBER() OVER(PARTITION BY T.[Order] ORDER BY T.OrderLine) AS ROW_NUM

FROM #Table1 AS T

WHERE T.Part = 'H'

),

COMPONENTS AS (

SELECT *

FROM #Table1 AS T

WHERE T.Part = 'C'

)

SELECT H.[Order], C.OrderLine, H.ProductCode AS HeaderProductCode, C.ProductCode AS ComponentProductCode,

ISNULL((SELECT MIN(H2.OrderLine) FROM HEADERS AS H2 WHERE H2.ROW_NUM > H.ROW_NUM), 1) AS THE_ISNULL

FROM HEADERS AS H

INNER JOIN COMPONENTS AS C

ON H.[Order] = C.[Order]

AND H.OrderLine < C.OrderLine

AND C.OrderLine < ISNULL((SELECT MIN(H2.OrderLine) FROM HEADERS AS H2 WHERE H2.[Order] = H.[Order] AND H2.ROW_NUM > H.ROW_NUM),

(SELECT MAX(C2.OrderLine) FROM COMPONENTS AS C2 WHERE C2.[Order] = H.[Order]) + 1)

ORDER BY H.[Order], C.OrderLine;

DROP TABLE #Table1;

Steve (aka sgmunson) 🙂 🙂 🙂
Rent Servers for Income (picks and shovels strategy)

J Livingston SQL SSC Guru Points: 51550 More actions · Answer 3

sgmunson (3/10/2016)
I don't have a 2012 instance handy, or I could have tried using LEAD or LAG with some GROUP BYs, but here's a query that only uses 2008 syntax that produces the right results. It doesn't look like a good performer though...
CREATE TABLE #Table1 (
[Order] int NOT NULL,
OrderLine int NOT NULL,
ProductCode int,
Part char(1),
PRIMARY KEY CLUSTERED (
[Order] ASC,
OrderLine ASC
)
);
INSERT INTO #Table1 ([Order], OrderLine, ProductCode, Part)
VALUES (1, 1,1000,'H'),
(1, 2,98,'C'),
(1, 3,99,'C'),
(1, 7,2000,'H'),
(1,8,70,'C'),
(1,9,71,'C'),
(1,10,72,'C'),
(1,15,3000,'H'),
(1,16,50,'C'),
(4,3,9000,'H'),
(4,4,10,'C'),
(8,1,1000,'H'),
(8,2,98,'C'),
(8,3,99,'C'),
(8,4,9000,'H'),
(8,5,10,'C'),
(9,1,6000,'');
CREATE NONCLUSTERED INDEX IX_Table1_Part ON #Table1
(
Part ASC
);
WITH HEADERS AS (
SELECT *,
ROW_NUMBER() OVER(PARTITION BY T.[Order] ORDER BY T.OrderLine) AS ROW_NUM
FROM #Table1 AS T
WHERE T.Part = 'H'
),
COMPONENTS AS (
SELECT *
FROM #Table1 AS T
WHERE T.Part = 'C'
)
SELECT H.[Order], C.OrderLine, H.ProductCode AS HeaderProductCode, C.ProductCode AS ComponentProductCode,
ISNULL((SELECT MIN(H2.OrderLine) FROM HEADERS AS H2 WHERE H2.ROW_NUM > H.ROW_NUM), 1) AS THE_ISNULL
FROM HEADERS AS H
INNER JOIN COMPONENTS AS C
ON H.[Order] = C.[Order]
AND H.OrderLine < C.OrderLine
AND C.OrderLine < ISNULL((SELECT MIN(H2.OrderLine) FROM HEADERS AS H2 WHERE H2.[Order] = H.[Order] AND H2.ROW_NUM > H.ROW_NUM),
(SELECT MAX(C2.OrderLine) FROM COMPONENTS AS C2 WHERE C2.[Order] = H.[Order]) + 1)
ORDER BY H.[Order], C.OrderLine;
DROP TABLE #Table1;

Hi Steve......would be really interested to see your thoughts on LEAD/LAG if you have chance.

Have put together a basic test rig

use tempdb

go

--DROP TABLE #table1;

--DROP TABLE #gen1;

--DROP TABLE #gen2;

SELECT TOP 10000 IDENTITY(INT, 1, 1) AS OrderNo

into #gen1

FROM sys.all_columns ac1

CROSS JOIN sys.all_columns ac2

CROSS JOIN sys.all_columns ac3

CREATE TABLE #gen2(

Orderline INTEGER NOT NULL PRIMARY KEY

,part VARCHAR(1) NOT NULL

);

INSERT INTO #gen2(Orderline,part) VALUES

(1,'H'),(2,'C'),(3,'C'),(4,'C'),(5,'H')

,(6,'C'),(7,'H'),(8,'C'),(9,'C'),(10,'C');

SELECT #gen1.OrderNo,

#gen2.Orderline,

1 + CAST(ABS(CHECKSUM(NEWID()) % 9000) AS INT) AS ProductCode,

#gen2.part

INTO #Table1

FROM #gen1 CROSS JOIN #gen2;

DELETE FROM #Table1

WHERE (OrderNo % 5 = 0) AND (Orderline > 2)

DELETE FROM #Table1

WHERE (OrderNo % 3 = 0) AND (Orderline >6)

CREATE CLUSTERED INDEX [PK] ON [dbo].[#Table1]

([OrderNo] ASC,[Orderline] ASC);

CREATE NONCLUSTERED INDEX IX_Table1_Part ON #Table1

(Part ASC);

select count(*) from #table1

your code and mine in comparison

PRINT '+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++'

PRINT 'sgmunson results below ';

PRINT '+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++'

PRINT ''

set statistics time, io on;

WITH HEADERS AS (

SELECT *,

ROW_NUMBER() OVER(PARTITION BY T.OrderNo ORDER BY T.OrderLine) AS ROW_NUM

FROM #Table1 AS T

WHERE T.Part = 'H'

),

COMPONENTS AS (

SELECT *

FROM #Table1 AS T

WHERE T.Part = 'C'

)

SELECT H.OrderNo, C.OrderLine, H.ProductCode AS HeaderProductCode, C.ProductCode AS ComponentProductCode,

ISNULL((SELECT MIN(H2.OrderLine) FROM HEADERS AS H2 WHERE H2.ROW_NUM > H.ROW_NUM), 1) AS THE_ISNULL

FROM HEADERS AS H

INNER JOIN COMPONENTS AS C

ON H.OrderNo = C.OrderNo

AND H.OrderLine < C.OrderLine

AND C.OrderLine < ISNULL((SELECT MIN(H2.OrderLine) FROM HEADERS AS H2 WHERE H2.OrderNo = H.OrderNo AND H2.ROW_NUM > H.ROW_NUM),

(SELECT MAX(C2.OrderLine) FROM COMPONENTS AS C2 WHERE C2.OrderNo = H.OrderNo) + 1)

ORDER BY H.OrderNo, C.OrderLine;

set statistics time, io off;

PRINT ''

PRINT '+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++'

PRINT 'JLS results below ';

PRINT '+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++'

set statistics time, io on;

WITH cte as (

SELECT OrderNo,

OrderLine,

CASE WHEN part = 'H' THEN productcode ELSE NULL END as Headerproductcode,

CASE WHEN part = 'C' THEN productcode ELSE NULL END as Componentproductcode,

ROW_NUMBER() OVER(ORDER BY orderno, orderline) as rn

FROM #table1

),

-- ref http://sqlmag.com/t-sql/last-non-null-puzzle for following

cte2 as (

SELECT OrderNo,

OrderLine,

CAST(SUBSTRING(MAX(CAST(rn AS BINARY(4)) + CAST(Headerproductcode AS BINARY(4))) OVER(ORDER BY rn ROWS UNBOUNDED PRECEDING), 5, 4) AS INT) AS Headerproductcode,

Componentproductcode,

rn

FROM cte

)

SELECT OrderNo,

OrderLine,

Headerproductcode ,

Componentproductcode

FROM cte2

WHERE(NOT(Componentproductcode IS NULL))

ORDER BY rn

set statistics time, io off;

--DROP TABLE #table1;

--DROP TABLE #gen1;

--DROP TABLE #gen2;

________________________________________________________________
you can lead a user to data....but you cannot make them think
and remember....every day is a school day

drew.allen SSC Guru Points: 77014 More actions · Answer 4

sgmunson (3/10/2016)
I don't have a 2012 instance handy, or I could have tried using LEAD or LAG with some GROUP BYs, but here's a query that only uses 2008 syntax that produces the right results. It doesn't look like a good performer though...

LEAD/LAG tend not to work well in cases like this, because you have to specify a static offset and it becomes unwieldy when the offset can vary. It's easier to use FIRST_VALUE/LAST_VALUE when the offset can vary.

Drew

J. Drew Allen
Business Intelligence Analyst
Philadelphia, PA

sgmunson SSC Guru Points: 110651 More actions · Answer 5

drew.allen (3/10/2016)
sgmunson (3/10/2016)
I don't have a 2012 instance handy, or I could have tried using LEAD or LAG with some GROUP BYs, but here's a query that only uses 2008 syntax that produces the right results. It doesn't look like a good performer though...
LEAD/LAG tend not to work well in cases like this, because you have to specify a static offset and it becomes unwieldy when the offset can vary. It's easier to use FIRST_VALUE/LAST_VALUE when the offset can vary.
Drew

I don't recall, but can you specify a field value to be the offset? Even if you can't, I can think of another way to get around it, but once you do that, it might even not perform as well as what I've already coded.

Steve (aka sgmunson) 🙂 🙂 🙂
Rent Servers for Income (picks and shovels strategy)

richardm 53436 SSC Rookie Points: 36 More actions · Answer 6

Thanks for the replies!

I tried J Livingston first CTE method and while that worked perfect on the dataset provided I didn't realise the datatype of the Headerproductcode and the Componentproductcode would be fundamental to the solution, they are both actually char(30) and I just used INT for simplicity and to highlight the example.

I have briefly spent some time reading the SQLmag puzzle article and tried playing around with the CTE2 Max() line, changing the datatypes etc but could not achieve desired results.

I am not familiar with creating non clustered indexes on our application database and so I have not tried any of the other methods provided. Does the code reference these indexes or would it work without creating them but just not run as fast?

I was also hoping (only if possible) to be able to save this as a View but I am not sure that would be achievable with the CTE solution and all of the Order by clauses.

Thanks in advance and sorry for not being accurate.

J Livingston SQL SSC Guru Points: 51550 More actions · Answer 7

richardm 53436 (3/11/2016)
Thanks for the replies!
I tried J Livingston first CTE method and while that worked perfect on the dataset provided I didn't realise the datatype of the Headerproductcode and the Componentproductcode would be fundamental to the solution, they are both actually char(30) and I just used INT for simplicity and to highlight the example.
I have briefly spent some time reading the SQLmag puzzle article and tried playing around with the CTE2 Max() line, changing the datatypes etc but could not achieve desired results.
I am not familiar with creating non clustered indexes on our application database and so I have not tried any of the other methods provided. Does the code reference these indexes or would it work without creating them but just not run as fast?
I was also hoping (only if possible) to be able to save this as a View but I am not sure that would be achievable with the CTE solution and all of the Order by clauses.
Thanks in advance and sorry for not being accurate.

try this line if you have Productcode as Char(30)

CAST(SUBSTRING(MAX(CAST(rn AS BINARY(4)) + CAST(Headerproductcode AS VARBINARY(30))) OVER(ORDER BY rn ROWS UNBOUNDED PRECEDING), 5, 30) AS CHAR(30)) AS Headerproductcode,

________________________________________________________________
you can lead a user to data....but you cannot make them think
and remember....every day is a school day

drew.allen SSC Guru Points: 77014 More actions · Answer 8

sgmunson (3/10/2016)
drew.allen (3/10/2016)
sgmunson (3/10/2016)
I don't have a 2012 instance handy, or I could have tried using LEAD or LAG with some GROUP BYs, but here's a query that only uses 2008 syntax that produces the right results. It doesn't look like a good performer though...
LEAD/LAG tend not to work well in cases like this, because you have to specify a static offset and it becomes unwieldy when the offset can vary. It's easier to use FIRST_VALUE/LAST_VALUE when the offset can vary.
Drew
I don't recall, but can you specify a field value to be the offset? Even if you can't, I can think of another way to get around it, but once you do that, it might even not perform as well as what I've already coded.

No, you cannot. That is why I specifically said "static offset". For another problem, I tried using a tally table for the offset, and it wouldn't let me do it.

Drew

J. Drew Allen
Business Intelligence Analyst
Philadelphia, PA

J Livingston SQL SSC Guru Points: 51550 More actions · Answer 9

just to give you an idea...here's some sample data and a view using productcode as char

--generate a test table

use tempdb

go

--DROP TABLE test;

--DROP TABLE #gen1;

--DROP TABLE #gen2;

--DROP VIEW testview;

SELECT TOP 10000 IDENTITY(INT, 1, 1) AS OrderNo

into #gen1

FROM sys.all_columns ac1

CROSS JOIN sys.all_columns ac2

CROSS JOIN sys.all_columns ac3

CREATE TABLE #gen2(

Orderline INT NULL

,part VARCHAR(1) NULL

);

INSERT INTO #gen2(Orderline,part) VALUES

(1,'H'),(2,'C'),(3,'C'),(4,'C'),(5,'H')

,(6,'C'),(7,'H'),(8,'C'),(9,'C'),(10,'C');

CREATE TABLE test

(OrderNo INT NULL,

OrderLine INT NULL,

ProductCode CHAR(30) NULL,

Part CHAR (1) NULL

);

DECLARE @StringMaxLen int = 30

INSERT INTO test

(OrderNo

,OrderLine

,ProductCode

,Part)

SELECT #gen1.OrderNo,

#gen2.Orderline,

LEFT (REPLACE(CAST (NEWID () AS CHAR(256)),'-','') , ABS (CHECKSUM (NEWID ())) % @StringMaxLen + 1) AS ProductCode,

#gen2.part

FROM #gen1 CROSS JOIN #gen2;

-- skew no of rows

DELETE FROM test

WHERE (OrderNo % 5 = 0) AND (Orderline > 2)

DELETE FROM test

WHERE (OrderNo % 3 = 0) AND (Orderline >6)

-- see what we have got

SELECT OrderNo, OrderLine, ProductCode, Part

FROM test

ORDER BY OrderNo, OrderLine

create a view

CREATE VIEW testview

AS

WITH cte as (

SELECT OrderNo,

OrderLine,

CASE WHEN part = 'H' THEN productcode ELSE NULL END as Headerproductcode,

CASE WHEN part = 'C' THEN productcode ELSE NULL END as Componentproductcode,

ROW_NUMBER() OVER(ORDER BY orderno, orderline) as rn

FROM test

),

cte2 as (

SELECT OrderNo,

OrderLine,

CAST(SUBSTRING(MAX(CAST(rn AS BINARY(4)) + CAST(Headerproductcode AS VARBINARY(30))) OVER(ORDER BY rn ROWS UNBOUNDED PRECEDING), 5, 30) AS CHAR(30)) AS Headerproductcode,

Componentproductcode,

rn

FROM cte

)

SELECT OrderNo,

OrderLine,

Headerproductcode ,

Componentproductcode

FROM cte2

WHERE(NOT(Componentproductcode IS NULL))

GO

--see results and compare

SELECT OrderNo, OrderLine, ProductCode, Part

FROM test

--WHERE (OrderNo = 1001)

ORDER BY OrderNo, OrderLine

SELECT OrderNo, OrderLine, Headerproductcode, Componentproductcode

FROM testview

--WHERE (OrderNo = 1001)

ORDER BY OrderNo, OrderLine

________________________________________________________________
you can lead a user to data....but you cannot make them think
and remember....every day is a school day