RE: select random records but for each category

SSC Guru

Points: 1003863

March 13, 2010 at 8:58 am

Paul White (3/12/2010)
As far as I can see, the previously posted code doesn't meet the requirement to select a minimum of three rows, a maximum of 25, and 10% of the group size otherwise. It is also rather inefficient for larger group sizes, since NEWID() is evaluated for every row, and the entire set must be sorted on that value.
Here is my attempt, which selects the required rows from 100,000 random records (split into 3 groups) in 93ms on my old laptop.
-- Drop our test table if it exists
IF OBJECT_ID(N'tempdb..#Source', N'U')
IS NOT NULL
DROP TABLE #Source;
GO
-- Test table
CREATE TABLE #Source
(
row_id INTEGER IDENTITY PRIMARY KEY,
vehicle_type CHAR(1) NOT NULL,
vehicle_id INTEGER NOT NULL
);
GO
-- Add 100,000 random rows, roughly evenly distributed
-- across vehicle types
INSERT #Source
(
vehicle_type,
vehicle_id
)
SELECT TOP (100000)
CASE
-- Chance of being a car
WHEN RAND(CHECKSUM(NEWID(), C1.[object_id])) <= 0.3333 THEN 'C'
-- Chance of being a truck
WHEN RAND(CHECKSUM(NEWID(), C1.[object_id])) <= 0.3333 THEN 'T'
-- Chance of being an RV
ELSE 'R'
END,
-- Random vehicle id
ABS(CHECKSUM(NEWID()))
FROM master.sys.columns C1,
master.sys.columns C2,
master.sys.columns C3;
GO
-- A helpful index
CREATE INDEX nc1
ON #Source
(vehicle_type ASC, row_id ASC)
INCLUDE (vehicle_id)
WITH (FILLFACTOR = 100, MAXDOP = 1);
GO
-- Show the number of vehicles of each type
SELECT vehicle_type,
row_count = COUNT_BIG(*)
FROM #Source
GROUP BY
vehicle_type
ORDER BY
vehicle_type;
GO
SET STATISTICS TIME ON;
-- Select a random sample of rows from each group
-- Minimum 3 rows, maximum 25, 10% of the group size othewise
WITH SampleSize
AS (
-- Find the total number of vehicles of each type
-- and the sample size to use for each group
-- (3 rows, 25 rows, or 10%)
SELECT vehicle_type,
group_size = COUNT_BIG(*),
sample_size =
CASE
WHEN 0.1 * COUNT_BIG(*) > 25 THEN 25
WHEN 0.1 * COUNT_BIG(*) < 03 THEN 03
ELSE CONVERT(BIGINT, 0.1 * COUNT_BIG(*))
END
FROM #Source
GROUP BY
vehicle_type
),
RandomRows
AS (
-- Generate a random row number for each row required from
-- each group. Maximum rows returned = 25 * group_count
--
-- The TOP (9223372036854775807 (= BIGINT.Max)) is just to
-- force the query plan to calculate the random row number
-- before the final join
SELECT TOP (9223372036854775807)
SS.vehicle_type,
Random.rn
FROM SampleSize SS
CROSS
APPLY (
SELECT TOP (SS.sample_size)
rn = CONVERT(BIGINT, RAND(CHECKSUM(NEWID(), C1.[object_id])) * group_size + 1)
FROM master.sys.columns C1,
master.sys.columns C2
) Random
ORDER BY
SS.vehicle_type,
Random.rn
),
NumberedSourceRows
AS (
-- Number the source rows, restarting the numbering for each group
SELECT S.row_id,
S.vehicle_type,
S.vehicle_id,
rn = ROW_NUMBER() OVER (PARTITION BY S.vehicle_type ORDER BY S.row_id ASC)
FROM #Source S
)
-- Fetch the numbered rows that match the random row number, per group
SELECT NSR.row_id,
NSR.vehicle_type,
NSR.vehicle_id
FROM RandomRows RR
JOIN NumberedSourceRows NSR
ON NSR.vehicle_type = RR.vehicle_type
AND NSR.rn = RR.rn;
SET STATISTICS TIME OFF;
GO
IF OBJECT_ID(N'tempdb..#Source', N'U')
IS NOT NULL
DROP TABLE #Source;
GO
Paul

Nicely done but...

without exceeding 25 records for all.

--Jeff Moden

RBAR is pronounced "ree-bar" and is a "Modenism" for Row-By-Agonizing-Row.
First step towards the paradigm shift of writing Set Based code:
________Stop thinking about what you want to do to a ROW... think, instead, of what you want to do to a COLUMN.

Change is inevitable... Change for the better is not.

Helpful Links:
How to post code problems
How to Post Performance Problems
Create a Tally Function (fnTally)