This conversation resparked a curiousity on my part.
The following was tested in a 2k8 R1 environment off my local drives. End result, and if you read the notes you'll see why: No indexing, SELECT INTO. If indexing, fully prebuild it. Test yourself of course, but that's now my rule of thumb.
-- DROP TABLE CoreTable
-- DROP TABLE AggregateMe
CREATE TABLE CoreTable (ctID INT IDENTITY( 1, 1) NOT NULL,
ctNCID INT NULL,
ctAttr1 INT NOT NULL,
ctAttr2 INT NOT NULL,
CONSTRAINT PK_CoreTable PRIMARY KEY CLUSTERED (ctID)
)
GO
CREATE INDEX idx_CoreTable_Attr ON CoreTable (ctAttr1, ctAttr2)
GO
CREATE TABLE AggregateMe (ctID INT NOT NULL,
ctNCID INT NOT NULL,
amSumUpVal DECIMAL(20,4) NULL,
amAvgUpVal DECIMAL(20,4) NULL)
GO
CREATE CLUSTERED INDEX cidx_AggregateMe ON AggregateMe (ctID)
GO
-- Generate some test data:
INSERT INTO CoreTable (ctAttr1, ctAttr2)
SELECT
rn%10 AS Attr1,
rn%20 AS Attr2
FROM
(SELECT TOP 100000
Row_number() OVER (ORDER BY RAND()) AS rn
FROM
master..syscolumns AS s1
CROSS JOIN
master..syscolumns AS s2
CROSS JOIN
master..syscolumns AS s3
) AS drv
UPDATE CoreTable SET ctNCID = 100005 - ctID --(we want it in a different order)
-- TRUNCATE TABLE AggregateMe
INSERT INTO AggregateMe
SELECT
ct.ctID,
ct.ctNCID,
CONVERT( DECIMAL ( 20, 4), CHECKSUM( NEWID())) / CONVERT( DECIMAL ( 20, 4), CHECKSUM( NEWID())) AS amSumUpVal,
CONVERT( DECIMAL ( 20, 4), CHECKSUM( NEWID())) / CONVERT( DECIMAL ( 20, 4), CHECKSUM( NEWID())) AS amAvgUpVal
FROM
CoreTable AS ct
CROSS JOIN
(SELECT 1 AS n UNION ALL SELECT 2 UNION ALL SELECT 3 UNION ALL SELECT 4 UNION ALL SELECT 5
UNION ALL SELECT 6 UNION ALL SELECT 7 UNION ALL SELECT 8 UNION ALL SELECT 9 UNION ALL SELECT 0)
AS drvMultiplier
GO
select top 10 * from CoreTable
SELECT TOP 50 * FROM AggregateMe
GO
-- So, we have our baseline data in place, now, for testing. Run twice.
SET STATISTICS IO, TIME ON
PRINT 'Create Table'
CREATE TABLE #PreBuild (ctID INT, SumVal Decimal(20,4), AvgVal Decimal(20,4))
PRINT 'Insert Into'
INSERT INTO #PreBuild
SELECT
ctID,
SUM( amSumUpVal) AS SumVal,
AVG( amAvgUpVal) AS AvgVal
FROM
AggregateMe AS am
GROUP BY
ctID
HAVING
SUM( AmSumUpVal) > 0
PRINT 'SELECT INTO'
SELECT
ctID,
SUM( amSumUpVal) AS SumVal,
AVG( amAvgUpVal) AS AvgVal
INTO
#SelectInto
FROM
AggregateMe AS am
GROUP BY
ctID
HAVING
SUM( AmSumUpVal) > 0
SET STATISTICS TIME, IO OFF
IF OBJECT_ID( 'tempdb..#PreBuild') IS NOT NULL
DROP TABLE #PreBuild
IF OBJECT_ID( 'tempdb..#SelectInto') IS NOT NULL
DROP TABLE #SelectInto
GO
-- So, in general, no indexes and whatnot on the tables, the SELECT INTO runs at about 10-15ms faster.
-- This is insignificant in general, especially in context of this build.
-- This is only important in hyper-optimization.
-- NEXT!
-- Index discussions.
-- Note the ctNCID is at the exact same selectivity level as ctID on purpose.
-- This is a functionality optimization test, not a real-world scenario. The idea is to prove
-- the underlying theory, without having to include a heuristics discussion.
-- First, building off the clustered index in AggregateMe
SET NOCOUNT ON
PRINT 'CREATE TABLE - CLUSTERED INDEX BEFORE INSERT, Using ctID'
SET STATISTICS TIME ON
CREATE TABLE #PreBuild (ctID INT, SumVal Decimal(20,4), AvgVal Decimal(20,4))
SET STATISTICS TIME OFF
PRINT 'CREATE INDEX - CLUSTERED INDEX BEFORE INSERT, Using ctID'
SET STATISTICS TIME ON
CREATE CLUSTERED INDEX cidx_Prebuild ON #PreBuild (ctID)
SET STATISTICS TIME OFF
PRINT 'INSERT INTO - CLUSTERED INDEX BEFORE INSERT, Using ctID'
SET STATISTICS TIME ON
INSERT INTO #PreBuild
SELECT
ctID,
SUM( amSumUpVal) AS SumVal,
AVG( amAvgUpVal) AS AvgVal
FROM
AggregateMe AS am
GROUP BY
ctID
HAVING
SUM( AmSumUpVal) > 0
SET STATISTICS TIME OFF
PRINT 'CREATE TABLE - CLUSTERED INDEX AFTER INSERT, Using ctID'
SET STATISTICS TIME ON
CREATE TABLE #PreBuild2 (ctID INT, SumVal Decimal(20,4), AvgVal Decimal(20,4))
SET STATISTICS TIME OFF
PRINT 'INSERT INTO - CLUSTERED INDEX AFTER INSERT, Using ctID'
SET STATISTICS TIME ON
INSERT INTO #PreBuild2
SELECT
ctID,
SUM( amSumUpVal) AS SumVal,
AVG( amAvgUpVal) AS AvgVal
FROM
AggregateMe AS am
GROUP BY
ctID
HAVING
SUM( AmSumUpVal) > 0
SET STATISTICS TIME OFF
PRINT 'CREATE INDEX - CLUSTERED INDEX AFTER INSERT, Using ctID'
SET STATISTICS TIME ON
CREATE CLUSTERED INDEX cidx_Prebuild ON #PreBuild2 (ctID)
SET STATISTICS TIME OFF
PRINT 'SELECT INTO'
SET STATISTICS TIME ON
SELECT
ctID,
SUM( amSumUpVal) AS SumVal,
AVG( amAvgUpVal) AS AvgVal
INTO
#SelectInto
FROM
AggregateMe AS am
GROUP BY
ctID
HAVING
SUM( AmSumUpVal) > 0
SET STATISTICS TIME OFF
PRINT 'CREATE INDEX - CLUSTERED INDEX AFTER INSERT, Using ctID'
SET STATISTICS TIME ON
CREATE CLUSTERED INDEX cidx_SelectInto ON #SelectInto (ctID)
SET STATISTICS TIME OFF
IF OBJECT_ID( 'tempdb..#PreBuild') IS NOT NULL
DROP TABLE #PreBuild
IF OBJECT_ID( 'tempdb..#PreBuild2') IS NOT NULL
DROP TABLE #PreBuild2
IF OBJECT_ID( 'tempdb..#SelectInto') IS NOT NULL
DROP TABLE #SelectInto
GO
-- The reason I wrapped the ON/OFF this way is to emphasize the CREATE INDEX having
-- three unique time components involved that must be summed to have the actual full
-- time it took to build the index. In this case ~150 milliseconds, much higher than
-- the extra ~20 milliseconds that it takes to insert into the table if it's pre-built.
-- Since SELECT INTO can't have the clustered pre-built, it takes less time then the
-- prebuilt table with post-built clustered, but is still more time consuming then the
-- full prebuilt.
-- With this test, we've proven that building the clustered index on the temp table
-- before inserting data that is grouped on the clustered index from the source table
-- is faster. Let's test the ctNCID, which we'll index equivalently.
--This code is the exact same as above, just using the ctNCID column instead.
SET NOCOUNT ON
PRINT 'CREATE TABLE - CLUSTERED INDEX BEFORE INSERT, Using ctID'
SET STATISTICS TIME ON
CREATE TABLE #PreBuild (ctNCID INT, SumVal Decimal(20,4), AvgVal Decimal(20,4))
SET STATISTICS TIME OFF
PRINT 'CREATE INDEX - CLUSTERED INDEX BEFORE INSERT, Using ctID'
SET STATISTICS TIME ON
CREATE CLUSTERED INDEX cidx_Prebuild ON #PreBuild (ctNCID)
SET STATISTICS TIME OFF
PRINT 'INSERT INTO - CLUSTERED INDEX BEFORE INSERT, Using ctID'
SET STATISTICS TIME ON
INSERT INTO #PreBuild
SELECT
ctNCID,
SUM( amSumUpVal) AS SumVal,
AVG( amAvgUpVal) AS AvgVal
FROM
AggregateMe AS am
GROUP BY
ctNCID
HAVING
SUM( AmSumUpVal) > 0
SET STATISTICS TIME OFF
PRINT 'CREATE TABLE - CLUSTERED INDEX AFTER INSERT, Using ctID'
SET STATISTICS TIME ON
CREATE TABLE #PreBuild2 (ctNCID INT, SumVal Decimal(20,4), AvgVal Decimal(20,4))
SET STATISTICS TIME OFF
PRINT 'INSERT INTO - CLUSTERED INDEX AFTER INSERT, Using ctID'
SET STATISTICS TIME ON
INSERT INTO #PreBuild2
SELECT
ctNCID,
SUM( amSumUpVal) AS SumVal,
AVG( amAvgUpVal) AS AvgVal
FROM
AggregateMe AS am
GROUP BY
ctNCID
HAVING
SUM( AmSumUpVal) > 0
SET STATISTICS TIME OFF
PRINT 'CREATE INDEX - CLUSTERED INDEX AFTER INSERT, Using ctID'
SET STATISTICS TIME ON
CREATE CLUSTERED INDEX cidx_Prebuild ON #PreBuild2 (ctNCID)
SET STATISTICS TIME OFF
PRINT 'SELECT INTO'
SET STATISTICS TIME ON
SELECT
ctNCID,
SUM( amSumUpVal) AS SumVal,
AVG( amAvgUpVal) AS AvgVal
INTO
#SelectInto
FROM
AggregateMe AS am
GROUP BY
ctNCID
HAVING
SUM( AmSumUpVal) > 0
SET STATISTICS TIME OFF
PRINT 'CREATE INDEX - CLUSTERED INDEX AFTER INSERT, Using ctID'
SET STATISTICS TIME ON
CREATE CLUSTERED INDEX cidx_SelectInto ON #SelectInto (ctNCID)
SET STATISTICS TIME OFF
IF OBJECT_ID( 'tempdb..#PreBuild') IS NOT NULL
DROP TABLE #PreBuild
IF OBJECT_ID( 'tempdb..#PreBuild2') IS NOT NULL
DROP TABLE #PreBuild2
IF OBJECT_ID( 'tempdb..#SelectInto') IS NOT NULL
DROP TABLE #SelectInto
GO
-- Skipping the gory details, the fully pre-built STILL wins.
-- Even though it takes on average about 40ms longer to insert all the data,
-- the subsequent clustered index build takes about 180ms.
-- So, no indexing, go with SELECT INTO. If you need indexing, fully pre-build it.
Never stop learning, even if it hurts. Ego bruises are practically mandatory as you learn unless you've never risked enough to make a mistake.
For better assistance in answering your questions[/url] | Forum Netiquette
For index/tuning help, follow these directions.[/url] |Tally Tables[/url]
Twitter: @AnyWayDBA