Technical Article

Comparing Stored Procedures, Part 5

,

This picks up from an earlier article on trying to quantify the comparison of two stored procedures. This script creates a stored procedure which contains a critical code optimization to an earlier version..

For a more complete explanation, please go to my blog entry here.

Written by Jesse McLain

jesse@jessemclain.com

www.jessemclain.com

http://jessesql.blogspot.com

/*************************************************************************************************
**        Proc: "spd_SequenceCompare_Version2"
**        Desc: This stored procedure will compare the items in two sequences. The sequences should
**              be stored in two tables, Seq1 and Seq2, which should have the following structure:
**              
**              CREATE TABLE Seq1 (
**                CodeLineTxt varchar(max), /* stores the original line of code */**                CodeLineNum int not null identity(1,1), /* stores the line number */**                MatchLineNum int /* stores the matching line of code from spd #2 */**              )
**              
**              This spd will compare smaller and smaller subsequences in the tables, and when it
**              finds a match, it will note it in the column MatchLineNum. 
**              
**              
**              
**        Return values: none
** 
**        Called by:   
**              
**        Parameters:
**        Input
**        ----------
**        none
**
**        Output
**        -----------
**        none
**
**        Auth:  Jesse McLain
**        Email: jesse@jessemclain.com
**        Web:   www.jessemclain.com
**        Blog:  http://jessesql.blogspot.com/2009/02/comparing-spds-part-5-optimization.html
**
**        Date:  02/16/2008
**
***************************************************************************************************
**        Change History
***************************************************************************************************
**        Date:        Author:                Description:
**        --------    --------            -------------------------------------------
**        20080216    Jesse McLain        Created script
**        20080223    Jesse McLain        Added CTE code to better determine @CmprSz value
***************************************************************************************************
**************************************************************************************************/
CREATE PROCEDURE spd_SequenceCompare_Version2 AS

SET NOCOUNT ON

DECLARE @CmprSz int
DECLARE @idx1 int
DECLARE @idx2 int
DECLARE @idx2_last int
DECLARE @idx3 int
DECLARE @NumCmprs int
DECLARE @cdtxt1 varchar(max); 
DECLARE @AllMatch tinyint; 
DECLARE @Seq1Size int;        SELECT @Seq1Size = COUNT(*) FROM Seq1
DECLARE @Seq2Size int;        SELECT @Seq2Size = COUNT(*) FROM Seq1

/* 2/17/09 - I'm going to try out several enhancements to improve performance of this
spd. First I'm going to run a pre-flight matching between the members of the two
sequences, and then omit any subsequence checks with non-matching members. What this 
would also allow is lowering the 'compare size' (@CmpSz) to the length of the maximum 
subsequence without any non-matches (actually the lesser of the max between the two
sequences - CORRECTION - the max lens of the matching subsequences will be the same
between the 2 main sequences). This is where our greatest cost savings will be. */


/* first we'll update each sequence so that the MatchLineNum column for every value
of CodeLineTxt will indicate whether there exists a match in the other sequence. */UPDATE Seq1 
SET MatchLineNum = CASE 
    WHEN EXISTS (SELECT 1 FROM Seq2 WHERE Seq2.CodeLineTxt = Seq1.CodeLineTxt) THEN -1 
    ELSE 0 END

UPDATE Seq2
SET MatchLineNum = CASE 
    WHEN EXISTS (SELECT 1 FROM Seq1 WHERE Seq1.CodeLineTxt = Seq2.CodeLineTxt) THEN -1 
    ELSE 0 END



/* 2/23/09 - this block of CTE code will determine the max lengths of subsequences of
the main sequences containing matching items in the other sequence. Previously we would
set @CmprSz to the size of the smaller of the two sequences. What we're doing now is
checking all of the possible matching subsequences (by checking for matches above in
a one-to-one fashion), taking the size of the longest possible subsequence

this page helped with the CTE code: http://www.sqlservercentral.com/articles/T-SQL/61461/ */
;WITH 
Seq1starts AS (
    SELECT CodeLineNum 
    FROM Seq1 S1
    WHERE MatchLineNum = -1
    AND NOT EXISTS (SELECT * FROM Seq1 S2 
        WHERE MatchLineNum = -1
        AND S2.CodeLineNum = S1.CodeLineNum - 1
    )
),
Seq1ends AS (
    SELECT CodeLineNum = CodeLineNum
    FROM Seq1 S1
    WHERE MatchLineNum = -1
    AND NOT EXISTS (SELECT * FROM Seq1 S2 
        WHERE MatchLineNum = -1
        AND S2.CodeLineNum = S1.CodeLineNum + 1
    )
),
Seq1sequences AS (
    SELECT 
        SeqStartsAt = Seq1starts.CodeLineNum,
        SeqEndsAt = (SELECT MIN(Seq1ends.CodeLineNum) 
            FROM Seq1ends 
            WHERE Seq1ends.CodeLineNum >= Seq1starts.CodeLineNum)
    FROM Seq1starts
),
Seq1maxlenseq AS (
    SELECT MaxLen = MAX(SeqEndsAt - SeqStartsAt + 1)
    FROM Seq1sequences
),
Seq2starts AS (
    SELECT CodeLineNum 
    FROM Seq2 S1
    WHERE MatchLineNum = -1
    AND NOT EXISTS (SELECT * FROM Seq2 S2 
        WHERE MatchLineNum = -1
        AND S2.CodeLineNum = S1.CodeLineNum - 1
    )
),
Seq2ends AS (
    SELECT CodeLineNum = CodeLineNum
    FROM Seq2 S1
    WHERE MatchLineNum = -1
    AND NOT EXISTS (SELECT * FROM Seq2 S2 
        WHERE MatchLineNum = -1
        AND S2.CodeLineNum = S1.CodeLineNum + 1
    )
),
Seq2sequences AS (
    SELECT 
        SeqStartsAt = Seq2starts.CodeLineNum,
        SeqEndsAt = (SELECT MIN(Seq2ends.CodeLineNum) 
            FROM Seq2ends 
            WHERE Seq2ends.CodeLineNum >= Seq2starts.CodeLineNum)
    FROM Seq2starts
),
Seq2maxlenseq AS (
    SELECT MaxLen = MAX(SeqEndsAt - SeqStartsAt + 1)
    FROM Seq2sequences
)
SELECT @CmprSz = CASE WHEN Seq1maxlenseq.MaxLen < Seq2maxlenseq.MaxLen THEN Seq1maxlenseq.MaxLen ELSE Seq2maxlenseq.MaxLen END
FROM Seq1maxlenseq, Seq2maxlenseq

UPDATE Seq1 SET MatchLineNum = 0
UPDATE Seq2 SET MatchLineNum = 0

/* IF @CmprSz IS NULL then we know that no matches exist between the sequences, and can thus exit early */IF @CmprSz IS NULL
    RETURN

-- here for testing; remove later:
PRINT '@CmprSz = ' + LTRIM(STR(@CmprSz))




/* 2/16/09 - the idea here is to compare blocks of lines from one table to  
   blocks from the other table. we start with the blocks of largest size
   and continue decreasing size until we get to blocks of one line each. 
   Whenever a block matches that of a block in the other table, all 
   matching lines are flagged to indicate the match, so as to exclude 
   those matches from subsequent comparisons. 
*/
WHILE @CmprSz > 0
BEGIN
  SET @NumCmprs = @Seq1Size - @CmprSz + 1
  SET @idx1 = 1

  WHILE @idx1 <= @NumCmprs
  BEGIN
    /* ensure that every element of subsequence is not yet matched: */    IF (SELECT COUNT(*) FROM Seq1 WHERE CodeLineNum BETWEEN @idx1 AND (@idx1 + @CmprSz - 1) AND MatchLineNum = 0) = @CmprSz
    BEGIN
      SELECT @cdtxt1 = CodeLineTxt FROM Seq1 WHERE CodeLineNum = @idx1

      /* we need to find the first match in table 2. since there can be multiple rows in table 2 that match
      the head of the subset from table 1, we need to check for multiples: */      SET @idx2_last = 0
      SET @idx2 = 0

      SET @AllMatch = 0
  
      WHILE @AllMatch = 0 AND @idx2 IS NOT NULL
      BEGIN
        SELECT @idx2 = MIN(CodeLineNum) 
          FROM Seq2 
          WHERE CodeLineTxt = @cdtxt1
          AND CodeLineNum > @idx2_last
          AND CodeLineNum <= (@Seq2Size - @CmprSz + 1) /* ensures that table 2 subset is big enough */          AND MatchLineNum = 0

        IF @idx2 IS NOT NULL
        BEGIN
          SET @idx3 = 1
 
          /* now check all matches. the easiest way to implement this checking is to first 
          assume that all lines match, and then record that fact that at least one does not. */          SET @AllMatch = 1
          WHILE @idx3 <= @CmprSz AND @AllMatch = 1
          BEGIN
            IF NOT EXISTS (SELECT 1 
              FROM Seq1 T1 
              JOIN Seq2 T2 
              ON T1.CodeLineTxt = T2.CodeLineTxt 
              AND T1.CodeLineNum = (@idx1 + @idx3 - 1)
              AND T2.CodeLineNum = (@idx2 + @idx3 - 1))

              SET @AllMatch = 0
 
            SET @idx3 = @idx3 + 1
          END

          IF @AllMatch = 1
          BEGIN
            UPDATE Seq1 SET MatchLineNum = @idx2
              WHERE CodeLineNum BETWEEN @idx1 AND (@idx1 + @CmprSz - 1)

            UPDATE Seq2 SET MatchLineNum = @idx1
              WHERE CodeLineNum BETWEEN @idx2 AND (@idx2 + @CmprSz - 1)
          END
        END

        SET @idx2_last = @idx2
      END /* WHILE @AllMatch = 1 AND @idx2 IS NOT NULL */    END /* IF (SELECT COUNT(*) FROM Seq1 WHERE CodeLineNum BETWEEN ... */
    SET @idx1 = @idx1 + 1
  END /* WHILE @idx1 <= @NumCmprs */
  SET @CmprSz = @CmprSz - 1
END /* WHILE @CmprSz > 0 */
RETURN

Rate

You rated this post out of 5. Change rating

Share

Share

Rate

You rated this post out of 5. Change rating