Technical Article

A google like search


 The problem:

We must implement a google like search over a existing database. The database contains several tables for persons, corporations, organizations, addresses, etc.
It must be a custom solution (no thirdy-party solutions neither SQL FTS).
Its for a new web application (dozen to hundreds simultaneous users, response time under 1s for most cases, 5s for worst case).
We ill create a single table (registers) to point to the existing tables. We ill need also a table of tags and a relationship table (registers x tags).
A batch ill scan the relevant tables (persons, organizations, addresses, etc) and populate the tables (registers, tags, tag_register) once a day.
The search ill be performed in all registers at once. 
A "register" is a abstraction for a person a organization or any other object.
The object's data lie in a "main" table (persons table, organizations table, etc)
and in many "members" tables (addresses, contacts, profile, etc).
A "tag" is a word wich ocurrs in any of the table's text fields.
In a search for tag1, tag2 and tag3
If exists: tag1 AND tag2-100 ocurrences, tag1 AND tag3-10 ocurrences
and there's no ocurrences of tag2 and tag3 for the any register we ignore tag3
since this returns mores results than ignoring tag2.
Some objects share many properties and the search can use some properties to filtering and/or ordering,
in this example we ill simple use the name property to populate a column (vc_register) to be used for ordering.
Below is the script to create the tables and procedure and populate the database with dummy data.
The procedure performs the search in a reasonable time and was implementd as a conceptual proof.
We gave a try to table variables and creating/droping indexes in temporary tables but we found no indexed temporary tables worked better.
Also the FK constraints increase performance is better than we can have guessed.
Increasing the number of tags dont impact in performance, increasing the number of registers dont impact directly in performance.
If a single tag is present in millions of registers returning all matches for it can slow down the response time.
The response time is dicted by the number os comparasions to find the intersection between to matches (join each tag search result).
Chosing wisely the words from the database to populate the tags table ill impact directly in performance.
Words like "the", "and", "a" must be avoided (low meaning, high ocurrence)
We ill must keep a eye in the most common used tags and the tags with the highest ocurrences.
/* this table points to several production tables *//* +20 columns ill be added in the final version to implement filters and order by options */create table dbo.register(
 id_register int not null identity(1,1) primary key --A unique id to gather several rows from several tables helps a lot
,id_from_table int not null
,it_table char(1) not null
,vc_register varchar(200) not null -- used to order by de final results
create nonclustered index [ix_register-vc_register] on dbo.[register](vc_register);

/* contains all words can be found in all fields from all tables */create table dbo.tag(
id_tag int identity(1,1) not null primary key nonclustered
,vc_tag varchar(50) not null -- clustered for performance
,ocurrences int not null --A count of the ocurrences of the tag, used do performance and to maximize results when one or more of the tags causes the entire search to not return matches
create clustered index [uix_tag-vc_tag] on dbo.[tag](vc_tag);

/* relation between words and ocurrences in the registers */create table dbo.tag_register(
id_register int not null            --clustered index
,id_tag int not null                --unique
create clustered index [uix_tag_register-id_register] on dbo.[tag_register](id_register);
create nonclustered index [ix_tag_register-id_tag] on dbo.[tag_register](id_tag);
create unique index [uix_tag_register] on dbo.[tag_register](id_register,id_tag);
alter table dbo.[tag_register] add constraint [FK_tag_register-tag] FOREIGN KEY (id_tag) references dbo.[tag] (id_tag);
alter table dbo.[tag_register] add constraint [FK_tag_register-register] FOREIGN KEY (id_register) references dbo.[register] (id_register);

/* implements a "tag's cloud" */
create table dbo.tag_dia(
id_tag int not null
,dia date not null
,qtd smallint not null

--create type dbo.IntArray as table
--( id int primary key nonclustered)


/* search for words in millions of rows *//* rules: *//* 1.must accept n-terms to search *//* 2.must performe the search *//* in multiple tables at once *//* 3.performe not inclusive search *//* (term1 AND term2 AND ...) in opposition *//* to (term1 OR term2 OR ...) *//* 4.if a term causes the entire search to *//* fail ignore it *//* 5.response time must be less than 1s for *//* most cases, 5s to the worst scenario *//* (intersection of multiple 100k matches) */
create procedure dbo.SearchExclusive(@SearchTerm varchar(max))
as begin
 set nocount on
    declare @id_tag int--, @registers IntArray, @registers_temp IntArray;
    create table #registers (id int);

    --loop for each search term
    DECLARE SearchTermCursor cursor FAST_FORWARD READ_ONLY for
        select t.id_tag from tag t
        join dbo.Split(@SearchTerm,';') s on s.vc_item = t.vc_tag
        order by t.ocurrences desc
    OPEN SearchTermCursor
    FETCH next from SearchTermCursor into @id_tag -- fetch the first row

    if (@@FETCH_STATUS = 0)--if first row ok
        insert into #registers select tr.id_register from dbo.tag_register tr where tr.id_tag = @id_tag
        --create clustered index [uixid] on #registers(id)
        FETCH next from SearchTermCursor into @id_tag -- fetch the second row
    WHILE (@@FETCH_STATUS = 0 and exists(select id from #registers)) -- fetch # row
     create table #registers_temp(id int)
        insert into #registers_temp
         select tr.id_register
         from dbo.tag_register tr
         join #registers r on = tr.id_register
         where tr.id_tag = @id_tag

        --if this particular tag makes no match to be returned then ignore it
        if exists(select id from #registers_temp)
            truncate table #registers; -- erase
            --drop index #registers.[uixid]
            --create clustered index [uixid_temp] on #registers_temp(id)
            insert into #registers select from #registers_temp t
            --create clustered index [uixid] on #registers(id)        

        drop table #registers_temp

    FETCH next from SearchTermCursor into @id_tag

    close SearchTermCursor
    deallocate SearchTermCursor
    --create clustered index [uixid] on #registers(id)
    --return ill be paged

    select top 200 r.* from dbo.register r
    join #registers t on = r.id_register
    order by r.vc_register
    drop table #registers

 --drop table #registers_temp

 set nocount off


/* Correctness testing data */
insert into register values(0,'','abc3')
insert into register values(0,'','abc2')
insert into register values(0,'','abc1')
insert into tag values('abc',3);--tag 1
insert into tag_register values(1,1);
insert into tag_register values(2,1);
insert into tag_register values(3,1);
--select * from tag_register

insert into register values(0,'','def1')
insert into register values(0,'','def2')
insert into register values(0,'','def3')
insert into register values(0,'','def4')
insert into tag values('def',4);--tag 1
insert into tag_register values(4,2);
insert into tag_register values(5,2);
insert into tag_register values(6,2);
insert into tag_register values(7,2);

insert into register values(0,'','abcdef1')
insert into register values(0,'','abcdef3')
insert into register values(0,'','abcdef2')
insert into register values(0,'','abcdef4')
insert into tag_register values(8,2);
insert into tag_register values(9,2);
insert into tag_register values(10,2);
insert into tag_register values(11,2);
insert into tag_register values(8,1);
insert into tag_register values(9,1);
insert into tag_register values(10,1);
insert into tag_register values(11,1);

insert into tag values('qwerty',1000);
insert into tag values('xyz',10000);

--select * from tag
insert into tag values('cross alfa',100000);
insert into tag values('cross beta',100000);
insert into tag values('cross gamma',100000);
insert into tag values('cross delta',100000);


/* perfomance testing data */set nocount on

declare @id int
set @id = (select COUNT(*) from tag)
while(@id < 10000)
 set @id = @id +1;
 insert into tag values('tag '+CAST(@id as varchar(10)), 0)


--decreate the clustered indexes
--inserting can take several minutes to populate thousands rows
drop index dbo.[tag_register].[uix_tag_register-id_register];
drop index dbo.[tag_register].[ix_tag_register-id_tag];


declare @id int
set @id = (select COUNT(*) from register)
while(@id < 2000000)

 set @id = @id +1;
 insert into register values(0,'','cross'+CAST(@id as varchar(10)))

 insert into tag_register values(@id,5);
 insert into tag_register values(@id,6);
 insert into tag_register values(@id,7);
 insert into tag_register values(@id,8);



create clustered index [uix_tag_register-id_register] on dbo.[tag_register](id_register);
create nonclustered index [ix_tag_register-id_tag] on dbo.[tag_register](id_tag);

drop index dbo.[tag].[uix_tag-vc_tag];


declare @idt int, @idr int
set @idt = 1000
while(@idt < 2000)

 set @idr = 1000
 set @idt = @idt +1;
    while(@idr < 2000)

 set @idr = @idr +1;
     insert into tag_register values(@idr,@idt);


create clustered index [uix_tag-vc_tag] on dbo.[tag](vc_tag);

set nocount off

/* testing */
exec dbo.SearchExclusive 'none?;';--must return no results

exec dbo.SearchExclusive 'abc;';

exec dbo.SearchExclusive 'def;none';--must ignore none

exec dbo.SearchExclusive 'def;abc;';

exec dbo.SearchExclusive 'def;abc;qwerty;';

exec dbo.SearchExclusive 'qwerty;';

exec dbo.SearchExclusive 'cross alfa';

exec dbo.SearchExclusive 'cross alfa;cross beta;cross gamma;cross delta';

exec dbo.SearchExclusive 'cross alfa;cross beta;fnord;qwerty;def;abc;qwerty;cross gamma;cross delta;';

exec dbo.SearchExclusive 'tag 1001;tag 1002;tag 1003;tag 1003;tag 1004;tag 1005;tag 1006;tag 1007;tag 1008;tag 1009;tag 1010;tag 1011;tag 1012;tag 1013;tag 1014;tag 1015;tag 1016;tag 1017;tag 1018;tag 1019;tag 1020;';


3.29 (7)

You rated this post out of 5. Change rating




3.29 (7)

You rated this post out of 5. Change rating