Removing invalid email addresses

  • I have a 4 million row table with emails from different sources.

    I need to clean it up and only have valid email addresses like

    abc@abc.com anything else has to be deleted.

    Here is the sample of data in my table.

    liaosdnjgabgsdg

    ;5252@fdsgg

    3264272

    dsgsadhaf.com

    @ouasfj.com

    There are more weird entries.

    I've already Ltrim and Rtrim the fields.

    How can i clean this table ?

    THANK YOU

     

    Alex S
  • Here's a suggestion:

    - Get the DBA Toolkit at http://www.sqlservercentral.com/columnists/mcoles/sql2000dbatoolkitpart2.asp

    - You can validate email addresses based on a regular expression similar to the following:

    SELECT dbo.fn_regex_match ('^[A-Z0-9._%-]+@[A-Z0-9.-]+\.[A-Z]{2,4}$', 'validemail@sqlservercentral.com', 'I+')

    If you're using SQL 2005 you can use SQLCLR to create a regex function instead and CROSS APPLY to execute the function directly on the columns.

    With SQL 2000 you'll need to play around with it.

  • Hi,

    you can use the regular expression with a user function like this:

    --select dbo.fnTestStringWithRegularExpression('pippo123', 4, 'T', 'T')

    ALTER         function [dbo].[fnTestStringWithRegularExpression]

     (

      @stringaDaControllare varchar(4096),      -- stringa da controllare

      @tipoStringa smallint,                  -- 1 = email

                                        -- 2 = numero di telefono italiano

                                        -- 3 = numero di telefono US

                                        -- 4 = stringa di lettere e numeri di almeno 8 caratteri

      @ricercaGlobale char(1),                -- T = cerca tutte le occorrenze

                                        -- F = cerca solo la prima occorrenza

      @ignoreCaseSensitive char(1)             -- T = ricerca ignorando case sensitive

                                        -- F = ricerca senza ignorare case sensitive

    &nbsp

    returns char(1)                         -- T = stringa da controllare ok

                                        -- F = stringa da controllare non corretta

    as

     begin

      -- espressioni di controllo in funzione del tipo di stringa da verificare

      declare @espressioneDiControllo varchar(1048)

      set @espressioneDiControllo = ''

      if (@tipoStringa = 1)

       begin

        -- email

        -- non accetta abc_@abc.it

        -- set @espressioneDiControllo = '^([a-zA-Z0-9_\-])+(\.([a-zA-Z0-9_\-])+)*@((\[(((([0-1])?([0-9])?[0-9])|(2[0-4][0-9])|(2[0-5][0-5])))\.(((([0-1])?([0-9])?[0-9])|(2[0-4][0-9])|(2[0-5][0-5])))\.(((([0-1])?([0-9])?[0-9])|(2[0-4][0-9])|(2[0-5][0-5])))\.(((([0-1])?([0-9])?[0-9])|(2[0-4][0-9])|(2[0-5][0-5]))\]))|((([a-zA-Z0-9])+(([\-])+([a-zA-Z0-9])+)*\.)+([a-zA-Z])+(([\-])+([a-zA-Z0-9])+)*))$'    --'^(([A-Za-z0-9]+_+)|([A-Za-z0-9]+\-+)|([A-Za-z0-9]+\.+)|([A-Za-z0-9]+\++))*[A-Za-z0-9]+@((\w+\-+)|(\w+\.))*\w{1,63}\.[a-zA-Z]{2,6}$'

        -- ==============================================================================================================================

        set @espressioneDiControllo = '^([a-zA-Z0-9_\-])+(\.([a-zA-Z0-9_\-])+)*@((\w+\-+)|(\w+\.))*\w{1,63}\.[a-zA-Z]{2,6}$'

       end

      if (@tipoStringa = 2)

       begin

        -- numero di telefono italiano

        set @espressioneDiControllo = '^([0-9]*\-?\ ?\/?[0-9]*)$'

       end

      if (@tipoStringa = 3)

       begin

        -- numero di telefono US

        set @espressioneDiControllo = '^(??<1>[(])?(?<AreaCode>[2-9]\d{2})(?(1)[)])(?(1)(?<2>[ ])|(??<3>[-])|(?<4>[ ])))?)?(?<Prefix>[1-9]\d{2})(?(AreaCode)(??(1)(?(2)[- ]|[-]?))|(?(3)[-])|(?(4)[- ]))|[- ]?)(?<Suffix>\d{4})$'

       end

      if (@tipoStringa = 4)

       begin

        -- stringa di lettere e numeri di almmeno 8 caratteri

        set @espressioneDiControllo = '(?!^[0-9]*$)(?!^[a-zA-Z]*$)^([a-zA-Z0-9]{8,20})$'

       end

      if (@espressioneDiControllo <> '')

       begin

        -- esegue il controllo

        declare @hr integer

        declare @objRegExp integer

        declare @results bit

        declare @output char(1)

        

        -- creazione oggetto com per le regular expression

        exec @hr = sp_OACreate 'VBScript.RegExp', @objRegExp output

      

        if (@hr <> 0)

         begin

          set @results = 0

      

          goto USCITA

         end

         

        -- imposta le proprietà dell'oggetto com

        -- pattern rispetto cui testare

        exec @hr = sp_OAsetProperty @objRegExp, 'Pattern', @espressioneDiControllo

      

        if (@hr <> 0)

         begin

          set @results = 0

      

          goto USCITA

         end

      

        -- verifica globale o si ferma alla prima occorrenza

        if (@ricercaGlobale = 'T')

         begin

          exec @hr = sp_OAsetProperty @objRegExp, 'Global', True

         end

        else

         begin

          exec @hr = sp_OAsetProperty @objRegExp, 'Global', False

         end

      

        if (@hr <> 0)

         begin

          set @results = 0

      

          goto USCITA

         end

      

        -- ignora le lettere maiuscole

        if (@ignoreCaseSensitive = 'T')

         begin

          exec @hr = sp_OAsetProperty @objRegExp, 'IgnoreCase', True

         end

        else

         begin

          exec @hr = sp_OAsetProperty @objRegExp, 'IgnoreCase', False

         end

      

        if (@hr <> 0)

         begin

          set @results = 0

      

          goto USCITA

         end

      

        -- esegue il controllo della stringa chiamando il metodo Test

        exec @hr = sp_OAMethod @objRegExp, 'Test', @results output, @stringaDaControllare

      

        if (@hr <> 0)

         begin

          set @results = 0

      

          goto USCITA

         end

      

        -- distrugge l'oggetto com

        exec @hr = sp_OADestroy @objRegExp

      

        if (@hr <> 0)

         begin

          set @results = 0

      

          goto USCITA

         end

       end

      else

       begin

        -- tipologia non prevista

        set @results = 0

        goto USCITA

       end

    USCITA:

      if (@results = 1)

       begin

        set @output = 'T'

       end

      else

       begin

        set @output = 'F'

       end

      return @output

     end

  • I can't comment on the proposed solutions, but if my email address

    is in your database, it's in the format

    myname@agency.state.statecode.us

    I find it extremely annoying when I try to submit that as my email

    address, and am told that it's invalid.

    Mattie

     

     

  • Hi,

    i test my user function and i reveived OK:

    select dbo.fnTestStringWithRegularExpression('myname@agency.state.statecode.us', 1, 'T', 'T')

    tell T, so the email address is ok.

    Bye

  • Two comments about the sp_OACreate method -

    1) VBScript doesn't support many regex features that are standard in Perl (lookbehind, atomic grouping, possessive quantifiers, \A, \Z, etc.) 

    2) sp_OACreate can cause memory leaks:

    http://support.microsoft.com/kb/897572

    http://support.microsoft.com/kb/816937/EN-US/

    Make sure you have SP 4 installed and that sp_Destroy is getting called every time if you use this method.  I've seen it shut down production servers on a daily basis.

  • --I don't know the full rules, but here's some simple SQL code
    --which does at least some of it.
    --It could be used in a where clause or case statement,
    --inside a function or not.
    --More conditions can easily be added along similar lines,
    --so this ought to be able to do what you want with some tweaking.
    --
    --This allows any number of dots, and one @.
    --One of the dots must be after the @.
    --
    --For all dots, and the @:
    --none of these may be at the beginning or end of the string,
    --and no two may be adjacent.
    --
    --All other characters must be alphanumeric or _ or -
     

    declare

    @vc table(vc varchar(50))

    insert

    @vc

    select

    'dhskjdhfkss' union all

    select

    'dhsk@fkss' union all

    select 'dhsk@fkss.' union all

    select ';dhsk@fkss.ooo' union all

    select

    'dh-sk@fk-ss.ooo' union all

    select

    'dh\sk@fk-ss.ooo' union all

    select

    '@fkss.das' union all

    select

    'dhsk@fkss.a.asd.asd.aaaaaa' union all

    select

    'dhsk@fkss.ddd..dddd' union all

    select

    'dh+k@fkss.ddd.d' union all

    select

    'dhsk@fkss.ddd.kk.dd' union all

    select

    'dhsk@f#ss.ddd' union all

    select

    'd--k@f_ss.ddd' union all

    select

    '------@__-ss.___._._._' union all

    select

    'dhsk@f#ss.ddd' union all

    select

    'dh_k@fkss.ddd' union all

    select

    'dhsk@fkss.' union all

    select

    '.dhsk@fkss.nnn' union all

    select

    'sgsgaha.sksjkajd.d.d.dhsk@fkss.nnn.dsdsd.as' union all

    select

    'd.h.s.k@f.k.s.s.f' union all

    select

    'd@f.s' union all

    select

    'd.hs@fkss.d' union all

    select

    'd.sk.@ggg.fkss.ddd'

    --

    select

    vc.vc VALID____________________________

    from

    @vc vc

    where

    not vc.vc like '%[^a-Z0-9.@!_-]%' escape '!' --contains only alphanumeric, '-', '.' and '@'

    and

    not vc.vc like '%@%@%' --has no more than one '@' sign

    and

    not vc.vc like '%..%' --no adjacent dots

    and

    not vc.vc like '.%' --no leading dot

    and not vc.vc like '%.' --no trailing dot

    and vc.vc like '%[^.]@[^.]%._%' --has an '@' without dots next to it, and to its right a dot followed by another char.

    --

    select

    vc.vc INVALID__________________________

    from

    @vc vc

    where

    vc.vc like '%[^a-Z0-9.@!_-]%' escape '!' --contains only alphanumeric, '-', '.' and '@'

    or

    vc.vc like '%@%@%' --has no more than one '@' sign

    or

    vc.vc like '%..%' --no adjacent dots)

    or

    vc.vc like '.%' --no leading dot

    or vc.vc like '%.' --no trailing dot

    or

    not vc.vc like '%[^.]@[^.]%._%' --has an '@' without dots next to it, and a dot afterwards.

    Tim Wilkinson

    "If it doesn't work in practice, you're using the wrong theory"
    - Immanuel Kant

  • THANK YOU ALL FOR RESPONDING.

    Alex S

Viewing 8 posts - 1 through 7 (of 7 total)

You must be logged in to reply to this topic. Login to reply