#
#  read_text_file - very simple function
#

def read_text_file(file_path, encoding='ansi'):
    """Return the entire contents of a text file as one string.

    Args:
        file_path: Path of the file to read.
        encoding: Name of the text encoding used to decode the file.  The
            default, ``'ansi'``, is a codec alias that Python registers on
            Windows only; where it is not registered, the locale's
            preferred encoding is used instead so the default call does
            not fail with ``LookupError``.

    Returns:
        The decoded file contents.

    Raises:
        OSError: If the file cannot be opened.
        LookupError: If an explicitly requested encoding is unknown.
        UnicodeDecodeError: If the file is not valid in the encoding.
    """
    # Imported locally so this helper does not depend on module-level imports.
    import codecs
    import locale

    if isinstance(encoding, str) and encoding.lower() == 'ansi':
        try:
            codecs.lookup(encoding)
        except LookupError:
            # 'ansi' is not available on this platform; use the closest
            # equivalent, the locale's preferred encoding.
            encoding = locale.getpreferredencoding(False)

    with open(file_path, 'r', encoding=encoding) as file:
        return file.read()


#
#  write_text_file - very simple function
#

def write_text_file(file_path, content, encoding='utf-8'):
    """Write a string to a text file, replacing any existing file.

    Args:
        file_path: Path of the file to create or overwrite.
        content: Text to write.
        encoding: Name of the text encoding used to encode the file.
            Defaults to UTF-8 so the result does not depend on the
            platform's default encoding and can be read back as UTF-8.

    Raises:
        OSError: If the file cannot be opened for writing.
        UnicodeEncodeError: If the content cannot be encoded.
    """
    with open(file_path, 'w', encoding=encoding) as file:
        file.write(content)


#
#  parse_tsql_str1 - parse T-SQL script keeping only CREATE TABLE statements
#

def parse_tsql_str1(unprocessed_str):
    """Keep only the CREATE TABLE batches of a T-SQL script.

    The script is split into batches on GO separator lines.  A batch is
    kept when, after trimming surrounding whitespace, it starts with
    ``CREATE TABLE`` (case-insensitive).

    Args:
        unprocessed_str: Full text of the T-SQL script.

    Returns:
        The kept batches, trimmed and joined with a ``GO`` line between
        them.  An empty string if no batch qualifies.
    """
    # GO only separates batches when it stands alone on a line.  Splitting
    # on the bare substring would cut identifiers such as [CARGO] in half.
    batches = re.split(
        r'^[ \t]*GO[ \t]*\r?$',
        unprocessed_str,
        flags=re.IGNORECASE | re.MULTILINE,
    )

    create_table_stmts = []
    for batch in batches:
        trimmed = batch.strip()
        if trimmed.upper().startswith('CREATE TABLE'):
            create_table_stmts.append(trimmed)

    return '\nGO\n'.join(create_table_stmts)


#
#  split_statements - split T-SQL script into individual statements
#

def split_statements(tsql_string):
    """Split a T-SQL script into its individual batches.

    Args:
        tsql_string: Script text in which batches are separated by lines
            containing only ``GO`` (any letter case, optional surrounding
            spaces or tabs).

    Returns:
        A list of the non-empty batches with surrounding whitespace
        removed.  The separator lines themselves are not included.
    """
    # Anchor GO to a whole line so that a final "GO" without a trailing
    # newline is still treated as a separator, and text that merely ends a
    # line with the letters "GO" is left alone.
    pieces = re.split(
        r'^[ \t]*GO[ \t]*\r?$',
        tsql_string,
        flags=re.IGNORECASE | re.MULTILINE,
    )
    trimmed = (piece.strip() for piece in pieces)
    return [piece for piece in trimmed if piece]


#
#  join_statements - join individual T-SQL statements into a single string
#

def join_statements(statements):
    """Combine statements into one script with a GO line between each pair.

    Args:
        statements: Iterable of statement strings.

    Returns:
        The statements joined by a newline, ``GO``, and a newline.  No
        separator is added before the first or after the last statement.
    """
    batch_separator = '\nGO\n'
    return batch_separator.join(statements)


#
#  process_tsql_str2 - rules to convert T-SQL to Fabric SQL
#

import re

def process_tsql_str2(tsql_string):
    """Apply the T-SQL to Fabric SQL rewrite rules to every batch.

    The script is split into batches with split_statements, each batch is
    passed through an ordered series of regular-expression substitutions,
    and the results are joined again with join_statements.

    The order of the rules matters: later rules run on the output of the
    earlier ones (for example, the PRIMARY KEY rule relies on the WITH
    options having already been cut away).

    Args:
        tsql_string: Script text whose batches are separated by GO lines.

    Returns:
        The rewritten script as a single string.
    """

    # Split the input string on "GO" keyword
    statements = split_statements(tsql_string)

    processed_statements = []

    for statement in statements:

        # Remove unwanted clauses and options.
        # TEXTIMAGE_ON [PRIMARY] must be removed before the shorter
        # ON [PRIMARY]; otherwise the shorter rule eats its tail and leaves
        # a stray "TEXTIMAGE_" in the output.
        statement = re.sub(r'\bTEXTIMAGE_ON\s+\[PRIMARY\]', '', statement, flags=re.IGNORECASE)
        statement = re.sub(r'ON \[PRIMARY\]', '', statement, flags=re.IGNORECASE)
        # With DOTALL this drops everything from the first ")WITH" to the
        # end of the batch and leaves a single ")" in its place.
        statement = re.sub(r'\)WITH.*$', ')', statement, flags=re.IGNORECASE | re.DOTALL)
        # Accept any integer seed/increment, not just single digits.
        statement = re.sub(r'\bIDENTITY\s*\(\s*-?\d+\s*,\s*-?\d+\s*\)', '', statement, flags=re.IGNORECASE)
        statement = re.sub(r'\bROWGUIDCOL\b', '', statement, flags=re.IGNORECASE)
        statement = re.sub(r'\bNOT FOR REPLICATION\b', '', statement, flags=re.IGNORECASE)

        # Replace data types and options
        statement = re.sub(r'\[tinyint\]', '[smallint]', statement, flags=re.IGNORECASE)
        statement = re.sub(r'\[sysname\]', '[varchar](128)', statement, flags=re.IGNORECASE)
        statement = re.sub(r'\[nvarchar\]', '[varchar]', statement, flags=re.IGNORECASE)
        statement = re.sub(r'\[nchar\]', '[char]', statement, flags=re.IGNORECASE)
        statement = re.sub(r'\(max\)', '(4000)', statement, flags=re.IGNORECASE)
        statement = re.sub(r'\[datetime\]', '[datetime2](6)', statement, flags=re.IGNORECASE)
        statement = re.sub(r'\[datetimeoffset\]', '[datetime2]', statement, flags=re.IGNORECASE)
        statement = re.sub(r'\[money\]', '[decimal](18, 4)', statement, flags=re.IGNORECASE)
        # Replaces the word AS and the rest of its line with a varchar
        # column type (presumably aimed at computed columns - confirm).
        statement = re.sub(r'\bas\b.*', '[varchar](4000),', statement, flags=re.IGNORECASE)
        # Replaces [xml] and everything up to the last comma on that line.
        statement = re.sub(r'\[xml\].*,', '[varchar](4000),', statement, flags=re.IGNORECASE)

        # Replace specific column definitions: [dbo].[<name>] references
        # are swapped for built-in types (presumably user-defined alias
        # types of the source database - confirm).
        statement = re.sub(r'\[dbo\]\.\[AccountNumber\]', '[varchar](15)', statement, flags=re.IGNORECASE)
        statement = re.sub(r'\[dbo\]\.\[Flag\]', '[bit]', statement, flags=re.IGNORECASE)
        statement = re.sub(r'\[dbo\]\.\[Name\]', '[varchar](50)', statement, flags=re.IGNORECASE)
        statement = re.sub(r'\[dbo\]\.\[NameStyle\]', '[bit]', statement, flags=re.IGNORECASE)
        statement = re.sub(r'\[dbo\]\.\[OrderNumber\]', '[varchar](25)', statement, flags=re.IGNORECASE)
        statement = re.sub(r'\[dbo\]\.\[Phone\]', '[varchar](25)', statement, flags=re.IGNORECASE)

        # Remove constraints.  CHECK, DEFAULT, UNIQUE and FOREIGN KEY
        # constraints are deleted outright; a PRIMARY KEY constraint is
        # replaced by ")" because the earlier ")WITH" rule has already
        # removed the text that followed it.
        statement = re.sub(r'CONSTRAINT\s+\[.*?\]\s+CHECK\s+\(.*?\)', '', statement, flags=re.IGNORECASE)
        statement = re.sub(r'CONSTRAINT\s+\[.*?\]\s+DEFAULT\s+\(.*?\)', '', statement, flags=re.IGNORECASE)
        statement = re.sub(r'CONSTRAINT\s+\[.*?\]\s+UNIQUE\s+NONCLUSTERED\s+\(\s*(?:\[(\w+)\] (\w+),\s*)*(?:\[(\w+)\] (\w+))\s*\)', '', statement, flags=re.IGNORECASE)
        statement = re.sub(r'CONSTRAINT\s+\[.*?\]\s+UNIQUE\s+CLUSTERED\s+\(\s*(?:\[(\w+)\] (\w+),\s*)*(?:\[(\w+)\] (\w+))\s*\)', '', statement, flags=re.IGNORECASE)
        statement = re.sub(r'CONSTRAINT\s+\[.*?\]\s+FOREIGN\s+KEY\s+\(.*?\)\s+REFERENCES\s+\[.*?\]\.\[.*?\]\(.*?\)', '', statement, flags=re.IGNORECASE)
        statement = re.sub(r'CONSTRAINT\s+\[.*?\]\s+PRIMARY\s+KEY\s+CLUSTERED\s+\(\s*(?:\[(\w+)\] (\w+),\s*)*(?:\[(\w+)\] (\w+))\s*\)', ')', statement, flags=re.IGNORECASE)
        statement = re.sub(r'CONSTRAINT\s+\[.*?\]\s+PRIMARY\s+KEY\s+NONCLUSTERED\s+\(\s*(?:\[(\w+)\] (\w+),\s*)*(?:\[(\w+)\] (\w+))\s*\)', ')', statement, flags=re.IGNORECASE)

        # Change schemas: tables in [SalesLT] or [dbo] are created in [raw]
        statement = re.sub(r'CREATE TABLE \[SalesLT\]', 'CREATE TABLE [raw]', statement, flags=re.IGNORECASE)
        statement = re.sub(r'CREATE TABLE \[dbo\]', 'CREATE TABLE [raw]', statement, flags=re.IGNORECASE)

        # Clean up alignment, commas, and spaces
        # Every NOT NULL column becomes nullable.
        statement = re.sub(r'NOT NULL', 'NULL', statement, flags=re.IGNORECASE)
        # Put the opening parenthesis after a bracketed name on its own line.
        statement = re.sub(r'\]\(\s', ']\n(', statement)
        # Drop a trailing comma that is followed by whitespace, a space and ")".
        statement = re.sub(r',\s \)', '\n)', statement)
        # Collapse runs of spaces left behind by the removals above.
        statement = re.sub(r'[ ]+', ' ', statement)
        processed_statements.append(statement)

    # Join the processed statements with "GO" keyword
    return join_statements(processed_statements)


#
#  replace_extension - replace file extension
#

import os

def replace_extension(filename, new_extension):
    """Return *filename* with its extension swapped for *new_extension*.

    Args:
        filename: File name or path.  Only the last extension is removed;
            a name without an extension is left whole.
        new_extension: Text appended to the stripped name.  It is used
            as given, so include the leading dot.

    Returns:
        The stripped name followed by *new_extension*.
    """
    root, _old_extension = os.path.splitext(filename)
    return root + new_extension



#
#  create_notebook - write out ipython notebook
#

import nbformat as nbf 

# create a jupyter notebook from the tsql statements
def create_notebook(statements, code_snippet, notebook_file):
    """Write a notebook with one markdown cell and one code cell per statement.

    Args:
        statements: Iterable of SQL statement strings.  The third
            space-separated token on the first line of each statement is
            used as the table name (for ``CREATE TABLE [s].[t]`` this is
            ``[s].[t]``).
        code_snippet: Template for the code cell.  Every ``{x}`` is
            replaced by the table name and every ``{y}`` by the statement.
        notebook_file: Path of the notebook file; overwritten if present.

    Raises:
        IndexError: If the first line of a statement has fewer than three
            space-separated tokens.
        OSError: If the notebook file cannot be opened for writing.
    """

    # tagging for fabric
    fabric = {
        'language_info': {'name': 'sql'},
        'microsoft': {'language': 'sql', 'language_group': 'sqldatawarehouse'},
        'kernel_info': {'name': 'sqldatawarehouse'},
        'kernelspec': {'name': 'sqldatawarehouse', 'language': 'sqldatawarehouse', 'display_name': 'sqldatawarehouse'}
    }

    # new notebook, starting with no cells
    nb = nbf.v4.new_notebook(metadata=fabric)
    nb['cells'] = []

    # for each statement, create a markdown and code cell; tables are
    # numbered from 1 in the markdown titles
    for cnt, stmt in enumerate(statements, start=1):

        # grab first line of code block, split on spaces
        first = stmt.split('\n')[0]
        parts = first.split(' ')

        # original table name is the third token of the first line
        org_table_name = parts[2]

        # create a title
        title = '<font size="5"> Table {} - {}</font>'.format(cnt, org_table_name)

        # add markdown
        nb['cells'].append(nbf.v4.new_markdown_cell(title))

        # fill the template with the table name and the statement
        code = code_snippet.replace("{x}", org_table_name).replace("{y}", stmt)

        # add code cell
        nb['cells'].append(nbf.v4.new_code_cell(code, metadata={'language_info': {'name': 'sql'}}))

    # overwrite the notebook; notebooks are UTF-8 JSON, so do not rely on
    # the platform's default encoding
    with open(notebook_file, 'w', encoding='utf-8') as f:
        nbf.write(nb, f)


#
#  Main program
#

# Change working directory
# Change working directory so the relative file names below resolve there.
# NOTE(review): this path is machine-specific and must be edited before
# running the script elsewhere.
path = r"C:\Users\jminer\OneDrive - Insight\Desktop\SSC\SSC - Article 21\GEN-AI"
os.chdir(path)


# grab file names
input_file = 'advwrks2012lt.sql'
output_file = 'advwrks2012lt.tmp1'

# stage 1 - read the UTF-16 input script, keep only the CREATE TABLE
# statements, and write them to the first temporary file
input_content = read_text_file(input_file, 'utf-16')
output_content = parse_tsql_str1(input_content)
write_text_file(output_file, output_content) 


# grab file names
input_file = 'advwrks2012lt.tmp1'
output_file = 'advwrks2012lt.tmp2'

# stage 2 - read the stage 1 file back as UTF-8, apply the T-SQL to
# Fabric SQL conversion rules, and write the second temporary file
input_content = read_text_file(input_file, 'utf-8')
output_content = process_tsql_str2(input_content)
write_text_file(output_file, output_content) 


# grab file names
# NOTE: input_file is assigned here but not read below; stage 3 works on
# the stage 2 result still held in output_content.
input_file = 'advwrks2012lt.tmp2'
output_file = 'advwrks2012lt.ipynb'

# code template, read with read_text_file's default encoding
snippet_file = 'sql-snippet-01.txt'
code_snippet = read_text_file(snippet_file)  

# stage 3 - split the converted script into statements and write one
# markdown cell plus one code cell per statement to the notebook file
statements = split_statements(output_content)
create_notebook(statements, code_snippet, output_file)
            
