TidierOrg / TidierDB.jl

Tidier database analysis in Julia, modeled after the dbplyr R package.
MIT License
35 stars 3 forks source link

Parsing Interpolation #22

Open drizk1 opened 2 months ago

drizk1 commented 2 months ago

Currently to use !! interpolation, a user needs to write the below

add_interp_param!(:var_name, value::{symbol, vest, string, float etc})
# allows
columns = [:mtcars, :mpg] # or as string, just 1 number etc
add_interp_param!(var_name, columns)
@chain begin
        db_table(con, :My_table)
        @filter(!!var_name) # and var name is replaced.
        @show_query
    end

However, ideally, TidierDB would replicate TidierData.jl's parsing (no add_interp_param! for ex)

var_name = value # can get symbol, vector, string, float etc
@select(df, !!var_name) 

Initially, I thought I had this working but using eval causes scoping issues when it is exported as part of a package so it doesnt work (!!var_name It will say that the var_name is no assigned.) the line from

variable_value = eval(variable)  # Evaluate to get the symbol or direct value

So to created a temp fix using a struct and context as below to enable !!var_name to work when used with add_interp_param!

            variable_value = haskey(GLOBAL_CONTEXT.variables, variable) ? GLOBAL_CONTEXT.variables[variable] : missing

The problem now comes up in function writing issue where you want to interpolate functions. #20 where macros must be used instead (but that has limits as well ie macro will only put what it sees and can use a variable defined elsewhere.)

I have tried retrofitting the TidierData.jl one with no avail either, but I suspect there is a way.

I plan to continue experimenting (with limited expectation for success) and I am very open for help, as fixing this would really smooth out TidierDBs ability to be part of a pipeline

drizk1 commented 1 month ago

Switching to make the macro logic a function and then interpolating into that has been relatively successful (barring breaking tidy selection which should be fixable.) Source code for @select and @filter are below for convenience. Altho my preference for mutate, filter and summarize is to stay as a macro. so !! interpolation works in

col_names = [:mpg, :vs, :carb]
@chain db_table(con, :mtcars) TidierDB.@select(!!col_names)  @aside @show_query(_)
## debug print messages
b4     (:($(Expr(:escape, :col_names))),)
expr in qoute b4 interp:     (:($(Expr(:escape, :col_names))),)
exprsstring after interp:     [[:mpg, :vs, :carb]]
fxn expr ([:mpg, :vs, :carb],)
## end debug print messages

SELECT mpg, vs, carb
        FROM mtcars

The challenge is now getting it work with @filter @summarize and @mutate. While the above example works, the expression is evaluated before it is passed to the function with the logic so it receives which throws an error. or mixing in column names creates an issue. where ERROR: MethodError: no method matching isless(::Int64, ::Symbol)

conditions = 4
@chain db_table(con, :mtcars) begin
    TidierDB.@filter(!!conditions > 4)
end
## debug print messages
conditions : (:($(Expr(:escape, :conditions)) > 4),)
conditions_expr: Bool[0]
## end debug print messages

once we can get a fix for this, the rest of interpolation should be smooth sailing ( minus some fixable bugs/breaks)

macro select(sqlquery, exprs...)
    exprs = parse_blocks(exprs...)
    exprs = parse_interpolation2.(exprs)
    println("b4     ", exprs)
    return quote
        println("expre:     ", $exprs)
        exprs_str = $(Expr(:vect, map(x -> isa(x, Symbol) ? string(x) : x, exprs)...))
        println("exprsstring:     " ,exprs_str)

        $(esc(sqlquery)) = select_function($(esc(sqlquery)), exprs_str...)
    end
end

function select_function(sqlquery, exprs...)
    columns = parse_tidy_db(exprs, sqlquery.metadata)
    columns_str = join(["SELECT ", join([string(column) for column in columns], ", ")])
    sqlquery.select = columns_str
    sqlquery.metadata.current_selxn .= 0

    for col in columns
        if occursin(".", col)
            table_col_split = split(col, ".")
            table_name, col_name = table_col_split[1], table_col_split[2]

            for idx in eachindex(sqlquery.metadata.current_selxn)
                if sqlquery.metadata.table_name[idx] == table_name && 
                   sqlquery.metadata.name[idx] == col_name
                    sqlquery.metadata.current_selxn[idx] = 2
                end
            end
        else
            matching_indices = findall(sqlquery.metadata.name .== col)
            sqlquery.metadata.current_selxn[matching_indices] .= 1
        end
    end
    return sqlquery
end
macro filter(sqlquery, conditions...)
    # Process the conditions
    conditions = parse_blocks(conditions...)
    conditions = parse_interpolation2.(conditions)

    println("conditions : ", conditions)

    # Construct the final expression to be returned by the macro
    return quote
        sq = $(esc(sqlquery))
        # Ensure the conditions are treated as expressions and then convert to strings
        conditions_expr = map(x -> string(x), $(Expr(:vect, conditions...)))
        println("conditions_expr: ", conditions_expr)
        # Call the filter function with the SQL query and the stringified conditions
        filter_function(sq, conditions_expr...)
    end
end

function filter_function(sqlquery, conditions...)

    if isa(sqlquery, SQLQuery)
        if !sqlquery.is_aggregated
            cte_name = "cte_" * string(sqlquery.cte_count + 1)
            combined_conditions = String[]

            for condition in conditions
                condition_str = string(expr_to_sql(condition, sqlquery))
                condition_str = replace(condition_str, "'\"" => "'", "'\"" => "'", "\"'" => "'", "[" => "(", "]" => ")")
                push!(combined_conditions, condition_str)
            end

            combined_condition_str = join(combined_conditions, " AND ")
            new_cte = CTE(name=cte_name, select="*", from=(isempty(sqlquery.ctes) ? sqlquery.from : last(sqlquery.ctes).name), where=combined_condition_str)
            push!(sqlquery.ctes, new_cte)
            sqlquery.from = cte_name
            sqlquery.cte_count += 1

        else
            aggregated_columns = Set{String}()

            if !isempty(sqlquery.select)
                for part in split(sqlquery.select, ", ")
                    if occursin(" AS ", part)
                        aggregated_column = strip(split(part, " AS ")[2])
                        push!(aggregated_columns, aggregated_column)
                    end
                end
            end

            non_aggregated_conditions = String[]
            groupby_columns = split(replace(sqlquery.groupBy, "GROUP BY " => ""), ", ")
            groupby_columns = strip.(groupby_columns)

            for condition in conditions
                condition_str = string(condition, sqlquery)
                condition_str = replace(condition_str, "'\"" => "'", "\"'" => "'", "[" => "(", "]" => ")")
                condition_involves_aggregated_column = any(col -> occursin(Regex("\\b$col\\b"), condition_str), aggregated_columns)
                sqlquery.where = ""

                if !condition_involves_aggregated_column && any(col -> occursin(Regex("\\b$col\\b"), condition_str), groupby_columns)
                    main_query_having = !isempty(sqlquery.having) ? sqlquery.having * " AND " * condition_str : "HAVING " * condition_str
                    sqlquery.having = main_query_having
                    sqlquery.where = ""  # Clearing sqlquery.where to prevent carrying over conditions

                else
                    push!(non_aggregated_conditions, condition_str)
                end
            end

            if !isempty(non_aggregated_conditions)
                combined_conditions = join(non_aggregated_conditions, " AND ")
                cte_name = "cte_" * string(sqlquery.cte_count + 1)
                new_cte = CTE(name=cte_name, select=sqlquery.select, from=(isempty(sqlquery.ctes) ? sqlquery.from : last(sqlquery.ctes).name), groupBy=sqlquery.groupBy, having=sqlquery.having)
                push!(sqlquery.ctes, new_cte)
                sqlquery.select = "*"
                sqlquery.groupBy = ""
                sqlquery.having = ""

                sqlquery.where = "WHERE " * join(non_aggregated_conditions, " AND ")
                sqlquery.from = cte_name
                sqlquery.cte_count += 1
            end
        end
    else
        error("Expected sqlquery to be an instance of SQLQuery")
    end

    return sqlquery
end
drizk1 commented 1 month ago

Current Obstacle/Impasse:

how to partially evaluate an expression like :(Expr(:escape, :conditions) > 4) so that it only replaces conditions with :mpg, but does not evaluate and throw an error. It needs to not evaluate mpg > 4 so that it can become part of the sql query as a string. This is the challenge for @summarize, @mutate, and @filter.

conditions = :mpg #for ex, but could be string etc
macro test2()
    quote  $(Expr(:escape, (:conditions) > 4 )) end
end
@test2
#ERROR: LoadError: MethodError: no method matching isless(::Int64, ::Symbol)

I have tried