TidierOrg / TidierData.jl

Tidier data transformations in Julia, modeled after the dplyr/tidyr R packages.
MIT License
86 stars 7 forks source link

error when one case has multiple conditions and column has missing #118

Closed drizk1 closed 1 month ago

drizk1 commented 1 month ago

This might be how case_when in df.jl works but I am experiencing the following error

df = DataFrame(test = [1, 4, 3, missing, 4, 5, 6, 7]);

when a compound condition is used and there are missing values the following error is returned. Wrapping the case_when in if_else also works

@chain df begin
     #@filter(!ismissing(test))
     @mutate(test2 = case_when(
         ismissing(test) => test , 
         test > 1 && test < 4 => "ok" , 
         test ==7 => "seven",
         true => "true"
     ))
 end
ERROR: TypeError: non-boolean (Missing) used in boolean context
Stacktrace:
  [1] (::Base.Broadcast.var"#3#4"{Base.Broadcast.Broadcasted{…}})(::Missing, ::Missing, ::Vararg{Any})
    @ Base.Broadcast ./broadcast.jl:199
  [2] _broadcast_getindex_evalf
    @ ./broadcast.jl:709 [inlined]
  [3] _broadcast_getindex
### Full error is posted below.

@chain df begin
    @mutate(test2 = if_else(ismissing(test), test, case_when(
        ismissing(test) => test, 
        (test > 1) & (test < 4) => "ok", 
        test == 7 => "seven",
        true => "true"
    )))
end

7×2 DataFrame
 Row │ test     test2   
     │ Int64?   String? 
─────┼──────────────────
   1 │       1  true
   2 │       2  ok
   3 │       3  ok
   4 │ missing  missing 
   5 │       5  true
   6 │       6  true
   7 │       7  seven

When you filter out the missing values it runs or when you remove the compound condition case

@chain df begin
     @filter(!ismissing(test))
     @mutate(test2 = case_when(
         ismissing(test) => test , 
         test > 1 && test < 4 => "ok" , 
         test ==7 => "seven",
         true => "true"
     ))
 end
7×2 DataFrame
 Row │ test     test2   
     │ Int64?   String  
─────┼──────────────────
   1 │       1  true
   2 │       2  ok
   3 │       3  ok
   4 │ missing  MISSING
   5 │       5  ok
   6 │       6  ok
   7 │       7  ok

 @chain df begin
    #@filter(!ismissing(test))
    @mutate(test2 = case_when(
        ismissing(test) => test , 
      #  test > 1 && test < 4 => "ok" , 
        test ==7 => "seven",
        true => "true"
    ))
end

7×2 DataFrame
 Row │ test     test2   
     │ Int64?   String? 
─────┼──────────────────
   1 │       1  true
   2 │       2  true
   3 │       3  true
   4 │ missing  missing 
   5 │       5  true
   6 │       6  true
   7 │       7  seven
ERROR: TypeError: non-boolean (Missing) used in boolean context
Stacktrace:
  [1] (::Base.Broadcast.var"#3#4"{Base.Broadcast.Broadcasted{…}})(::Missing, ::Missing, ::Vararg{Any})
    @ Base.Broadcast ./broadcast.jl:199
  [2] _broadcast_getindex_evalf
    @ ./broadcast.jl:709 [inlined]
  [3] _broadcast_getindex
    @ ./broadcast.jl:682 [inlined]
  [4] _getindex
    @ ./broadcast.jl:705 [inlined]
  [5] _broadcast_getindex
    @ ./broadcast.jl:681 [inlined]
  [6] _getindex (repeats 2 times)
    @ ./broadcast.jl:705 [inlined]
  [7] _broadcast_getindex
    @ ./broadcast.jl:681 [inlined]
  [8] getindex
    @ ./broadcast.jl:636 [inlined]
  [9] copyto_nonleaf!(dest::Vector{…}, bc::Base.Broadcast.Broadcasted{…}, iter::Base.OneTo{…}, state::Int64, count::Int64)
    @ Base.Broadcast ./broadcast.jl:1098
 [10] copy
    @ ./broadcast.jl:950 [inlined]
 [11] materialize
    @ ./broadcast.jl:903 [inlined]
 [12] (::var"#1051#1053")(test::Vector{Union{Missing, Int64}})
    @ Main ~/.julia/packages/TidierData/C04bk/src/parsing.jl:162
 [13] _transformation_helper(df::DataFrame, col_idx::Int64, ::Base.RefValue{Any})
    @ DataFrames ~/.julia/packages/DataFrames/58MUJ/src/abstractdataframe/selection.jl:562
 [14] select_transform!(::Base.RefValue{…}, df::DataFrame, newdf::DataFrame, transformed_cols::Set{…}, copycols::Bool, allow_resizing_newdf::Base.RefValue{…}, column_to_copy::BitVector)
    @ DataFrames ~/.julia/packages/DataFrames/58MUJ/src/abstractdataframe/selection.jl:805
 [15] _manipulate(df::DataFrame, normalized_cs::Vector{Any}, copycols::Bool, keeprows::Bool)
    @ DataFrames ~/.julia/packages/DataFrames/58MUJ/src/abstractdataframe/selection.jl:1778
 [16] manipulate(::DataFrame, ::Any, ::Vararg{Any}; copycols::Bool, keeprows::Bool, renamecols::Bool)
    @ DataFrames ~/.julia/packages/DataFrames/58MUJ/src/abstractdataframe/selection.jl:1698
 [17] select(::DataFrame, ::Any, ::Vararg{Any}; copycols::Bool, renamecols::Bool, threads::Bool)
    @ DataFrames ~/.julia/packages/DataFrames/58MUJ/src/abstractdataframe/selection.jl:1298
 [18] select
    @ ~/.julia/packages/DataFrames/58MUJ/src/abstractdataframe/selection.jl:1298 [inlined]
 [19] transform(df::DataFrame, args::Any)
    @ DataFrames ~/.julia/packages/DataFrames/58MUJ/src/abstractdataframe/selection.jl:1378
 [20] macro expansion
    @ ~/.julia/packages/TidierData/C04bk/src/TidierData.jl:293 [inlined]
 [21] top-level scope
    @ ~/jl_stuff/chp9.jl:339
Some type information was truncated. Use `show(err)` to see complete types.
drizk1 commented 1 month ago

ah yes. double vs single ampersans and wrapping the conditions in ()

@chain df begin
    @mutate(test2 =  case_when(
        ismissing(test) => test, 
        (test > 1) & (test < 4) => "ok", 
        test == 7 => "seven",
        true => "true"
    ))
end
7×2 DataFrame
 Row │ test     test2   
     │ Int64?   String? 
─────┼──────────────────
   1 │       1  true
   2 │       2  ok
   3 │       3  ok
   4 │ missing  missing 
   5 │       5  true
   6 │       6  true
   7 │       7  seven

we might consider putting an example in the docstring and docs on how to do this