Here is a sample dataset with 7.5 million rows:

using DataFrames, Dates, Chain

# some ids have a single date entry
data = DataFrame(
    "unique_id" => [i for i in 1:1500000],
    "datestamp" => [Date("2021-08-16") for i in 1:1500000],
)
# some ids have 2 date entries
data = vcat(data, DataFrame(
    "unique_id" => repeat(1500001:4500000, 2),
    "datestamp" => vcat(repeat([Date("2021-08-16")], 3000000), repeat([Date("2021-08-22")], 3000000)),
))
# a lot of real-valued columns that need to be processed
real_number_columns = [string(c)*"_"*string(i) for c in 'a':'z' for i in 1:4]
for c in real_number_columns
    data[!, c] = rand(7500000)
end
# some dimension
data[!, "dimension"] = rand([string(c) for c in 'A':'Z'],7500000)
size(data)
This will give us a (7500000, 107) dataset.
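As a rough point of reference (a back-of-envelope estimate, not a measurement from the original run), the numeric part of this table only occupies a few gigabytes in memory:

# 104 Float64 columns * 7_500_000 rows * 8 bytes ≈ 6.2 GB
104 * 7_500_000 * 8 / 1e9
# exact in-memory size of the whole table in GB (can take a moment on a table this size)
Base.summarysize(data) / 1e9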
I am looking to process this large dataset as fast as possible; currently it takes around 40 minutes without any parallel processing. The resulting data frame should have (a toy example follows this list):
- unique_id
- the real_number_columns with suffix "_DIFF": the difference between the two dates' values when the unique_id has 2 datestamps, otherwise missing
- the real_number_columns with suffix "_RECENT": just the most recent value, whether the unique_id has 1 or 2 datestamps
- just the most recent dimension
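To make the expected output concrete, here is a toy two-date group with made-up values (purely illustrative; it is not part of the generated dataset):

toy = DataFrame(
    "unique_id" => [1, 1],
    "datestamp" => [Date("2021-08-16"), Date("2021-08-22")],
    "a_1"       => [1.0, 3.5],
    "dimension" => ["A", "B"],
)
# desired output row for this id: a_1_RECENT = 3.5, a_1_DIFF = 3.5 - 1.0 = 2.5, dimension = "B"
# an id with only the 2021-08-16 row would instead get a_1_RECENT = 1.0 and a_1_DIFF = missing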
# get the difference between real_number_columns across dates if 2 dates exist
df = @chain data begin
    transform!(:, :datestamp => ByRow(d -> d == Date("2021-08-22")) => :recent_date)
    groupby(["unique_id"])
    combine(df1 -> begin
        df1_max = nothing
        df1_min = nothing
        try
            d = Dict()
            if nrow(df1) == 1
                d["day_diff"] = [0]
                df1_max = df1[1, :]
                df1_min = df1[1, :]
            else
                d["day_diff"] = [6]
                df1_max = df1[findfirst(df1.recent_date), :]
                df1_min = df1[findfirst(.!df1.recent_date), :]
            end
            d["dimension"] = [df1_max.dimension]
            for m in real_number_columns
                d[m*"_RECENT"] = [df1_max[m]]
                if nrow(df1) > 1
                    d[m*"_DIFF"] = [df1_max[m] - df1_min[m]]
                else
                    d[m*"_DIFF"] = [missing]
                end
            end
            DataFrame(d)
        catch e
            @error "Something went wrong" exception=(e, catch_backtrace())
            rethrow()
        end
    end)
end
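For reference, the result can be sanity-checked against counts that follow from the sample data above (1.5 million single-date ids, 3 million two-date ids, 104 value columns):

size(df)                       # expected (4500000, 211): unique_id, day_diff, dimension, 104 "_RECENT" and 104 "_DIFF" columns
count(ismissing, df.a_1_DIFF)  # expected 1500000 (the single-date ids get missing diffs)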
Questions
- How can one make this processing more efficient?
- Can we trade (memory) efficiency for faster processing time, say on a machine with around 1 TB of RAM?