Open bedantaguru opened 4 years ago
In recent commit on nightly, I have introduced own direction concept. Here is a quick performance test
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(unpivotr)
# Load some pivoted data
setwd("C:/Users/RBI/Documents/tidycells_nightly")
devtools::load_all(".")
#> Loading tidycells
#>
#> Attaching package: 'testthat'
#> The following object is masked from 'package:dplyr':
#>
#> matches
# copied from enhead help: https://nacnudus.github.io/unpivotr/reference/enhead.html
(x <- purpose$`up-left left-up`)
#> X2 X3 X4 X5 X6 X7
#> 1 <NA> <NA> Female <NA> Male <NA>
#> 2 <NA> <NA> 0 - 6 7 - 10 0 - 6 7 - 10
#> 3 Bachelor's degree 15 - 24 7000 27000 <NA> 13000
#> 4 <NA> 25 - 44 12000 137000 9000 81000
#> 5 <NA> 45 - 64 10000 64000 7000 66000
#> 6 <NA> 65+ <NA> 18000 7000 17000
#> 7 Certificate 15 - 24 29000 161000 30000 190000
#> 8 <NA> 25 - 44 34000 179000 31000 219000
#> 9 <NA> 45 - 64 30000 210000 23000 199000
#> 10 <NA> 65+ 12000 77000 8000 107000
#> 11 Diploma 15 - 24 <NA> 14000 9000 11000
#> 12 <NA> 25 - 44 10000 66000 8000 47000
#> 13 <NA> 45 - 64 6000 68000 5000 58000
#> 14 <NA> 65+ 5000 41000 1000 34000
#> 15 No Qualification 15 - 24 10000 43000 12000 37000
#> 16 <NA> 25 - 44 11000 36000 21000 50000
#> 17 <NA> 45 - 64 19000 91000 17000 75000
#> 18 <NA> 65+ 16000 118000 9000 66000
#> 19 Postgraduate qualification 15 - 24 <NA> 6000 <NA> <NA>
#> 20 <NA> 25 - 44 5000 86000 7000 60000
#> 21 <NA> 45 - 64 6000 55000 6000 68000
#> 22 <NA> 65+ <NA> 13000 <NA> 18000
# Make a tidy representation
cells <- as_cells(x)
cells <- cells[!is.na(cells$chr), ]
head(cells)
#> # A tibble: 6 x 4
#> row col data_type chr
#> <int> <int> <chr> <chr>
#> 1 3 1 chr Bachelor's degree
#> 2 7 1 chr Certificate
#> 3 11 1 chr Diploma
#> 4 15 1 chr No Qualification
#> 5 19 1 chr Postgraduate qualification
#> 6 3 2 chr 15 - 24
# Select the cells containing the values
data_cells <-
filter(cells, row >= 3, col >= 3) %>%
transmute(row, col, count = as.integer(chr))
head(data_cells)
#> # A tibble: 6 x 3
#> row col count
#> <int> <int> <int>
#> 1 3 3 7000
#> 2 4 3 12000
#> 3 5 3 10000
#> 4 7 3 29000
#> 5 8 3 34000
#> 6 9 3 30000
# Select the headers
qualification <-
filter(cells, col == 1) %>%
select(row, col, qualification = chr)
age <-
filter(cells, col == 2) %>%
select(row, col, age = chr)
gender <-
filter(cells, row == 1) %>%
select(row, col, gender = chr)
satisfaction <-
filter(cells, row == 2) %>%
select(row, col, satisfaction = chr)
# From each data cell, search for the nearest one of each of the headers
data_cells %>%
enhead(gender, "up-left") %>%
enhead(satisfaction, "up") %>%
enhead(qualification, "left-up") %>%
enhead(age, "left") %>%
select(-row, -col) ->du
data_cells %>%
attach_header(gender, "vl") %>%
attach_header(satisfaction, "v") %>%
attach_header(qualification, "hu") %>%
attach_header(age, "h") %>%
select(-row, -col) ->dt
# own testing function
df_equal(dt, du)
#> [1] TRUE
microbenchmark::microbenchmark(
data_cells %>%
enhead(gender, "up-left") %>%
enhead(satisfaction, "up") %>%
enhead(qualification, "left-up") %>%
enhead(age, "left") %>%
select(-row, -col),
data_cells %>%
attach_header(gender, "vl") %>%
attach_header(satisfaction, "v") %>%
attach_header(qualification, "hu") %>%
attach_header(age, "h") %>%
select(-row, -col)
)
#> Unit: milliseconds
#> expr
#> data_cells %>% enhead(gender, "up-left") %>% enhead(satisfaction, "up") %>% enhead(qualification, "left-up") %>% enhead(age, "left") %>% select(-row, -col)
#> data_cells %>% attach_header(gender, "vl") %>% attach_header(satisfaction, "v") %>% attach_header(qualification, "hu") %>% attach_header(age, "h") %>% select(-row, -col)
#> min lq mean median uq max neval
#> 299.94706 331.06641 363.23814 364.6221 385.96335 517.5638 100
#> 37.02548 43.73709 50.67413 48.1013 55.06245 119.7450 100
Created on 2020-04-01 by the reprex package (v0.3.0)
It looks like it can be easily made optional. Since this is a new deployment I'll keep the option to both {tidycells} and {unpivotr}. This should be configured through options.
Also, as_cell_df
now can be made independent. Check this and also refer to https://github.com/r-rudra/tidycells/issues/26
Try to be independent of {unpivotr}