r-rudra / tidycells

Automatic transformation of untidy spreadsheet-like data into tidy form
https://r-rudra.github.io/tidycells/
Other
83 stars 10 forks source link

Explore option to unlink with {unpivotr} #19

Open bedantaguru opened 4 years ago

bedantaguru commented 4 years ago

Try to be independent of {unpivotr}

bedantaguru commented 4 years ago

In recent commit on nightly, I have introduced own direction concept. Here is a quick performance test

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(unpivotr)
# Load some pivoted data

setwd("C:/Users/RBI/Documents/tidycells_nightly")
devtools::load_all(".")
#> Loading tidycells
#> 
#> Attaching package: 'testthat'
#> The following object is masked from 'package:dplyr':
#> 
#>     matches

# copied from enhead help: https://nacnudus.github.io/unpivotr/reference/enhead.html
(x <- purpose$`up-left left-up`)
#>                            X2      X3     X4     X5    X6     X7
#> 1                        <NA>    <NA> Female   <NA>  Male   <NA>
#> 2                        <NA>    <NA>  0 - 6 7 - 10 0 - 6 7 - 10
#> 3           Bachelor's degree 15 - 24   7000  27000  <NA>  13000
#> 4                        <NA> 25 - 44  12000 137000  9000  81000
#> 5                        <NA> 45 - 64  10000  64000  7000  66000
#> 6                        <NA>     65+   <NA>  18000  7000  17000
#> 7                 Certificate 15 - 24  29000 161000 30000 190000
#> 8                        <NA> 25 - 44  34000 179000 31000 219000
#> 9                        <NA> 45 - 64  30000 210000 23000 199000
#> 10                       <NA>     65+  12000  77000  8000 107000
#> 11                    Diploma 15 - 24   <NA>  14000  9000  11000
#> 12                       <NA> 25 - 44  10000  66000  8000  47000
#> 13                       <NA> 45 - 64   6000  68000  5000  58000
#> 14                       <NA>     65+   5000  41000  1000  34000
#> 15           No Qualification 15 - 24  10000  43000 12000  37000
#> 16                       <NA> 25 - 44  11000  36000 21000  50000
#> 17                       <NA> 45 - 64  19000  91000 17000  75000
#> 18                       <NA>     65+  16000 118000  9000  66000
#> 19 Postgraduate qualification 15 - 24   <NA>   6000  <NA>   <NA>
#> 20                       <NA> 25 - 44   5000  86000  7000  60000
#> 21                       <NA> 45 - 64   6000  55000  6000  68000
#> 22                       <NA>     65+   <NA>  13000  <NA>  18000
# Make a tidy representation
cells <- as_cells(x)
cells <- cells[!is.na(cells$chr), ]
head(cells)
#> # A tibble: 6 x 4
#>     row   col data_type chr                       
#>   <int> <int> <chr>     <chr>                     
#> 1     3     1 chr       Bachelor's degree         
#> 2     7     1 chr       Certificate               
#> 3    11     1 chr       Diploma                   
#> 4    15     1 chr       No Qualification          
#> 5    19     1 chr       Postgraduate qualification
#> 6     3     2 chr       15 - 24
# Select the cells containing the values
data_cells <-
  filter(cells, row >= 3, col >= 3) %>%
  transmute(row, col, count = as.integer(chr))
head(data_cells)
#> # A tibble: 6 x 3
#>     row   col count
#>   <int> <int> <int>
#> 1     3     3  7000
#> 2     4     3 12000
#> 3     5     3 10000
#> 4     7     3 29000
#> 5     8     3 34000
#> 6     9     3 30000
# Select the headers
qualification <-
  filter(cells, col == 1) %>%
  select(row, col, qualification = chr)
age <-
  filter(cells, col == 2) %>%
  select(row, col, age = chr)
gender <-
  filter(cells, row == 1) %>%
  select(row, col, gender = chr)
satisfaction <-
  filter(cells, row == 2) %>%
  select(row, col, satisfaction = chr)
# From each data cell, search for the nearest one of each of the headers
data_cells %>%
  enhead(gender, "up-left") %>%
  enhead(satisfaction, "up") %>%
  enhead(qualification, "left-up") %>%
  enhead(age, "left") %>%
  select(-row, -col) ->du

data_cells %>%
  attach_header(gender, "vl") %>%
  attach_header(satisfaction, "v") %>%
  attach_header(qualification, "hu") %>%
  attach_header(age, "h") %>%
  select(-row, -col) ->dt

# own testing function
df_equal(dt, du)
#> [1] TRUE

microbenchmark::microbenchmark(
  data_cells %>%
    enhead(gender, "up-left") %>%
    enhead(satisfaction, "up") %>%
    enhead(qualification, "left-up") %>%
    enhead(age, "left") %>%
    select(-row, -col),
  data_cells %>%
    attach_header(gender, "vl") %>%
    attach_header(satisfaction, "v") %>%
    attach_header(qualification, "hu") %>%
    attach_header(age, "h") %>%
    select(-row, -col)
)
#> Unit: milliseconds
#>                                                                                                                                                                                 expr
#>                data_cells %>% enhead(gender, "up-left") %>% enhead(satisfaction,      "up") %>% enhead(qualification, "left-up") %>% enhead(age,      "left") %>% select(-row, -col)
#>  data_cells %>% attach_header(gender, "vl") %>% attach_header(satisfaction,      "v") %>% attach_header(qualification, "hu") %>% attach_header(age,      "h") %>% select(-row, -col)
#>        min        lq      mean   median        uq      max neval
#>  299.94706 331.06641 363.23814 364.6221 385.96335 517.5638   100
#>   37.02548  43.73709  50.67413  48.1013  55.06245 119.7450   100

Created on 2020-04-01 by the reprex package (v0.3.0)

bedantaguru commented 4 years ago

It looks like it can be easily made optional. Since this is a new deployment I'll keep the option to both {tidycells} and {unpivotr}. This should be configured through options.

Also, as_cell_df now can be made independent. Check this and also refer to https://github.com/r-rudra/tidycells/issues/26