RajLabMSSM / echolocatoR

Automated statistical and functional fine-mapping pipeline with extensive API access to datasets.
https://rajlabmssm.github.io/echolocatoR
MIT License
34 stars 11 forks source link

Replacing slice_max() method #91

Closed AMCalejandro closed 2 years ago

AMCalejandro commented 2 years ago

Use group_by(), arrange(desc()), and filter() in place of the slice_max() call.

Example usage:

Input Data

'data.frame':   60 obs. of  12 variables:
 $ seqnames        : Factor w/ 1 level "1": 1 1 1 1 1 1 1 1 1 1 ...
 $ start           : int  146492879 146986158 147013182 147101453 146490895 146626685 146630798 146631069 147228332 147229306 ...
 $ end             : int  146596107 146989699 147098017 147131116 146514599 146644123 146634089 146644046 147245484 147232685 ...
 $ width           : int  103229 3542 84836 29664 23705 17439 3292 12978 17153 3380 ...
 $ strand          : Factor w/ 3 levels "+","-","*": 2 2 1 2 2 2 2 2 2 2 ...
 $ tx_id           : chr  "ENST00000444082" "ENST00000437831" "ENST00000234739" "ENST00000609196" ...
 $ tx_biotype      : chr  "unprocessed_pseudogene" "lincRNA" "protein_coding" "protein_coding" ...
 $ tx_cds_seq_start: int  NA NA 147083636 147102845 NA 146631144 NA 146631144 147230270 147230270 ...
 $ tx_cds_seq_end  : int  NA NA 147096760 147131116 NA 146643723 NA 146643644 147231346 147231346 ...
 $ gene_id         : chr  "ENSG00000227242" "ENSG00000234610" "ENSG00000116128" "ENSG00000162836" ...
 $ tx_name         : chr  "ENST00000444082" "ENST00000437831" "ENST00000234739" "ENST00000609196" ...
 $ symbol          : chr  "NBPF13P" "LINC00624" "BCL9" "ACP6" ...

Code

## Keep a single transcript -- the widest one -- for every gene symbol.
data.frame(txdb_transcripts[S4Vectors::subjectHits(hits), ]) %>%
    ## !!IMPORTANT!! unique() must run first: without it, exact duplicate
    ## rows of the same transcript survive and dplyr can pick a duplicate.
    unique() %>%
    ## Discard rows lacking either a gene symbol or a transcript name.
    subset(!(is.na(symbol) | is.na(tx_name))) %>%
    ## Sort widest-first over the whole table; arrange() does not sort
    ## within groups by default, so sorting before grouping is equivalent.
    dplyr::arrange(desc(width)) %>%
    ## After the sort, the first row seen for each symbol is its widest
    ## transcript (ties resolve to whichever row came first in the input).
    group_by(symbol) %>%
    dplyr::filter(row_number() == 1) %>%
    ungroup() %>%
    ## Return a data.table rather than a tibble.
    data.table::data.table()

Output

    seqnames     start       end  width strand           tx_id
 1:        1 146492879 146596107 103229      - ENST00000444082
 2:        1 147013182 147098017  84836      + ENST00000234739
 3:        1 146644765 146724314  79550      + ENST00000606856
 4:        1 146646930 146714700  67771      - ENST00000527849
 5:        1 146714291 146767441  53151      + ENST00000431239
 6:        1 146644951 146683136  38186      + ENST00000607149
 7:        1 147101453 147131116  29664      - ENST00000609196
 8:        1 146310555 146334210  23656      + ENST00000478377
 9:        1 146626685 146644123  17439      - ENST00000254101
10:        1 147228332 147245484  17153      - ENST00000271348
11:        1 147801124 147816764  15641      + ENST00000452996
12:        1 147249700 147261065  11366      + ENST00000426504
13:        1 147374946 147381393   6448      + ENST00000240986
14:        1 147762996 147767966   4971      - ENST00000434245
15:        1 146986158 146989699   3542      - ENST00000437831
16:        1 147767287 147769663   2377      + ENST00000427169
17:        1 146647181 146649319   2139      + ENST00000606152
18:        1 146522379 146524043   1665      + ENST00000435000
19:        1 146674857 146676513   1657      - ENST00000413611
20:        1 147169914 147171461   1548      + ENST00000431525
21:        1 146649692 146651201   1510      + ENST00000471856
22:        1 147313096 147314493   1398      + ENST00000458399
23:        1 146553887 146555127   1241      - ENST00000438469
24:        1 146790828 146791942   1115      + ENST00000435270
25:        1 147229426 147230517   1092      + ENST00000428911
26:        1 146890777 146891610    834      + ENST00000416141
27:        1 146695146 146695940    795      + ENST00000446956
28:        1 146917312 146917866    555      + ENST00000606338
29:        1 146551295 146551419    125      + ENST00000364272
    seqnames     start       end  width strand           tx_id
                            tx_biotype tx_cds_seq_start tx_cds_seq_end
 1:             unprocessed_pseudogene               NA             NA
 2:                     protein_coding        147083636      147096760
 3:               processed_transcript               NA             NA
 4:            nonsense_mediated_decay        146672919      146696621
 5:                     protein_coding        146714354      146767190
 6:                          antisense               NA             NA
 7:                     protein_coding        147102845      147131116
 8: transcribed_unprocessed_pseudogene               NA             NA
 9:                     protein_coding        146631144      146643723
10:                     protein_coding        147230270      147231346
11:                            lincRNA               NA             NA
12:                            lincRNA               NA             NA
13:                     protein_coding        147380083      147381384
14:                            lincRNA               NA             NA
15:                            lincRNA               NA             NA
16:                            lincRNA               NA             NA
17:                          antisense               NA             NA
18:               processed_pseudogene               NA             NA
19:               processed_pseudogene               NA             NA
20:                            lincRNA               NA             NA
21:               processed_pseudogene               NA             NA
22:               processed_pseudogene               NA             NA
23:               processed_pseudogene               NA             NA
24:               processed_pseudogene               NA             NA
25:                          antisense               NA             NA
26:                 unitary_pseudogene               NA             NA
27:               processed_pseudogene               NA             NA
28:             unprocessed_pseudogene               NA             NA
29:                              snRNA               NA             NA
                            tx_biotype tx_cds_seq_start tx_cds_seq_end
            gene_id         tx_name         symbol
 1: ENSG00000227242 ENST00000444082        NBPF13P
 2: ENSG00000116128 ENST00000234739           BCL9
 3: ENSG00000273071 ENST00000606856 RP11-337C18.10
 4: ENSG00000131781 ENST00000527849           FMO5
 5: ENSG00000131778 ENST00000431239          CHD1L
 6: ENSG00000237188 ENST00000607149  RP11-337C18.8
 7: ENSG00000162836 ENST00000609196           ACP6
 8: ENSG00000239475 ENST00000478377         HYDIN2
 9: ENSG00000131791 ENST00000254101         PRKAB2
10: ENSG00000143140 ENST00000271348           GJA5
11: ENSG00000224481 ENST00000452996  RP11-495P10.3
12: ENSG00000234190 ENST00000426504  RP11-433J22.3
13: ENSG00000121634 ENST00000240986           GJA8
14: ENSG00000231196 ENST00000434245  RP11-495P10.8
15: ENSG00000234610 ENST00000437831      LINC00624
16: ENSG00000238107 ENST00000427169  RP11-495P10.5
17: ENSG00000271721 ENST00000606152  RP11-337C18.9
18: ENSG00000225603 ENST00000435000  RP11-325P15.1
19: ENSG00000226015 ENST00000413611         CCT8P1
20: ENSG00000227139 ENST00000431525  RP11-533N14.3
21: ENSG00000180867 ENST00000471856        PDIA3P1
22: ENSG00000223728 ENST00000458399   RP11-314N2.2
23: ENSG00000230832 ENST00000438469  RP11-325P15.2
24: ENSG00000213226 ENST00000435270  RP11-337C18.4
25: ENSG00000234482 ENST00000428911  RP11-433J22.2
26: ENSG00000226653 ENST00000416141        OR13Z1P
27: ENSG00000236806 ENST00000446956  RP11-337C18.7
28: ENSG00000272443 ENST00000606338        OR13Z2P
29: ENSG00000201142 ENST00000364272        RNVU1-8
bschilder commented 2 years ago

Will implement a similar fix in the echoverse branch