Closed AMCalejandro closed 2 years ago
Use group_by(), arrange(desc()) and filter() instead of slice_max() to select the widest transcript per gene symbol
Example usage:
Input Data
'data.frame': 60 obs. of 12 variables:
 $ seqnames        : Factor w/ 1 level "1": 1 1 1 1 1 1 1 1 1 1 ...
 $ start           : int 146492879 146986158 147013182 147101453 146490895 146626685 146630798 146631069 147228332 147229306 ...
 $ end             : int 146596107 146989699 147098017 147131116 146514599 146644123 146634089 146644046 147245484 147232685 ...
 $ width           : int 103229 3542 84836 29664 23705 17439 3292 12978 17153 3380 ...
 $ strand          : Factor w/ 3 levels "+","-","*": 2 2 1 2 2 2 2 2 2 2 ...
 $ tx_id           : chr "ENST00000444082" "ENST00000437831" "ENST00000234739" "ENST00000609196" ...
 $ tx_biotype      : chr "unprocessed_pseudogene" "lincRNA" "protein_coding" "protein_coding" ...
 $ tx_cds_seq_start: int NA NA 147083636 147102845 NA 146631144 NA 146631144 147230270 147230270 ...
 $ tx_cds_seq_end  : int NA NA 147096760 147131116 NA 146643723 NA 146643644 147231346 147231346 ...
 $ gene_id         : chr "ENSG00000227242" "ENSG00000234610" "ENSG00000116128" "ENSG00000162836" ...
 $ tx_name         : chr "ENST00000444082" "ENST00000437831" "ENST00000234739" "ENST00000609196" ...
 $ symbol          : chr "NBPF13P" "LINC00624" "BCL9" "ACP6" ...
Code
# Select a single transcript per gene symbol -- the widest one -- from the
# rows of `txdb_transcripts` indexed by the subject hits of `hits`, and
# return the result as a data.table.
data.frame(txdb_transcripts[S4Vectors::subjectHits(hits), ]) %>%
  ## !!IMPORTANT!! If unique() isn't applied here,
  ## dplyr will pick duplicates of the same transcript
  unique() %>%
  # Drop rows lacking a gene symbol or a transcript name.
  subset((!is.na(symbol)) & (!is.na(tx_name))) %>%
  dplyr::group_by(symbol) %>%
  # arrange() ignores the grouping by default, so rows are ordered by
  # descending width across all symbols.
  dplyr::arrange(dplyr::desc(width)) %>%
  # row_number() is evaluated within each symbol group, so this keeps exactly
  # one row per symbol (the widest), even when widths are tied.
  dplyr::filter(dplyr::row_number() == 1) %>%
  dplyr::ungroup() %>%
  data.table::data.table()
Output
seqnames start end width strand tx_id 1: 1 146492879 146596107 103229 - ENST00000444082 2: 1 147013182 147098017 84836 + ENST00000234739 3: 1 146644765 146724314 79550 + ENST00000606856 4: 1 146646930 146714700 67771 - ENST00000527849 5: 1 146714291 146767441 53151 + ENST00000431239 6: 1 146644951 146683136 38186 + ENST00000607149 7: 1 147101453 147131116 29664 - ENST00000609196 8: 1 146310555 146334210 23656 + ENST00000478377 9: 1 146626685 146644123 17439 - ENST00000254101 10: 1 147228332 147245484 17153 - ENST00000271348 11: 1 147801124 147816764 15641 + ENST00000452996 12: 1 147249700 147261065 11366 + ENST00000426504 13: 1 147374946 147381393 6448 + ENST00000240986 14: 1 147762996 147767966 4971 - ENST00000434245 15: 1 146986158 146989699 3542 - ENST00000437831 16: 1 147767287 147769663 2377 + ENST00000427169 17: 1 146647181 146649319 2139 + ENST00000606152 18: 1 146522379 146524043 1665 + ENST00000435000 19: 1 146674857 146676513 1657 - ENST00000413611 20: 1 147169914 147171461 1548 + ENST00000431525 21: 1 146649692 146651201 1510 + ENST00000471856 22: 1 147313096 147314493 1398 + ENST00000458399 23: 1 146553887 146555127 1241 - ENST00000438469 24: 1 146790828 146791942 1115 + ENST00000435270 25: 1 147229426 147230517 1092 + ENST00000428911 26: 1 146890777 146891610 834 + ENST00000416141 27: 1 146695146 146695940 795 + ENST00000446956 28: 1 146917312 146917866 555 + ENST00000606338 29: 1 146551295 146551419 125 + ENST00000364272 seqnames start end width strand tx_id tx_biotype tx_cds_seq_start tx_cds_seq_end 1: unprocessed_pseudogene NA NA 2: protein_coding 147083636 147096760 3: processed_transcript NA NA 4: nonsense_mediated_decay 146672919 146696621 5: protein_coding 146714354 146767190 6: antisense NA NA 7: protein_coding 147102845 147131116 8: transcribed_unprocessed_pseudogene NA NA 9: protein_coding 146631144 146643723 10: protein_coding 147230270 147231346 11: lincRNA NA NA 12: lincRNA NA NA 13: protein_coding 147380083 147381384 14: lincRNA NA NA 15: 
lincRNA NA NA 16: lincRNA NA NA 17: antisense NA NA 18: processed_pseudogene NA NA 19: processed_pseudogene NA NA 20: lincRNA NA NA 21: processed_pseudogene NA NA 22: processed_pseudogene NA NA 23: processed_pseudogene NA NA 24: processed_pseudogene NA NA 25: antisense NA NA 26: unitary_pseudogene NA NA 27: processed_pseudogene NA NA 28: unprocessed_pseudogene NA NA 29: snRNA NA NA tx_biotype tx_cds_seq_start tx_cds_seq_end gene_id tx_name symbol 1: ENSG00000227242 ENST00000444082 NBPF13P 2: ENSG00000116128 ENST00000234739 BCL9 3: ENSG00000273071 ENST00000606856 RP11-337C18.10 4: ENSG00000131781 ENST00000527849 FMO5 5: ENSG00000131778 ENST00000431239 CHD1L 6: ENSG00000237188 ENST00000607149 RP11-337C18.8 7: ENSG00000162836 ENST00000609196 ACP6 8: ENSG00000239475 ENST00000478377 HYDIN2 9: ENSG00000131791 ENST00000254101 PRKAB2 10: ENSG00000143140 ENST00000271348 GJA5 11: ENSG00000224481 ENST00000452996 RP11-495P10.3 12: ENSG00000234190 ENST00000426504 RP11-433J22.3 13: ENSG00000121634 ENST00000240986 GJA8 14: ENSG00000231196 ENST00000434245 RP11-495P10.8 15: ENSG00000234610 ENST00000437831 LINC00624 16: ENSG00000238107 ENST00000427169 RP11-495P10.5 17: ENSG00000271721 ENST00000606152 RP11-337C18.9 18: ENSG00000225603 ENST00000435000 RP11-325P15.1 19: ENSG00000226015 ENST00000413611 CCT8P1 20: ENSG00000227139 ENST00000431525 RP11-533N14.3 21: ENSG00000180867 ENST00000471856 PDIA3P1 22: ENSG00000223728 ENST00000458399 RP11-314N2.2 23: ENSG00000230832 ENST00000438469 RP11-325P15.2 24: ENSG00000213226 ENST00000435270 RP11-337C18.4 25: ENSG00000234482 ENST00000428911 RP11-433J22.2 26: ENSG00000226653 ENST00000416141 OR13Z1P 27: ENSG00000236806 ENST00000446956 RP11-337C18.7 28: ENSG00000272443 ENST00000606338 OR13Z2P 29: ENSG00000201142 ENST00000364272 RNVU1-8
Will implement a similar fix in the echoverse branch
echoverse
Use group_by(), arrange(desc()) and filter() instead of slice_max() to select the widest transcript per gene symbol
Example usage:
Input Data
Code
Output