morinlab / GAMBLR

Set of standardized functions to operate with genomic data
https://morinlab.github.io/GAMBLR/
MIT License
3 stars 2 forks source link

Fix column names of get_cn_states output #214

Closed vladimirsouza closed 1 year ago

vladimirsouza commented 1 year ago

This is a very simple fix. Please check the file change.

I think the purpose of the last lines of get_cn_states would be to sort the columns of the output, but actually they just incorrectly change the column names (gene IDs). Below I show some examples to replicate this error.

The output of code 1 and code 2 should be the same:

these_sample_ids <- c(
  "BLGSP-71-06-00160-01A-03D",
  "BLGSP-71-06-00252-01A-01D",
  "BLGSP-71-19-00122-09A.1-01D",
  "BLGSP-71-19-00523-09A-01D",
  "BLGSP-71-21-00187-01A-01E",
  "BLGSP-71-21-00188-01A-04E"
)
my_meta <- get_gambl_metadata() %>%
  filter(sample_id %in% these_sample_ids)

### code 1: get cn states from all genes, then subset "BCL2"
cn_state_all_genes = get_cn_states(
  regions_bed = grch37_lymphoma_genes_bed,
  these_samples_metadata = my_meta,
  region_names = grch37_lymphoma_genes_bed$hgnc_symbol
)
select(cn_state_all_genes, BCL2)
#                             BCL2
# BLGSP-71-06-00160-01A-03D      2
# BLGSP-71-06-00252-01A-01D      4
# BLGSP-71-19-00122-09A.1-01D    4
# BLGSP-71-19-00523-09A-01D      4
# BLGSP-71-21-00187-01A-01E      1
# BLGSP-71-21-00188-01A-04E      4

### code 2: get cn states directly only from "BCL2"
cn_state_all_genes1 = get_cn_states(
  regions_bed = filter(grch37_lymphoma_genes_bed, hgnc_symbol == "BCL2"),
  these_samples_metadata = my_meta,
  region_names = "BCL2"
)
cn_state_all_genes1
#                             BCL2
# BLGSP-71-06-00160-01A-03D      2
# BLGSP-71-06-00252-01A-01D      2
# BLGSP-71-19-00122-09A.1-01D    2
# BLGSP-71-19-00523-09A-01D      2
# BLGSP-71-21-00187-01A-01E      3
# BLGSP-71-21-00188-01A-04E      4

In the next example, I just inverted the row order of the regions_bed argument input (regions_bed1 and regions_bed2 dataframes). As you can see, the elements of the output dataframe are the same, while only the column names are inverted. Both get_cn_states calls should return the same output.

(regions_bed1 <- grch37_lymphoma_genes_bed[c(81, 180),])
# # A tibble: 2 × 4
#   chromosome_name start_position end_position hgnc_symbol
#   <chr>                    <dbl>        <dbl> <chr>      
# 1 18                    60790579     60987361 BCL2       
# 2 8                    128747680    128753674 MYC        

(regions_bed2 <- grch37_lymphoma_genes_bed[c(180, 81),])
# # A tibble: 2 × 4
#   chromosome_name start_position end_position hgnc_symbol
#   <chr>                    <dbl>        <dbl> <chr>      
# 1 8                    128747680    128753674 MYC        
# 2 18                    60790579     60987361 BCL2       

get_cn_states(regions_bed = regions_bed1,
              region_names = regions_bed1$hgnc_symbol,
              these_samples_metadata = my_meta)
#                             BCL2 MYC                                                                                                   
# BLGSP-71-06-00160-01A-03D      2   3
# BLGSP-71-06-00252-01A-01D      2   4
# BLGSP-71-19-00122-09A.1-01D    2   4
# BLGSP-71-19-00523-09A-01D      2   4
# BLGSP-71-21-00187-01A-01E      3   5
# BLGSP-71-21-00188-01A-04E      4   6

get_cn_states(regions_bed = regions_bed2,
              region_names = regions_bed2$hgnc_symbol,
              these_samples_metadata = my_meta)
#                             MYC BCL2                                                                                                   
# BLGSP-71-06-00160-01A-03D     2    3
# BLGSP-71-06-00252-01A-01D     2    4
# BLGSP-71-19-00122-09A.1-01D   2    4
# BLGSP-71-19-00523-09A-01D     2    4
# BLGSP-71-21-00187-01A-01E     3    5
# BLGSP-71-21-00188-01A-04E     4    6

Pull Request Checklists

Checklist for all PRs

Required

The same output here:

### code 1: get cn states from all genes, then subset "BCL2"
cn_state_all_genes = get_cn_states(
    regions_bed = grch37_lymphoma_genes_bed,
    these_samples_metadata = my_meta,
    region_names = grch37_lymphoma_genes_bed$hgnc_symbol
)
select(cn_state_all_genes, BCL2)                                                                                                     
#                             BCL2
# BLGSP-71-06-00160-01A-03D      2
# BLGSP-71-06-00252-01A-01D      2
# BLGSP-71-19-00122-09A.1-01D    2
# BLGSP-71-19-00523-09A-01D      2
# BLGSP-71-21-00187-01A-01E      3
# BLGSP-71-21-00188-01A-04E      4

#### code 2: get cn states directly only from "BCL2"
cn_state_all_genes1 = get_cn_states(
    regions_bed = filter(grch37_lymphoma_genes_bed, hgnc_symbol == "BCL2"),
    these_samples_metadata = my_meta,
    region_names = "BCL2"
)
cn_state_all_genes1
#                             BCL2
# BLGSP-71-06-00160-01A-03D      2
# BLGSP-71-06-00252-01A-01D      2
# BLGSP-71-19-00122-09A.1-01D    2
# BLGSP-71-19-00523-09A-01D      2
# BLGSP-71-21-00187-01A-01E      3
# BLGSP-71-21-00188-01A-04E      4

And the same output here:

get_cn_states(regions_bed = regions_bed1,
              region_names = regions_bed1$hgnc_symbol,
              these_samples_metadata = my_meta)
#                             BCL2 MYC                                                                                                   
# BLGSP-71-06-00160-01A-03D      2   3
# BLGSP-71-06-00252-01A-01D      2   4
# BLGSP-71-19-00122-09A.1-01D    2   4
# BLGSP-71-19-00523-09A-01D      2   4
# BLGSP-71-21-00187-01A-01E      3   5
# BLGSP-71-21-00188-01A-04E      4   6

get_cn_states(regions_bed = regions_bed2,
              region_names = regions_bed2$hgnc_symbol,
              these_samples_metadata = my_meta)
#                             MYC BCL2                                                                                                   
# BLGSP-71-06-00160-01A-03D     3    2
# BLGSP-71-06-00252-01A-01D     4    2
# BLGSP-71-19-00122-09A.1-01D   4    2
# BLGSP-71-19-00523-09A-01D     4    2
# BLGSP-71-21-00187-01A-01E     5    3
# BLGSP-71-21-00188-01A-04E     6    4