addCellMetadata performs an unexpected sort

Describe the Error

For some reason, with my CosMx data the addCellMetadata command triggers a sort of the cell rows in the giotto object (it doesn't do this with the starmap mini, vizium mini, or cosmx mini datasets).

It looks like under the hood it is joined via a data.table merge, which has the default setting sort=T, but usually follows the ordering in by.x cell_ID if all.x=TRUE.

The reordering appears to be okay, as the giotto object also shifts the ordering of the column names in the expression matrix to match, but if someone is adding successive cell metadata without a cell_ID column as in method 1, the row ordering will now be different.

I can share the giotto object link to the compressed folder, but it is over 100MB with just two fovs.

...

To Reproduce

library(Giotto)

fov_join = loadGiotto(path_to_folder = "s24_giotto_obj")

fovs = list("PS19-V"=c(11, 12))
labs = data.table(stack(fovs))
labs$values = as.character(labs$values)
labs

# get the cell metadata for this feature type
meta = getCellMetadata(fov_join, output="data.table")
meta[, fov := gsub("fov0", "", list_ID)]
print("before merging...")
meta
             cell_ID list_ID fov
  1:   fov011-cell_1  fov011  11
  2:   fov011-cell_2  fov011  11
  3:   fov011-cell_3  fov011  11
  4:   fov011-cell_4  fov011  11
  5:   fov011-cell_5  fov011  11
 ---
626: fov012-cell_293  fov012  12
627: fov012-cell_294  fov012  12
628: fov012-cell_295  fov012  12
629: fov012-cell_296  fov012  12
630: fov012-cell_297  fov012  12

meta = merge(meta, labs, by.x="fov", by.y="values", all.x=T, sort=F)
setnames(meta, old="ind", new="group")
meta[, c("genotype", "treatment") := tstrsplit(group, "-")]
setcolorder(meta, c("cell_ID", names(meta)[names(meta) != "cell_ID"]))
print("after merging...")
meta
             cell_ID fov list_ID  group genotype treatment
  1:   fov011-cell_1  11  fov011 PS19-V     PS19         V
  2:   fov011-cell_2  11  fov011 PS19-V     PS19         V
  3:   fov011-cell_3  11  fov011 PS19-V     PS19         V
  4:   fov011-cell_4  11  fov011 PS19-V     PS19         V
  5:   fov011-cell_5  11  fov011 PS19-V     PS19         V
 ---
626: fov012-cell_293  12  fov012 PS19-V     PS19         V
627: fov012-cell_294  12  fov012 PS19-V     PS19         V
628: fov012-cell_295  12  fov012 PS19-V     PS19         V
629: fov012-cell_296  12  fov012 PS19-V     PS19         V
630: fov012-cell_297  12  fov012 PS19-V     PS19         V

# add it back in
fov_join2 = addCellMetadata(gobject=fov_join, new_metadata=meta, by_column=T, column_cell_ID="cell_ID")
> pDataDT(fov_join2)
             cell_ID fov list_ID  group genotype treatment
  1:   fov011-cell_1  11  fov011 PS19-V     PS19         V
  2:  fov011-cell_10  11  fov011 PS19-V     PS19         V
  3: fov011-cell_100  11  fov011 PS19-V     PS19         V
  4: fov011-cell_101  11  fov011 PS19-V     PS19         V
  5: fov011-cell_102  11  fov011 PS19-V     PS19         V
 ---
626:  fov012-cell_95  12  fov012 PS19-V     PS19         V
627:  fov012-cell_96  12  fov012 PS19-V     PS19         V
628:  fov012-cell_97  12  fov012 PS19-V     PS19         V
629:  fov012-cell_98  12  fov012 PS19-V     PS19         V
630:  fov012-cell_99  12  fov012 PS19-V     PS19         V

Expected behavior

Rows in the same order as with the cosmx mini dataset

library(Giotto)
library(GiottoData)

star = loadGiottoMini(dataset="cosmx")
meta = getCellMetadata(star, output="data.table")
meta
            cell_ID list_ID
  1:  fov002-cell_1  fov002
  2:  fov002-cell_2  fov002
  3:  fov002-cell_3  fov002
  4:  fov002-cell_4  fov002
  5:  fov002-cell_5  fov002
 ---
115: fov003-cell_48  fov003
116: fov003-cell_49  fov003
117: fov003-cell_50  fov003
118: fov003-cell_51  fov003
119: fov003-cell_52  fov003

meta[, fov := gsub("fov0", "", list_ID)]
fovs = list("PS19-V"=c("02", "03"))
labs = data.table(stack(fovs))
labs$values = as.character(labs$values)
labs
   values    ind
1:     02 PS19-V
2:     03 PS19-V

meta = getCellMetadata(star, output="data.table")
meta[, fov := gsub("fov0", "", list_ID)]
meta = merge(meta, labs, by.x="fov", by.y="values", all.x=T, sort=F)
setnames(meta, old="ind", new="group")
meta[, c("genotype", "treatment") := tstrsplit(group, "-")]
setcolorder(meta, c("cell_ID", names(meta)[names(meta) != "cell_ID"]))
meta
            cell_ID fov list_ID  group genotype treatment
  1:  fov002-cell_1  02  fov002 PS19-V     PS19         V
  2:  fov002-cell_2  02  fov002 PS19-V     PS19         V
  3:  fov002-cell_3  02  fov002 PS19-V     PS19         V
  4:  fov002-cell_4  02  fov002 PS19-V     PS19         V
  5:  fov002-cell_5  02  fov002 PS19-V     PS19         V
 ---
115: fov003-cell_48  03  fov003 PS19-V     PS19         V
116: fov003-cell_49  03  fov003 PS19-V     PS19         V
117: fov003-cell_50  03  fov003 PS19-V     PS19         V
118: fov003-cell_51  03  fov003 PS19-V     PS19         V
119: fov003-cell_52  03  fov003 PS19-V     PS19         V

star = addCellMetadata(gobject=star, new_metadata=meta, by_column=T)

These column names were already used: list_ID
 and will be overwritten
pDataDT(star)
            cell_ID fov list_ID  group genotype treatment
  1:  fov002-cell_1  02  fov002 PS19-V     PS19         V
  2:  fov002-cell_2  02  fov002 PS19-V     PS19         V
  3:  fov002-cell_3  02  fov002 PS19-V     PS19         V
  4:  fov002-cell_4  02  fov002 PS19-V     PS19         V
  5:  fov002-cell_5  02  fov002 PS19-V     PS19         V
 ---
115: fov003-cell_48  03  fov003 PS19-V     PS19         V
116: fov003-cell_49  03  fov003 PS19-V     PS19         V
117: fov003-cell_50  03  fov003 PS19-V     PS19         V
118: fov003-cell_51  03  fov003 PS19-V     PS19         V
119: fov003-cell_52  03  fov003 PS19-V     PS19         V

System Information

> sessionInfo()
R version 4.2.2 (2022-10-31)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: CentOS Linux 7 (Core)

Matrix products: default
BLAS/LAPACK: /home/rrbutler/.local/share/r-miniconda/envs/giotto_env/lib/libopenblasp-r0.3.25.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C
 [9] LC_ADDRESS=C               LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base

other attached packages:
[1] GiottoData_0.2.6.2  Giotto_4.0.0        GiottoVisuals_0.1.0
[4] GiottoClass_0.1.0   GiottoUtils_0.1.0

loaded via a namespace (and not attached):
 [1] Rcpp_1.0.11         pillar_1.9.0        compiler_4.2.2
 [4] XVector_0.38.0      tools_4.2.2         zlibbioc_1.44.0
 [7] jsonlite_1.8.8      lifecycle_1.0.4     tibble_3.2.1
[10] gtable_0.3.4        checkmate_2.3.0     lattice_0.22-5
[13] png_0.1-8           pkgconfig_2.0.3     rlang_1.1.2
[16] igraph_1.5.1        Matrix_1.6-1.1      cli_3.6.1
[19] parallel_4.2.2      terra_1.7-55        withr_2.5.2
[22] dplyr_1.1.4         colorRamp2_0.1.0    rappdirs_0.3.3
[25] generics_0.1.3      vctrs_0.6.5         S4Vectors_0.36.2
[28] IRanges_2.32.0      stats4_4.2.2        grid_4.2.2
[31] tidyselect_1.2.0    reticulate_1.34.0   Biobase_2.58.0
[34] glue_1.6.2          data.table_1.14.8   R6_2.5.1
[37] fansi_1.0.5         ggplot2_3.4.4       magrittr_2.0.3
[40] backports_1.4.1     scales_1.3.0        codetools_0.2-19
[43] BiocGenerics_0.44.0 colorspace_2.1-0    utf8_1.2.4
[46] munsell_0.5.0

Bit of a follow-up, prior to adding the metadata, I aggregated the expression matrix for my data as instructed in the Lung CosMx tutorial, and the columns of the matrix are already sorted:

└──Spatial unit "[34mcell[39m"
   ├──Feature type "[31mrna[39m"
   │  └──Expression data "[36mraw[39m" values:
   │        An object of class exprObj : "raw"
   │        spat_unit : "cell"
   │        feat_type : "rna"
   │        provenance: cell 
   │        
   │        contains:
   │        950 x 8557 sparse Matrix of class "dgCMatrix"
   │                                                      
   │        6330403K07Rik . 1 . . . . . . . . 2 . . ......
   │        Abca2         . . . . 1 . . . . . . . . ......
   │        Abi1          . . . . 1 . . . . . 1 . 2 ......
   │        
   │         ........suppressing 8544 columns and 944 rows in show(); maybe adjust options(max.print=, width=)
   │                                                 
   │        Zeb1    . . . . . . . . . .  1 . . ......
   │        Zfyve16 . . . . . . . . . .  2 . . ......
   │        Zwint   . . 1 1 . . . . . . 11 . . ......
   │        
   │         First four colnames:
   │         fov011-cell_1 fov011-cell_10
   │         fov011-cell_100 fov011-cell_101 
   │     
   └──Feature type "[31mneg_probe[39m"
      └──Expression data "[36mraw[39m" values:
            An object of class exprObj : "raw"
            spat_unit : "cell"
            feat_type : "neg_probe"
            provenance: cell 

            contains:
            10 x 8557 sparse Matrix of class "dgCMatrix"

            NegPrb1  . . . . . . . . . . . . . ......
            NegPrb10 . . . . . . . . . . . . . ......
            NegPrb2  . . . . . . . . . . . . . ......

             ........suppressing 8544 columns and 4 rows in show(); maybe adjust options(max.print=, width=)

            NegPrb7 . . . . . . . . . . . . . ......
            NegPrb8 . . . . . . . . . . . . . ......
            NegPrb9 . . . . . . . . . . . . . ......

             First four colnames:
             fov011-cell_1 fov011-cell_10
             fov011-cell_100 fov011-cell_101

If I grab the cosmx mini dataset, the expression matrix columns there are not sorted, so it may be down to the overlapToMatrix ?

Hi @rbutleriii

Thank you for reporting and looking into this issue in detail!

From what I have tracked down, the initial sorting happens after a call to data.table::dcast() (location) when converting from the table of overlapped features to a matrix. There is an additional sort() that does not do much, but happens in the giotto method for overlapToMatrix() right before adding the matrix to the giotto object. (location)

These are likely causing the issue by propagating the ordering change into the metadata after addCellMetadata()

A sort() during overlapToMatrix() was intended because of the way that aggregating works, where the results are ordered based on which feature points in the giottoPolygon are first overlapped, leading to row and col orders that can be expected to look random. There is no template for ordering to follow when the expression matrix is generated de novo, so sorting was the next best option. Normally the metadata is created by a call to initialize() based on the paired expression information when the matrix is first added to the giotto object, but something in the convenience function may have delayed the sorting update until you called addCellMetadata().

Assuming the above is the issue: An immediate solution would be to manually apply the desired col and row order from the metadata to the matching expression matrix and then run initialize() on the giotto object.

The actual fix, could be implementing a check to match the expression row and col order to the expression information if they differ and/or adding a replacement method for spatIDs() to intentionally apply it across the object. Or maybe just make it so that the ordering of metadata and expression are independent. Any suggestions would be welcome, and thank you again for catching this.

Best, George

Hmm, it doesn't particularly make it problematic to have it sorted. I would almost always prefer to add metadata with a cell_ID key column as it is more explicit. The only other option that comes to mind would be for it to by default do a natural sort via mixedsort(), since the cell_ID's will almost certainly be some combination of fov123-cell456.

That is a good idea, thank you.

I am pushing GiottoClass v0.1.3 and GiottoUtils v0.1.3 that should have the following fixes:

natural sorts with gtools::mixedsort() are now used for cell_ID and feat_ID sorts in GiottoClass.
addCellMetadata() and addFeatMetadata() now check for the starting ID ordering so no more unexpected sorts should happen.

Importantly, this also means that the ordering of metadata may be different from the col/row ordering of the expression matrix (and the outputs from the giotto object method for spatIDs() and featIDs() that indirectly pull from the expression matrix).

Our current plan moving forward with spatIDs() and featIDs() is that they should be regarded as a minimum set of IDs to expect across all slots, but it is possible for individual slots to either have more IDs if they contain additional information or be in a different order.

In short: it is definitely preferred to add metadata with the cell_ID key column and by_column = TRUE. But omitting that key column or appending a vector or factor instead of a data.frame is also safer now.

drieslab / Giotto

addCellMetadata performs an unexpected sort #853