wanglab-broad / spin

SPatial INtegration of spatially resolved transcriptomics datasets
GNU General Public License v3.0
13 stars 0 forks source link

Quick R implementation #3

Open NKalavros opened 1 year ago

NKalavros commented 1 year ago

Hi Kamal,

I saw your talk and I really liked the simplicity and scalability of the method you described. I also tried it on some other tissues except brain and it seems to be actually capturing structure! Since I mostly use R, I transcribed the one sample (no harmony) implementation to it and it scales pretty well.

Leaving this here in case you're interested.

Cheers!

library(Seurat)
library(data.table)
maher_smooth <- function(seur_obj,nn_graph,n_samples=NULL,n_nbrs=30,assay="Spatial",
                         layer="counts"){
    print("Converting data to data.table")
    #Grab expression data
    mat = GetAssayData(seur_obj,assay=assay,slot=layer)
    #Dense matrix has better column access...
    mat = as.data.table(mat)
    print("Conversion complete.")
    if(is.null(n_samples)){
        n_samples = round(n_nbrs/3,0)
    }
    #Sample neighbors for each cell
    print("Sampling neighbors")
    sampled = apply(nn_graph,1,function(x) sample(x,n_samples))
    #Turn into list
    sampled = as.list(as.data.frame(sampled))
    #Calculate new representation. Can take a while, using DT for it now. <1 min for 55k spots
    new_mat = lapply(sampled, function(x) rowMeans(mat[,..x]))
    new_mat = do.call(cbind, new_mat)
    return(new_mat)
}
#Not implementing the integration part for now. Suppose we only have 1 sample
maher_get_neighbors <- function(seur_obj,n_nbrs){
    #Grab coordinates. If your coordinates are not there, just add them to 
    #@images$slice_1@coordinates as a named df (cells x c(x,y))
    coords = GetTissueCoordinates(seur_obj)[,c(1,2)]
    #Incudes self for now. Use seurat to get the kNN, grab indices of top k
    nns = FindNeighbors(as.matrix(coords),k.param = n_nbrs,compute.SNN=F,return.neighbor = TRUE)
    nns = nns@nn.idx[,1:n_nbrs]
    return(nns)
}
#Wrapper
maher_spin <- function(seur_obj,n_nbrs = 30, n_samples=NULL,n_pcs=30,
                       random_state= 0,assay="Spatial",layer="counts",
                      resolution=0.5){
    set.seed(random_state)
    print("Computing nearest neighbors")
    nns = maher_get_neighbors(seur_obj,n_nbrs = n_nbrs)
    print("Computing Smoothed representations")
    new_repr = maher_smooth(seur_obj,nns,n_nbrs=n_nbrs,assay = assay,layer=layer,n_samples=n_samples)
    #Add new representation as Assay
    rownames(new_repr) = rownames(seur_obj)
    print("Normalizing")
    #Sparsify
    new_repr = as.sparse(new_repr)
    seur_obj_spin = CreateSeuratObject(counts = new_repr,meta.data = seur_obj@meta.data,verbose=F)
    seur_obj_spin = NormalizeData(seur_obj_spin,verbose=F)
    seur_obj_spin = FindVariableFeatures(seur_obj_spin,verbose=F)
    seur_obj_spin = ScaleData(seur_obj_spin,assay="RNA",verbose=F)
    print("PCA, SNN, UMAP, Louvain.")
    seur_obj_spin <- RunPCA(seur_obj_spin,verbose=F) 
    seur_obj_spin <- FindNeighbors(seur_obj_spin,dims=1:n_pcs,verbose=F)
    seur_obj_spin <- RunUMAP(seur_obj_spin,
                        dims=1:n_pcs,verbose=F)
    seur_obj_spin <- FindClusters(seur_obj_spin,resolution=resolution,verbose=F)
    domains = seur_obj_spin$seurat_clusters
    names(domains) = NULL
    seur_obj$SPINDomain = domains
    return(list(seur_obj,seur_obj_spin))
}
kmaherx commented 1 year ago

Glad to hear it's useful! I'm not nearly as familiar with R, so I really appreciate this implementation. Will leave this issue open for now in case others are interested in building on it.

roanvanscheppingen commented 7 months ago

Thank you @kmaherx for the development of SPIN and the elegancy by which you solve this spatial problem! I am hoping to incorporate SPIN in our analysis since I believe it would make our celltyping more robust. @NKalavros I was wondering how you would tackle multiple samples in R.

As far as I understood it now, you want to smooth before integration. I have a CosMx slide with multiple tissues on a single slide and thus in a single Seurat. Would you then split the Seurat based on the slides, do the smoothing, and then proceed with integration. Are there preferred integration methods like Harmony? I am currently working with NormalizeData from Seurat 5, since SCTransform gives trouble downstream with merging of the split Seurat and having different SCTransform models.

Considering we sometimes encounter sparse cell detection, would you further change SPIN on this? I could assume that spare cell identification would lead to neighbours at a greater distance, thus leading to less accurate subsampling if the number of neighbours is kept constant. Would it make sense to sample less, or sample only something within X distance to the centre?

roanvanscheppingen commented 6 months ago

I have updated the R implementation kindly provided by @NKalavros .

Now requires library(RANN) as well.

maher_smooth <- function(seur_obj,nn_graph,n_samples=NULL,n_nbrs=30,assay="RNA",
                         layer="counts"){
    print("Converting data to data.table")
    #Grab expression data
    mat = GetAssayData(seur_obj, layer = "counts")
    #Dense matrix has better column access...
    mat = as.data.table(mat)
    print("Conversion complete.")
    if(is.null(n_samples)){
        n_samples = round(n_nbrs/3,0)
    }
    #Sample neighbors for each cell
    print("Sampling neighbors")
    sampled = apply(nn_graph,1,function(x) sample(x,n_samples))
    #Turn into list
    sampled = as.list(as.data.frame(sampled))
    #Calculate new representation. Can take a while, using DT for it now. <1 min for 55k spots
    new_mat = lapply(sampled, function(x) rowMeans(mat[,..x]))
    new_mat = do.call(cbind, new_mat)
    ##Hashed it because otherwise it gives a list of 2 Seurats 
    #return(new_mat)
}

maher_get_neighbors <- function(seur_obj, n_nbrs) {
    # Initialize an empty list to store nearest neighbor indices for each TMA
    nns_list <- list()

    # Loop over each unique TMA in the metadata
    for (tma_id in unique(seur_obj@meta.data$TMA)) {
        # Filter the Seurat object for the current TMA
        seur_obj_subset <- subset(seur_obj, subset = TMA == tma_id)

        # Extract coordinates for the current TMA
        coords_matrix <- data.frame(x = seur_obj_subset$x_slide_mm, y = seur_obj_subset$y_slide_mm)

        # Perform nearest neighbor search for the current TMA
        nns_x <- nn2(coords_matrix, k = n_nbrs)$nn.idx

        # Replace indices with row names
        rownames(nns_x) <- rownames(seur_obj_subset@meta.data)

        # Sort the nearest neighbor matrix based on row names
        nns_x <- nns_x[order(rownames(nns_x)), ]

        # Store the nearest neighbor indices for the current TMA in the list
        nns_list[[tma_id]] <- nns_x
    }

    # Merge all lists into a single matrix
    nns <- do.call(rbind, nns_list)

    # Return the merged nearest neighbor matrix
    #return(nns)
}
maher_spin <- function(seur_obj,n_nbrs = 30, n_samples=NULL,n_pcs=30,
                       random_state= 0,assay="Spatial",layer="counts",
                      resolution=0.5){
    set.seed(random_state)
    print("Computing nearest neighbors")
    nns = maher_get_neighbors(seur_obj,n_nbrs = n_nbrs)
    print("Computing Smoothed representations")
    new_repr = maher_smooth(seur_obj,nns,n_nbrs=n_nbrs,assay = assay,layer=layer,n_samples=n_samples)
    #Add new representation as Assay
    rownames(new_repr) = rownames(seur_obj)
    print("Normalizing")
    #Sparsify
    new_repr = as.sparse(new_repr)
    new_repr <- new_repr
    seur_obj_spin = CreateSeuratObject(counts = new_repr, meta.data = seur_obj@meta.data)
    print("create Seurat done")

    #Start integration 
    seur_obj_spin[["RNA"]] <- split(seur_obj_spin[["RNA"]], f = seur_obj_spin$TMA)
    seur_obj_spin <- NormalizeData(seur_obj_spin, normalization.method = "LogNormalize", scale.factor = 10000)
    print("Normalize done")
    print("FindVariableFeatures")
    seur_obj_spin <- FindVariableFeatures(seur_obj_spin, selection.method = "vst", nfeatures = 100)
    print("FindVariableFeatures done")
    print("Scale data")
    seur_obj_spin = ScaleData(seur_obj_spin, vars.to.regress = c("nCount_RNA"))
    print("PCA")
    seur_obj_spin <- RunPCA(seur_obj_spin,verbose=F) 
    print("Integration")
    seur_obj_spin <- IntegrateLayers(object = seur_obj_spin, method = RPCAIntegration, verbose = F)
    print("SNN, UMAP, Louvain")
    seur_obj_spin <- FindNeighbors(seur_obj_spin,dims=1:n_pcs,verbose=F)
    seur_obj_spin <- RunUMAP(seur_obj_spin,
                        dims=1:n_pcs,verbose=F)
    seur_obj_spin <- FindClusters(seur_obj_spin,resolution=resolution,verbose=F)
    domains = seur_obj_spin$seurat_clusters
    names(domains) = NULL
    seur_obj$SPINDomain = domains
    return(seur_obj_spin)
}