dpcook opened this issue 4 months ago
Same issue:
path <- "/nfs/data/cortex/Cortex/Spatial/Xenium/output-XETG00373__0029029__brain1__20240628__111204"
list.files(path)
[1] "analysis" "analysis.zarr.zip" "analysis_summary_P11_Nx.html"
[4] "aux_outputs" "cell_boundaries.csv.gz" "cell_boundaries.parquet"
[7] "cell_feature_matrix" "cell_feature_matrix.h5" "cell_feature_matrix.zarr.zip"
[10] "cells.csv.gz" "cells.parquet" "cells.zarr.zip"
[13] "experiment.xenium" "gene_panel.json" "metrics_summary.csv"
[16] "morphology.ome.tif" "morphology_focus" "nucleus_boundaries.csv.gz"
[19] "nucleus_boundaries.parquet" "transcripts.parquet" "transcripts.zarr.zip"
The error occurs with both LoadXenium() and ReadXenium():
# Load the Xenium data
xenium.obj <- LoadXenium(path, fov = "fov")
10X data contains more than one type and is being returned as a list containing matrices of each type.
Error: File '/nfs/data/cortex/Cortex/Spatial/Xenium/output-XETG00373__0029029__brain1__20240628__111204/transcripts.csv.gz' does not exist or is non-readable
According to @dpcook, 10x is dropping transcripts.csv.gz from the Xenium output bundle.
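The listing above confirms this: the transcripts are only present as parquet and zarr. A quick check in R, using the same path as above:
path <- "/nfs/data/cortex/Cortex/Spatial/Xenium/output-XETG00373__0029029__brain1__20240628__111204"
file.exists(file.path(path, "transcripts.csv.gz"))   # FALSE with the newer Xenium Onboard Analysis output
file.exists(file.path(path, "transcripts.parquet"))  # TRUE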
sessionInfo():
R version 4.3.3 (2024-02-29)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 22.04.4 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so; LAPACK version 3.10.0
locale:
[1] C
time zone: Etc/UTC
tzcode source: system (glibc)
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] ggplot2_3.5.1 future_1.33.2 Seurat_5.1.0 SeuratObject_5.0.2 sp_2.1-4
loaded via a namespace (and not attached):
[1] deldir_2.0-4 pbapply_1.7-2 gridExtra_2.3 rlang_1.1.4
[5] magrittr_2.0.3 RcppAnnoy_0.0.22 spatstat.geom_3.3-2 matrixStats_1.3.0
[9] ggridges_0.5.6 compiler_4.3.3 png_0.1-8 vctrs_0.6.5
[13] reshape2_1.4.4 stringr_1.5.1 pkgconfig_2.0.3 fastmap_1.2.0
[17] utf8_1.2.4 promises_1.3.0 rmarkdown_2.27 purrr_1.0.2
[21] xfun_0.46 jsonlite_1.8.8 goftest_1.2-3 later_1.3.2
[25] spatstat.utils_3.0-5 irlba_2.3.5.1 parallel_4.3.3 cluster_2.1.6
[29] R6_2.5.1 ica_1.0-3 stringi_1.8.4 RColorBrewer_1.1-3
[33] spatstat.data_3.1-2 reticulate_1.38.0 parallelly_1.38.0 spatstat.univar_3.0-0
[37] lmtest_0.9-40 scattermore_1.2 Rcpp_1.0.13 knitr_1.48
[41] tensor_1.5 future.apply_1.11.2 zoo_1.8-12 R.utils_2.12.3
[45] sctransform_0.4.1 httpuv_1.6.15 Matrix_1.6-5 splines_4.3.3
[49] igraph_2.0.3 tidyselect_1.2.1 abind_1.4-5 rstudioapi_0.16.0
[53] yaml_2.3.10 spatstat.random_3.3-1 codetools_0.2-19 miniUI_0.1.1.1
[57] spatstat.explore_3.3-1 listenv_0.9.1 lattice_0.22-5 tibble_3.2.1
[61] plyr_1.8.9 withr_3.0.0 shiny_1.8.1.1 ROCR_1.0-11
[65] evaluate_0.24.0 Rtsne_0.17 fastDummies_1.7.3 survival_3.5-8
[69] polyclip_1.10-7 fitdistrplus_1.2-1 pillar_1.9.0 KernSmooth_2.23-22
[73] plotly_4.10.4 generics_0.1.3 RcppHNSW_0.6.0 munsell_0.5.1
[77] scales_1.3.0 globals_0.16.3 xtable_1.8-4 glue_1.7.0
[81] lazyeval_0.2.2 tools_4.3.3 data.table_1.15.4 RSpectra_0.16-2
[85] RANN_2.6.1 leiden_0.4.3.1 dotCall64_1.1-1 cowplot_1.1.3
[89] grid_4.3.3 tidyr_1.3.1 colorspace_2.1-1 nlme_3.1-164
[93] patchwork_1.2.0 cli_3.6.3 spatstat.sparse_3.1-0 spam_2.10-0
[97] fansi_1.0.6 viridisLite_0.4.2 dplyr_1.1.4 uwot_0.2.2
[101] gtable_0.3.5 R.methodsS3_1.8.2 digest_0.6.36 progressr_0.14.0
[105] ggrepel_0.9.5 htmlwidgets_1.6.4 R.oo_1.26.0 htmltools_0.5.8.1
[109] lifecycle_1.0.4 httr_1.4.7 mime_0.12 MASS_7.3-60.0.1
@dpcook Can you please share your code for loading/reading the new Xenium data?
Sure thing. In ReadXenium(), I replaced
if (has_dt) {
  tx_dt <- as.data.frame(data.table::fread(file.path(data.dir, "transcripts.csv.gz")))
  transcripts <- subset(tx_dt, qv >= mols.qv.threshold)
} else {
  transcripts <- read.csv(file.path(data.dir, "transcripts.csv.gz"))
  transcripts <- subset(transcripts, qv >= mols.qv.threshold)
}
with
transcripts <- arrow::read_parquet(file.path(data.dir, "transcripts.parquet"))
transcripts <- subset(transcripts, qv >= mols.qv.threshold)
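As an aside (not dpcook's exact patch), the substitution can be guarded so it still falls back to the legacy CSV on older bundles; a minimal sketch, assuming data.dir and mols.qv.threshold are in scope as inside ReadXenium():
tx_parquet <- file.path(data.dir, "transcripts.parquet")
if (requireNamespace("arrow", quietly = TRUE) && file.exists(tx_parquet)) {
  # Newer bundles ship transcripts.parquet only
  transcripts <- as.data.frame(arrow::read_parquet(tx_parquet))
} else {
  # Older bundles still contain transcripts.csv.gz
  transcripts <- read.csv(file.path(data.dir, "transcripts.csv.gz"))
}
transcripts <- subset(transcripts, qv >= mols.qv.threshold)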
So, testing on the Xenium Prime 5K lymph node dataset from 10x, the following works for me:
# Redefine ReadXenium() so transcripts are read from transcripts.parquet (requires the arrow package)
ReadXenium <- function(data.dir, outs = c("matrix", "microns"), type = "centroids",
                       mols.qv.threshold = 20) {
  type <- match.arg(arg = type, choices = c("centroids", "segmentations"), several.ok = TRUE)
  outs <- match.arg(arg = outs, choices = c("matrix", "microns"), several.ok = TRUE)
  outs <- c(outs, type)
  has_dt <- requireNamespace("data.table", quietly = TRUE) &&
    requireNamespace("R.utils", quietly = TRUE)
  data <- sapply(outs, function(otype) {
    switch(
      EXPR = otype,
      matrix = {
        # Cell-by-gene count matrices (one matrix per feature type)
        matrix <- suppressWarnings(Read10X(data.dir = file.path(data.dir, "cell_feature_matrix/")))
        matrix
      },
      centroids = {
        # Cell centroid coordinates
        if (has_dt) {
          cell_info <- as.data.frame(data.table::fread(file.path(data.dir, "cells.csv.gz")))
        } else {
          cell_info <- read.csv(file.path(data.dir, "cells.csv.gz"))
        }
        cell_centroid_df <- data.frame(
          x = cell_info$x_centroid,
          y = cell_info$y_centroid,
          cell = cell_info$cell_id,
          stringsAsFactors = FALSE
        )
        cell_centroid_df
      },
      segmentations = {
        # Cell boundary polygons
        if (has_dt) {
          cell_boundaries_df <- as.data.frame(data.table::fread(file.path(data.dir, "cell_boundaries.csv.gz")))
        } else {
          cell_boundaries_df <- read.csv(file.path(data.dir, "cell_boundaries.csv.gz"), stringsAsFactors = FALSE)
        }
        names(cell_boundaries_df) <- c("cell", "x", "y")
        cell_boundaries_df
      },
      microns = {
        # Transcript coordinates: read from the parquet file instead of the removed transcripts.csv.gz
        transcripts <- arrow::read_parquet(file.path(data.dir, "transcripts.parquet"))
        transcripts <- subset(transcripts, qv >= mols.qv.threshold)
        df <- data.frame(
          x = transcripts$x_location,
          y = transcripts$y_location,
          gene = transcripts$feature_name,
          stringsAsFactors = FALSE
        )
        df
      },
      stop("Unknown Xenium input type: ", otype)
    )
  }, USE.NAMES = TRUE)
  return(data)
}
Run the function:
data <- ReadXenium(data.dir = "~/Downloads/Xenium_Prime_Human_Lymph_Node_Reactive_FFPE_outs/")
Then finish setting up the object. Note: I removed an if() statement that checked how one of the control feature types was named, and I simply called the spatial assay "RNA".
# This is typically done inside LoadXenium()
segmentations.data <- list(
  centroids = CreateCentroids(data$centroids),
  segmentation = CreateSegmentation(data$segmentations)
)
coords <- CreateFOV(
  coords = segmentations.data,
  type = c("segmentation", "centroids"),
  molecules = data$microns,
  assay = "RNA"
)
xenium.obj <- CreateSeuratObject(counts = data$matrix[["Gene Expression"]], assay = "RNA")
xenium.obj[["BlankCodeword"]] <- CreateAssayObject(counts = data$matrix[["Unassigned Codeword"]])
xenium.obj[["ControlCodeword"]] <- CreateAssayObject(counts = data$matrix[["Negative Control Codeword"]])
xenium.obj[["ControlProbe"]] <- CreateAssayObject(counts = data$matrix[["Negative Control Probe"]])
xenium.obj[["fov"]] <- coords
> xenium.obj
An object of class Seurat
7782 features across 708983 samples within 4 assays
Active assay: RNA (4624 features, 0 variable features)
1 layer present: counts
3 other assays present: BlankCodeword, ControlCodeword, ControlProbe
1 spatial field of view present: fov
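As a quick sanity check (not part of the fix itself), the loaded coordinates can be visualized with Seurat's image plotting on the object built above:
# Plot cells in the "fov" field of view registered above
ImageDimPlot(xenium.obj, fov = "fov")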
Thanks David (@dpcook)! One minor correction, please confirm:
segmentations.data <- list(
  centroids = CreateCentroids(data$centroids),
  segmentation = CreateSegmentation(data$microns)
)
Ah yes, apologies, that was a mistake.
To follow up: I found that this did not load the segmentation data the way LoadXenium does. LoadXenium loads both c("centroids", "segmentations"). Therefore, here is a working solution: still use the modified ReadXenium() above, but load the data the same way LoadXenium does.
data <- ReadXenium(dir_in, outs = c("matrix", "microns"), type = c("centroids", "segmentations"))
names(data)
## continue the regular LoadXenium
segmentations.data <- list(
centroids = CreateCentroids(data$centroids),
segmentation = CreateSegmentation(data$segmentations))
coords <- CreateFOV(
coords = segmentations.data,
type = c("segmentation", "centroids"),
molecules = data$microns,
assay = "Xenium")
xmo <- CreateSeuratObject(
counts = data$matrix[["Gene Expression"]],
assay = "Xenium")
xmo[["BlankCodeword"]] <- CreateAssayObject(counts = data$matrix[["Unassigned Codeword"]])
xmo[["ControlCodeword"]] <- CreateAssayObject(counts = data$matrix[["Negative Control Codeword"]])
xmo[["ControlProbe"]] <- CreateAssayObject(counts = data$matrix[["Negative Control Probe"]])
xmo[["fov"]] <- coords
print(xmo)
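From here, a typical downstream pass could look like the following (a sketch based on the standard Seurat Xenium workflow, not part of the fix; parameters such as the number of PCs are illustrative):
# Drop empty cells, then normalize, reduce, and embed
xmo <- subset(xmo, subset = nCount_Xenium > 0)
xmo <- SCTransform(xmo, assay = "Xenium")
xmo <- RunPCA(xmo, npcs = 30, features = rownames(xmo))
xmo <- RunUMAP(xmo, dims = 1:30)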
Please note, support for reading the .parquet files (and a variety of other improvements) was added in #8605. That is currently on 10XGenomics/seurat@develop, which can be installed using devtools or remotes if you don't want to modify the code yourself.
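For example, with remotes (a sketch; devtools::install_github works the same way, and the branch is the one referenced above):
# install.packages("remotes")
remotes::install_github("10XGenomics/seurat", ref = "develop")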
@dpcook I agree; in R one can use the arrow package to load the .parquet file and regenerate the missing CSV:
library(arrow)
transcripts <- read_parquet(file.path(data.dir, "transcripts.parquet"))
write.csv(transcripts, gzfile(file.path(data.dir, "transcripts.csv.gz")), row.names = FALSE)
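Once transcripts.csv.gz has been regenerated this way, the unmodified loader should find it again; a sketch, assuming the same data.dir as above:
xenium.obj <- LoadXenium(data.dir, fov = "fov")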
There is also another way to pre-process the data using Python:
Install zarr and pandas from PyPI, then:
import zarr
import pandas as pd

zarr_data = zarr.open("path/to/transcripts.zarr", mode="r")
pd.DataFrame(zarr_data).to_csv("path/to/transcripts.csv.gz", index=False, compression="gzip")
or:
import pandas as pd

pd.read_parquet("path/to/transcripts.parquet").to_csv("path/to/transcripts.csv.gz", index=False, compression="gzip")
Ideally, these approaches should not yield different results; if anyone finds otherwise, please mention it, thanks!
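One quick way to compare them (a sketch, assuming both files sit in data.dir and the arrow and data.table packages are installed) is to check that the row counts match in R:
nrow(arrow::read_parquet(file.path(data.dir, "transcripts.parquet"))) ==
  nrow(data.table::fread(file.path(data.dir, "transcripts.csv.gz")))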
@jsicherman, thanks for fixing this, it works for me without making any changes to the data.
10x is dropping transcripts.csv.gz from the Xenium bundle in the new release of the Xenium Onboard Analysis (see the release notes). As ReadXenium() reads transcript data from that CSV, it no longer works. I was able to work around it by substituting in arrow::read_parquet("./transcripts.parquet"). Alternatively, you could read in the zarr file, but I'm not familiar with that.