This pull request allows us to analyze an image in parts, and these parts can be parallelized. A square region is defined by its upper-left coordinates, and the cells within that region are loaded. Tumor regions are then loaded from this square together with a surrounding boundary whose size can be set by the user. Note that the boundary size is an upper limit on the distance that can be measured from a cell to a tumor region.
This method was tested with different grid sizes (the size of the square region from which cells are loaded), and the results are identical. It also reproduces the results obtained by loading the entire ROI into memory.
closes #9
Here is a usage example:
import itertools
import math
import multiprocessing
from pathlib import Path
import tumor_microenv as tm
data_root = Path("data/Div_Data2")
all_patch_paths = list(data_root.glob("*.npy"))
# Sort by y then by x. Convert the coordinate substrings to int before
# sorting: a lexicographic string sort would place "100" before "20",
# which breaks the intended y-then-x ordering.
all_patch_paths.sort(key=lambda p: [int(v) for v in p.stem.split("_")[:2]][::-1])
patch_size = 73
analysis_size = 1022  # requested window size; 6132 is presumably the full image extent — confirm
# Round the analysis window up to a whole number of patches.
analysis_size = patch_size * math.ceil(analysis_size / patch_size)
print(f"analysis_size={analysis_size}")
def path_to_minx_miny(path):
    """Return the (minx, miny) coordinates encoded in a patch filename.

    The filename stem must consist of exactly four underscore-separated
    integers (e.g. ``"123_456_73_73"``); the first two are the upper-left
    coordinates. Raises ``ValueError`` for any other stem layout.
    """
    parts = Path(path).stem.split("_", maxsplit=4)
    if len(parts) != 4:
        raise ValueError("expected four values in filename")
    # Convert all four fields so non-numeric trailing fields also fail loudly.
    x_min, y_min, _width, _height = (int(part) for part in parts)
    return x_min, y_min
# Upper-left coordinates of every patch, deduplicated per axis.
coords = [path_to_minx_miny(p) for p in all_patch_paths]
xs = sorted({c[0] for c in coords})
ys = sorted({c[1] for c in coords})
first_minx, first_miny = xs[0], ys[0]
# Upper-left corner of the last analysis window along each axis.
last_minx = xs[-1] + patch_size - analysis_size
last_miny = ys[-1] + patch_size - analysis_size
# Sanity check: the last window must start on an existing patch coordinate.
assert last_minx in xs
assert last_miny in ys
all_xs = list(range(first_minx, last_minx + 1, analysis_size))
all_ys = list(range(first_miny, last_miny + 1, analysis_size))
all_xs_ys = list(itertools.product(all_xs, all_ys))
print(f"number of grid cells: {len(all_xs_ys)}")
output_dir = Path("outputs_wsi")
# !rm -r "$output_dir"
output_dir.mkdir()
def run_one_roi(xy):
    """Run the spatial analysis for one grid cell of the whole-slide image.

    ``xy`` is the (xmin, ymin) upper-left coordinate of the analysis
    window. Results are written to ``output_dir / "out-<xmin>-<ymin>.csv"``.
    """
    xmin, ymin = xy
    patch_paths, cell_paths = tm.get_npy_and_json_files_for_roi(
        xmin=xmin,
        ymin=ymin,
        patch_size=patch_size,
        analysis_size=analysis_size,
        tumor_microenv=500,
        data_root="data/Div_Data2/",
    )
    loader = tm.LoaderV1(
        patch_paths,
        cell_paths,
        background=0,
        marker_positive=1,
        marker_negative=7,
    )
    patches, cells = loader()
    # Keep only the immune cell types of interest.
    wanted_types = {"cd4", "cd8", "cd16", "cd163"}
    cells = [cell for cell in cells if cell.cell_type in wanted_types]
    # Collapse each cell polygon to its centroid point.
    cells = [cell._replace(polygon=cell.polygon.centroid) for cell in cells]
    tm.run_spatial_analysis(
        patches,
        cells,
        microenv_distances=[100],
        mpp=0.34622,
        output_path=output_dir / f"out-{xmin}-{ymin}.csv",
        progress_bar=False,
    )
# Guard pool creation behind __main__: under the "spawn" start method
# (default on Windows and macOS) child processes re-import this module,
# and an unguarded Pool would recursively spawn workers and crash.
if __name__ == "__main__":
    with multiprocessing.Pool(8) as pool:
        pool.map(run_one_roi, all_xs_ys)
And testing:
import numpy as np
import pandas as pd
# Concatenate the per-grid-cell outputs and compare against the
# single-ROI reference run.
df_wsi = pd.concat(pd.read_csv(p) for p in output_dir.glob("*.csv"))
df_org = pd.read_csv("output-25micron-100um-centroids.csv")
cutoff = 100 / 0.34622  # 100 um expressed in pixels (mpp = 0.34622)
a = df_org.dist_to_marker_pos[df_org.dist_to_marker_pos < cutoff].sort_values()
b = df_wsi.dist_to_marker_pos[df_wsi.dist_to_marker_pos < cutoff].sort_values()
assert np.allclose(a, b)
This pull request allows us to analyze an image in parts, and these parts can be parallelized. A square region is defined by its upper-left coordinates, and the cells within that region are loaded. Tumor regions are then loaded from this square together with a surrounding boundary whose size can be set by the user. Note that the boundary size is an upper limit on the distance that can be measured from a cell to a tumor region.
This method was tested with different grid sizes (the size of the square region from which cells are loaded), and the results are identical. It also reproduces the results obtained by loading the entire ROI into memory.
closes #9
Here is a usage example:
And testing: