add whole slide image analysis workflow

this pull request allows us to analyze an image in parts, and these parts can be parallelized. a square region is defined by its upper left coordinates, and cells from this region are loaded. then, tumor regions are loaded from this square along with a boundary. the size of this boundary can be set by the user. be aware that the size of this boundary is the upper limit of the distance from a cell to tumor region.

this method was tested with different grid sizes (the square region from which cells are loaded), and results are the same. yay. i also replicated results from loading the entire roi into memory.

closes #9

here is a use example:

import itertools
import math
import multiprocessing
from pathlib import Path
import tumor_microenv as tm

data_root = Path("data/Div_Data2")
all_patch_paths = list(data_root.glob("*.npy"))
# Sort by y then by x.
all_patch_paths.sort(key=lambda p: p.stem.split("_")[:2][::-1])
patch_size = 73

analysis_size = 1022  # factor of 6132
analysis_size = patch_size * math.ceil(analysis_size / patch_size)
print(f"analysis_size={analysis_size}")

def path_to_minx_miny(path):
    path = Path(path)
    splits = path.stem.split("_", maxsplit=4)
    if len(splits) != 4:
        raise ValueError("expected four values in filename")
    minx, miny, _, _ = map(int, splits)
    return minx, miny

x_y_coords = [path_to_minx_miny(p) for p in all_patch_paths]
xs = sorted({x for x, _ in x_y_coords})
ys = sorted({y for _, y in x_y_coords})

first_minx = xs[0]
first_miny = ys[0]
last_minx = xs[-1] + patch_size - analysis_size
last_miny = ys[-1] + patch_size - analysis_size
assert last_minx in xs
assert last_miny in ys

all_xs = list(range(first_minx, last_minx+1, analysis_size))
all_ys = list(range(first_miny, last_miny+1, analysis_size))

all_xs_ys = list(itertools.product(all_xs, all_ys))
print(f"number of grid cells: {len(all_xs_ys)}")

output_dir = Path("outputs_wsi")
# !rm -r "$output_dir"
output_dir.mkdir()

def run_one_roi(xy):
    xmin, ymin = xy
    patch_paths, cell_paths = tm.get_npy_and_json_files_for_roi(
        xmin=xmin, ymin=ymin, patch_size=patch_size, analysis_size=analysis_size, 
        tumor_microenv=500, data_root="data/Div_Data2/")
    loader = tm.LoaderV1(
        patch_paths, cell_paths,
        background=0, 
        marker_positive=1, 
        marker_negative=7,
    )
    patches, cells = loader()
    cells = [c for c in cells if c.cell_type in {"cd4", "cd8", "cd16", "cd163"}]
    # Get the centroid per cell.
    cells = [c._replace(polygon=c.polygon.centroid) for c in cells]
    tm.run_spatial_analysis(
        patches, cells, microenv_distances=[100], mpp=0.34622, 
        output_path=output_dir / f"out-{xmin}-{ymin}.csv",
        progress_bar=False,
    )

with multiprocessing.Pool(8) as pool:
    pool.map(run_one_roi, all_xs_ys)

and testing:

import numpy as np
import pandas as pd

df_wsi = pd.concat(pd.read_csv(p) for p in output_dir.glob("*.csv"))
df_org = pd.read_csv("output-25micron-100um-centroids.csv")
a = df_org.dist_to_marker_pos[df_org.dist_to_marker_pos < 100/0.34622].sort_values()
b = df_wsi.dist_to_marker_pos[df_wsi.dist_to_marker_pos < 100/0.34622].sort_values()
assert np.allclose(a, b)

kaczmarj / tumor-microenvironment-analysis

add whole slide image analysis workflow #14