ComSeg on large scale data with SOPA / SpatialData

ComSeg is now integrated in SOPA : https://gustaveroussy.github.io/sopa/tutorials/comseg/. This tutorial is depreciated

To ease the application of ComSeg on large datasets, ComSeg can be used with SOPA: https://gustaveroussy.github.io/sopa/

Sopa is build on top of Spatial data

The following example is a modified version of the official SOPA tutorial : https://gustaveroussy.github.io/sopa/tutorials/api_usage/

and done with the version 1.0.14 of SOPA

[1]:

import pandas as pd
import sopa.segmentation
import sopa.io
from matplotlib import pyplot as plt
from tqdm import tqdm
from sopa._sdata import to_intrinsic
import sopa
from spatialdata import SpatialData, read_zarr
from spatialdata.models import PointsModel
from pathlib import Path

Load example data

[2]:

sdata = sopa.io.uniform()
image_key = "image"
points_key = "transcripts"
gene_column = "genes"

Segmentation of nuclei with Cellpose

The nucleus segmentation will be used as prior by ComSeg

[3]:

patches = sopa.segmentation.Patches2D(sdata, image_key, patch_width=1200, patch_overlap=50)
patches.write()
from sopa._sdata import get_spatial_image
print(get_spatial_image(sdata, image_key).c.values)
channels = ["DAPI"]
method = sopa.segmentation.methods.cellpose_patch(diameter=35, channels=channels, flow_threshold=2, cellprob_threshold=-6)
segmentation = sopa.segmentation.StainingSegmentation(sdata, method, channels, min_area=2500)


# The cellpose boundaries will be temporary saved here. You can choose a different path
cellpose_temp_dir = "tuto.zarr/.sopa_cache/cellpose"
segmentation.write_patches_cells(cellpose_temp_dir)

cells = sopa.segmentation.StainingSegmentation.read_patches_cells(cellpose_temp_dir)
cells = sopa.segmentation.shapes.solve_conflicts(cells)
shapes_key = "cellpose_boundaries" # name of the key given to the cells in sdata.shapes
sopa.segmentation.StainingSegmentation.add_shapes(sdata, cells, image_key, shapes_key)

Compute the patches for ComSeg

[4]:

image_key = "image"
points_key = "transcripts" # (ignore this for multiplex imaging)
gene_column = "genes" # (option
config_comseg = {}
baysor_temp_dir = "tuto.zarr/.sopa_cache/comseg"

patches = sopa.segmentation.Patches2D(sdata, points_key, patch_width=200, patch_overlap=50)
valid_indices = patches.patchify_transcripts(baysor_temp_dir, config=config_comseg, use_prior=True)

Compute centroid for ComSeg

[5]:

def add_centroids_to_sdata(sdata,
                           points_key="transcripts",
                           shapes_key='cellpose_boundaries', z_constant=None):
    centroid = sdata[shapes_key].geometry.centroid
    x_centroid = list(centroid.geometry.x)
    y_centroid = list(centroid.geometry.y)
    if z_constant is not None:
        z_centroid = [z_constant] * len(y_centroid)
        coords = pd.DataFrame({"x": x_centroid, "y": y_centroid, "z": z_centroid})

    else:
        if "z" in sdata[points_key].columns:
            z = list(sdata[points_key].z.unique().compute())
            assert len(z)==1, "3D point cloud with 2D segmentation, manually set z_constant"
            z_centroid = [z[0]] * len(y_centroid)
            coords = pd.DataFrame({"x": x_centroid, "y": y_centroid, "z": z_centroid})
        else:
            coords = pd.DataFrame({"x": x_centroid, "y": y_centroid})
    points = PointsModel.parse(coords)
    sdata['centroid'] = points
    sdata['centroid'] = to_intrinsic(sdata, sdata['centroid'], points_key)
    return sdata

sdata_centroid = SpatialData()
sdata_centroid['cellpose_boundaries'] = sdata['cellpose_boundaries']
sdata_centroid['transcripts'] = sdata['transcripts']
sdata_centroid = add_centroids_to_sdata(sdata_centroid,
                           points_key="transcripts",
                           shapes_key='cellpose_boundaries',
                               z_constant=1)


baysor_temp_dir = "tuto.zarr/.sopa_cache/comseg_centroid"
config_comseg ={}
points_key = "centroid"
patches = sopa.segmentation.Patches2D(sdata_centroid, points_key, patch_width=200, patch_overlap=50)
valid_indices = patches.patchify_transcripts(baysor_temp_dir, config=config_comseg, use_prior=True)

Run ComSeg on each patch

[9]:

import comseg
from comseg import dataset as ds
from comseg import dictionary
from comseg import model
import json

#### HYPERPARAMETER ####
MEAN_CELL_DIAMETER = 15  # in micrometer
MAX_CELL_RADIUS = 50  # in micrometer
#########################

path_transcript = "tuto.zarr/.sopa_cache/comseg"
path_centroid = "tuto.zarr/.sopa_cache/comseg_centroid"

for patch_index in tqdm(list(range(len(patches.ilocs)))):
    path_dataset_folder = Path(path_transcript) / str(patch_index)
    path_dataset_folder_centroid = Path(path_centroid) / str(patch_index)

    dataset = ds.ComSegDataset(
        path_dataset_folder=path_dataset_folder,
        dict_scale={"x": 1, 'y': 1, "z": 1},
        mean_cell_diameter = MEAN_CELL_DIAMETER,
        gene_column = "genes",
        )


    dico_proba_edge, count_matrix = dataset.compute_edge_weight(  # in micrometer
    images_subset=None,
    n_neighbors=40,
    sampling=True,
    sampling_size=10000
    )

    Comsegdict = dictionary.ComSegDict(
    dataset=dataset,
    mean_cell_diameter=MEAN_CELL_DIAMETER,
    community_detection="with_prior",
    prior_name="cell",
    )
    Comsegdict.run_all(max_cell_radius = MAX_CELL_RADIUS,
                        path_dataset_folder_centroid=path_dataset_folder_centroid,
                               file_extension=".csv")



    anndata_comseg, json_dict = Comsegdict.anndata_from_comseg_result(
        return_polygon = True,
        alpha = 0.5,
        min_rna_per_cell = 5)
    anndata_comseg.write_loom(path_dataset_folder / 'segmentation_counts.loom')
    ## save the json_dict as json
    with open(path_dataset_folder / "segmentation_polygons.json", 'w') as f:
        json.dump(json_dict['transcripts'], f)

Aggregate results

[7]:

from sopa.segmentation.baysor.resolve import resolve
resolve(sdata, path_transcript, gene_column, min_area=10)
shapes_key = "baysor_boundaries"
aggregator = sopa.segmentation.Aggregator(sdata, image_key=image_key, shapes_key=shapes_key)
aggregator.compute_table(gene_column=gene_column, average_intensities=True)
sdata

Plot results

[8]:

import spatialdata_plot
sdata.pl.render_points(size=0.01, color="r")\
    .pl.render_images()\
    .pl.render_shapes(shapes_key, outline=True, fill_alpha=0, outline_color="w")\
    .pl.show("global")
plt.show()

[ ]: