Skip to content

Utils

novae.spatial_neighbors(adata, slide_key=None, radius=None, pixel_size=None, technology=None, coord_type=None, n_neighs=None, delaunay=None, n_rings=1, percentile=None, set_diag=False, reset_slide_ids=True)

Create a Delaunay graph from the spatial coordinates of the cells. The graph is stored in adata.obsp['spatial_connectivities'] and adata.obsp['spatial_distances']. The long edges are removed from the graph according to the radius argument (if provided).

Info

This function was updated from squidpy.

Parameters:

Name Type Description Default
adata AnnData | list[AnnData]

An AnnData object, or a list of AnnData objects.

required
slide_key str | None

Optional key in adata.obs indicating the slide ID of each cell. If provided, the graph is computed for each slide separately.

None
radius tuple[float, float] | float | None

tuple that prunes the final graph to only contain edges in interval [min(radius), max(radius)]. If float, uses [0, radius]. If None, all edges are kept.

None
technology str | SpatialTechnology | None

Technology or machine used to generate the spatial data. One of "cosmx", "merscope", "xenium", "visium", "visium_hd". If None, uses adata.obsm["spatial"].

None
coord_type str | CoordType | None

Either "grid" or "generic". If "grid", the graph is built on a grid. If "generic", the graph is built using the coordinates as they are. By default, uses "grid" for Visium/VisiumHD and "generic" for other technologies.

None
n_neighs int | None

Number of neighbors to consider. If None, uses 6 for Visium, 8 for Visium HD, 6 for other graphs built without Delaunay triangulation, and None otherwise.

None
delaunay bool | None

Whether to use Delaunay triangulation to build the graph. If None, uses False for grid-based graphs and True for generic graphs.

None
n_rings int

See squidpy.gr.spatial_neighbors documentation.

1
percentile float | None

See squidpy.gr.spatial_neighbors documentation.

None
set_diag bool

See squidpy.gr.spatial_neighbors documentation.

False
reset_slide_ids bool

Whether to reset the novae slide ids.

True
Source code in novae/utils/build.py
def spatial_neighbors(
    adata: AnnData | list[AnnData],
    slide_key: str | None = None,
    radius: tuple[float, float] | float | None = None,
    pixel_size: float | None = None,
    technology: str | SpatialTechnology | None = None,
    coord_type: str | CoordType | None = None,
    n_neighs: int | None = None,
    delaunay: bool | None = None,
    n_rings: int = 1,
    percentile: float | None = None,
    set_diag: bool = False,
    reset_slide_ids: bool = True,
) -> None:
    """Create a Delaunay graph from the spatial coordinates of the cells.
    The graph is stored in `adata.obsp['spatial_connectivities']` and `adata.obsp['spatial_distances']`. The long edges
    are removed from the graph according to the `radius` argument (if provided).

    Info:
        This function was updated from [squidpy](https://squidpy.readthedocs.io/en/latest/api/squidpy.gr.spatial_neighbors.html#squidpy.gr.spatial_neighbors).

    Args:
        adata: An `AnnData` object, or a list of `AnnData` objects. If a list is given, the function is applied to each object in turn.
        slide_key: Optional key in `adata.obs` indicating the slide ID of each cell. If provided, the graph is computed for each slide separately.
        radius: `tuple` that prunes the final graph to only contain edges in interval `[min(radius), max(radius)]`. If `float`, uses `[0, radius]`. If `None`, all edges are kept.
        pixel_size: Cannot be combined with `technology` (at least one of the two must be `None`). In this function it is only checked against `technology` and forwarded when `adata` is a list; it is not otherwise used to build the graph.
        technology: Technology or machine used to generate the spatial data. One of `"cosmx", "merscope", "xenium", "visium", "visium_hd"`. If `None`, uses `adata.obsm["spatial"]`.
        coord_type: Either `"grid"` or `"generic"`. If `"grid"`, the graph is built on a grid. If `"generic"`, the graph is built using the coordinates as they are. By default, uses `"generic"`. When `technology` is `"visium"` or `"visium_hd"`, this argument is overridden with `"grid"`.
        n_neighs: Number of neighbors to consider. If `None`, uses `6` for Visium, `8` for Visium HD, `6` for other graphs built without Delaunay triangulation, and `None` otherwise.
        delaunay: Whether to use Delaunay triangulation to build the graph. If `None`, uses `True`. When `technology` is `"visium"` or `"visium_hd"`, this argument is overridden with `False`.
        n_rings: See `squidpy.gr.spatial_neighbors` documentation.
        percentile: See `squidpy.gr.spatial_neighbors` documentation.
        set_diag: See `squidpy.gr.spatial_neighbors` documentation.
        reset_slide_ids: Whether to reset the novae slide ids.
    """
    if reset_slide_ids:
        _set_unique_slide_ids(adata, slide_key=slide_key)

    if isinstance(adata, list):
        # Slide ids were already (re)set above for the whole list, hence `reset_slide_ids=False`.
        for adata_ in adata:
            spatial_neighbors(
                adata_,
                slide_key=slide_key,
                radius=radius,
                pixel_size=pixel_size,
                technology=technology,
                coord_type=coord_type,
                n_neighs=n_neighs,
                delaunay=delaunay,
                n_rings=n_rings,
                percentile=percentile,
                set_diag=set_diag,
                reset_slide_ids=False,
            )
        return

    # A scalar radius is interpreted as the upper bound of the interval [0, radius].
    if isinstance(radius, (float, int)):
        radius = [0.0, float(radius)]

    assert radius is None or len(radius) == 2, "Radius is expected to be a tuple (min_radius, max_radius)"

    assert pixel_size is None or technology is None, "You must choose argument between `pixel_size` and `technology`"

    if technology == "visium":
        n_neighs = 6 if n_neighs is None else n_neighs
        # Grid technologies ignore the user-provided `coord_type` and `delaunay`.
        coord_type, delaunay = CoordType.GRID, False
    elif technology == "visium_hd":
        n_neighs = 8 if n_neighs is None else n_neighs
        coord_type, delaunay = CoordType.GRID, False
    elif technology is not None:
        adata.obsm["spatial"] = _technology_coords(adata, technology)

    assert "spatial" in adata.obsm, (
        "Key 'spatial' not found in adata.obsm. This should contain the 2D spatial coordinates of the cells"
    )

    # Resolve the remaining defaults: generic coordinates, Delaunay enabled, and 6 neighbors when Delaunay is off.
    coord_type = CoordType(coord_type or "generic")
    delaunay = True if delaunay is None else delaunay
    n_neighs = 6 if (n_neighs is None and not delaunay) else n_neighs

    log.info(
        f"Computing graph on {adata.n_obs:,} cells (coord_type={coord_type.value}, {delaunay=}, {radius=}, {n_neighs=})"
    )

    slides = adata.obs[Keys.SLIDE_ID].cat.categories
    # NOTE(review): the return value is discarded. If this is `anndata.utils.make_index_unique`, it returns a
    # new index rather than modifying `adata.obs_names` in place, so this call would have no effect — confirm.
    make_index_unique(adata.obs_names)

    _build_fun = partial(
        _spatial_neighbor,
        coord_type=coord_type,
        n_neighs=n_neighs,
        radius=radius,
        delaunay=delaunay,
        n_rings=n_rings,
        set_diag=set_diag,
        percentile=percentile,
    )

    if len(slides) > 1:
        mats: list[tuple[spmatrix, spmatrix]] = []
        ixs = []  # type: ignore[var-annotated]
        for slide in slides:
            # Compute the membership mask once and reuse it for both the positions and the subset.
            in_slide = adata.obs[Keys.SLIDE_ID] == slide
            ixs.extend(np.where(in_slide)[0])
            mats.append(_build_fun(adata[in_slide]))
        # The block-diagonal matrices are ordered slide by slide; `argsort` gives the permutation
        # that maps them back to the original cell order.
        ixs = np.argsort(ixs)  # type: ignore[assignment] # invert
        Adj = block_diag([m[0] for m in mats], format="csr")[ixs, :][:, ixs]
        Dst = block_diag([m[1] for m in mats], format="csr")[ixs, :][:, ixs]
    else:
        Adj, Dst = _build_fun(adata)

    adata.obsp["spatial_connectivities"] = Adj
    adata.obsp["spatial_distances"] = Dst

    adata.uns["spatial_neighbors"] = {
        "connectivities_key": "spatial_connectivities",
        "distances_key": "spatial_distances",
        "params": {"radius": radius, "set_diag": set_diag, "n_neighbors": n_neighs, "coord_type": coord_type.value},
    }

    _sanity_check_spatial_neighbors(adata)

novae.batch_effect_correction(adatas, obs_key)

Source code in novae/utils/correct.py
def batch_effect_correction(adatas: list[AnnData], obs_key: str) -> None:
    """Shift the representations of each slide so that per-domain centroids match a reference slide.

    For every domain (every column but the last of the table returned by `_domains_counts_per_slide`),
    the reference slide is the row holding the maximum value in that column. For all other slides, the
    cells of that domain are translated by `reference centroid - slide centroid`. The result is written
    to `adata.obsm[Keys.REPR_CORRECTED]`; `adata.obsm[Keys.REPR]` is left untouched.

    Args:
        adatas: List of `AnnData` objects. Each must contain `obs_key` in `adata.obs` and `Keys.REPR` in `adata.obsm`.
        obs_key: Key of `adata.obs` containing the domain of each cell.
    """
    for adata in adatas:
        assert obs_key in adata.obs, f"Did not found `adata.obs['{obs_key}']`"
        assert Keys.REPR in adata.obsm, (
            f"Did not found `adata.obsm['{Keys.REPR}']`. Please run `model.compute_representations(...)` first"
        )

    adata_indices, slides_obs_indices = _slides_indices(adatas)

    counts_table = _domains_counts_per_slide(adatas, obs_key)
    domains = counts_table.columns[:-1]  # the last column is not a domain
    ref_slide_ids: pd.Series = counts_table[domains].idxmax(axis=0)

    # One reference centroid per domain, computed on the cells of the reference slide only.
    reference_columns = {}
    for domain, slide_id in ref_slide_ids.items():
        ref_adata = adatas[counts_table[Keys.ADATA_INDEX].loc[slide_id]]
        on_slide = ref_adata.obs[Keys.SLIDE_ID] == slide_id
        in_domain = ref_adata.obs[obs_key] == domain
        reference_columns[domain] = ref_adata.obsm[Keys.REPR][on_slide & in_domain].mean(0)
    centroids_reference = pd.DataFrame(reference_columns)

    # Start from an unmodified copy; only non-reference (slide, domain) pairs are shifted below.
    for adata in adatas:
        adata.obsm[Keys.REPR_CORRECTED] = adata.obsm[Keys.REPR].copy()

    for adata_index, obs_indices in zip(adata_indices, slides_obs_indices):
        adata = adatas[adata_index]

        for domain in domains:
            is_reference = adata.obs[Keys.SLIDE_ID].iloc[obs_indices[0]] == ref_slide_ids.loc[domain]
            if is_reference:
                continue  # this slide defines the reference centroid of the domain

            domain_mask = adata.obs.iloc[obs_indices][obs_key] == domain
            indices_domain = obs_indices[domain_mask]
            if len(indices_domain) == 0:
                continue  # the domain is absent from this slide

            shift = centroids_reference[domain].values - adata.obsm[Keys.REPR][indices_domain].mean(0)
            adata.obsm[Keys.REPR_CORRECTED][indices_domain] += shift

novae.utils.prepare_adatas(adata, var_names=None)

Ensure the AnnData objects are ready to be used by the model.

Note

It performs the following operations:

  • Preprocess the data if needed (e.g. normalize, log1p), in which case raw counts are saved in adata.layers['counts']
  • Compute the mean and std of each gene
  • Save which genes are highly variable, in case the number of genes is too high
  • If using a pretrained model, save which genes are known by the model

Parameters:

Name Type Description Default
adata AnnData | list[AnnData] | None

An AnnData object, or a list of AnnData objects. Optional if the model was initialized with adata.

required
var_names set | list[str] | None

Only used when loading a pretrained model, or to select a subset of vars to use.

None

Returns:

Type Description
tuple[list[AnnData], list[str]]

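A tuple (adatas, var_names): the list of AnnData objects ready to be used by the model (if only one adata object is provided, it is wrapped in a list), and the variable names. If adata is None, the first element of the tuple is None.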

Source code in novae/utils/_validate.py
def prepare_adatas(
    adata: AnnData | list[AnnData] | None,
    var_names: set | list[str] | None = None,
) -> tuple[list[AnnData] | None, list[str]]:
    """Ensure the AnnData objects are ready to be used by the model.

    Note:
        It performs the following operations:

        - Preprocess the data if needed (e.g. normalize, log1p), in which case raw counts are saved in `adata.layers['counts']`
        - Compute the mean and std of each gene
        - Save which genes are highly variable, in case the number of genes is too high
        - If using a pretrained model, save which genes are known by the model

    Args:
        adata: An `AnnData` object, or a list of `AnnData` objects. Optional if the model was initialized with `adata`.
        var_names: Only used when loading a pretrained model, or to select a subset of vars to use.

    Returns:
        A tuple `(adatas, var_names)`. `adatas` is a list of `AnnData` objects ready to be used by the model (if only one `adata` object is provided, it will be wrapped in a list), or `None` if `adata` is `None`. `var_names` are the provided names after `lower_var_names`, or the output of `_genes_union(adatas, among_used=True)` if no names were provided.

    Raises:
        AssertionError: If both `adata` and `var_names` are `None`, if the list of `AnnData` objects is empty, or if `novae.spatial_neighbors` was not run on every object.
        TypeError: If `adata` is neither an `AnnData` object, a list, nor `None`.
    """
    assert adata is not None or var_names is not None, "One of `adata` and `var_names` must not be None"
    var_names = lower_var_names(var_names) if var_names is not None else None

    # Without data there is nothing to prepare: only the normalized names are returned.
    if adata is None:
        return None, var_names

    if isinstance(adata, AnnData):
        adatas = [adata]
    elif isinstance(adata, list):
        adatas = adata
    else:
        raise TypeError(f"Invalid type for `adata`: {type(adata)}")

    assert len(adatas) > 0, "No `adata` object found. Please provide an AnnData object, or a list of AnnData objects."

    assert all(Keys.ADJ in adata.obsp for adata in adatas), (
        "You need to first run `novae.spatial_neighbors` to compute cell neighbors."
    )

    _check_has_slide_id(adatas)
    _standardize_adatas(adatas)  # log1p + spatial_neighbors

    if settings.auto_preprocessing:
        _lookup_highly_variable_genes(adatas)
    if not settings.disable_multimodal:
        _check_he_embeddings(adatas)

    _select_novae_genes(adatas, var_names)

    if var_names is None:
        var_names = _genes_union(adatas, among_used=True)

    return adatas, var_names