Skip to content

Tools

scyan.tools.umap(adata, markers=None, obsm=None, n_cells=200000, min_dist=0.5, obsm_key='X_umap', filter=None, **umap_kwargs)

Run a UMAP on a specific set of markers (or all markers by default). It can be useful to show differences that are due to some markers of interest, instead of using the whole panel.

Info

This function returns a UMAP reducer. You can reuse it with reducer.transform(...) or save it with scyan.data.add.

Note

To actually plot the UMAP, use scyan.plot.umap.

Parameters:

Name Type Description Default
adata AnnData

An AnnData object.

required
markers Optional[List[str]]

List of marker names. By default, use all the panel markers, i.e., adata.var_names.

None
obsm Optional[str]

Name of the obsm to consider to train the UMAP. By default, uses adata.X.

None
n_cells Optional[int]

Number of cells to be considered for the UMAP (to accelerate it when \(N\) is very high). If None, consider all cells.

200000
min_dist float

Min dist UMAP parameter.

0.5
obsm_key str

Key for adata.obsm to add the embedding.

'X_umap'
filter Optional[Tuple]

Optional tuple (key, value) used to train the UMAP on a set of cells that satisfies a constraint. key is the key of adata.obs to consider, and value the value the cells need to have.

None
**umap_kwargs int

Optional kwargs to provide to the UMAP initialization.

{}

Returns:

Type Description
UMAP

UMAP reducer.

Source code in scyan/tools/representation.py
def umap(
    adata: AnnData,
    markers: Optional[List[str]] = None,
    obsm: Optional[str] = None,
    n_cells: Optional[int] = 200_000,
    min_dist: float = 0.5,
    obsm_key: str = "X_umap",
    filter: Optional[Tuple] = None,
    **umap_kwargs: int,
) -> UMAP:
    """Run a [UMAP](https://umap-learn.readthedocs.io/en/latest/) on a specific set of markers (or all markers by default). It can be useful to show differences that are due to some markers of interest, instead of using the whole panel.

    !!! info

        This function returns a UMAP reducer. You can reuse it with `reducer.transform(...)` or save it with [scyan.data.add][].

    !!! note

        To actually plot the UMAP, use [scyan.plot.umap][].

    Args:
        adata: An `AnnData` object.
        markers: List of marker names. By default, use all the panel markers, i.e., `adata.var_names`.
        obsm: Name of the obsm to consider to train the UMAP. By default, uses `adata.X`.
        n_cells: Number of cells to be considered for the UMAP (to accelerate it when $N$ is very high). If `None`, consider all cells.
        min_dist: Min dist UMAP parameter.
        obsm_key: Key for `adata.obsm` to add the embedding. Cells that were not sampled keep a row of zeros.
        filter: Optional tuple `(key, value)` used to train the UMAP on a set of cells that satisfies a constraint. `key` is the key of `adata.obs` to consider, and `value` the value the cells need to have.
        **umap_kwargs: Optional kwargs to provide to the `UMAP` initialization (e.g. `n_components`).

    Returns:
        UMAP reducer.

    Raises:
        ValueError: If `filter` is provided but no sampled cell satisfies it.
    """
    reducer = UMAP(min_dist=min_dist, **umap_kwargs)

    if markers is None:
        markers = adata.var_names

    # Size the placeholder from the reducer (not a hard-coded 2) so that a custom
    # `n_components` passed through `umap_kwargs` does not break the assignment below.
    adata.obsm[obsm_key] = np.zeros((adata.n_obs, reducer.n_components))
    indices = _get_subset_indices(adata.n_obs, n_cells)
    adata_view = adata[indices, markers]
    X = adata_view.X if obsm is None else adata_view.obsm[obsm]

    _check_is_processed(X)

    log.info("Fitting UMAP...")
    if filter is None:
        embedding = reducer.fit_transform(X)
    else:
        key, value = filter
        # Use a plain boolean array: indexing `X` with a pandas Series depends on its index labels.
        mask = (adata_view.obs[key] == value).to_numpy()
        if not mask.any():
            raise ValueError(
                f"No cell satisfies adata.obs['{key}'] == {value!r}, cannot fit the UMAP."
            )
        reducer.fit(X[mask])
        log.info("Transforming...")
        embedding = reducer.transform(X)

    # Only the sampled cells receive coordinates; the others stay at zero.
    adata.obsm[obsm_key][indices] = embedding

    return reducer

scyan.tools.leiden(adata, resolution=1, key_added='leiden', n_neighbors=15)

Leiden clustering

Parameters:

Name Type Description Default
adata AnnData

AnnData object.

required
resolution float

Resolution of the clustering.

1
key_added str

Name of the key of adata.obs where clusters will be saved.

'leiden'
n_neighbors int

Number of neighbors.

15
Source code in scyan/tools/representation.py
def leiden(
    adata: AnnData,
    resolution: float = 1,
    key_added: str = "leiden",
    n_neighbors: int = 15,
) -> None:
    """Leiden clustering

    The k-nearest-neighbors graph is computed on `adata.X` and cached in `adata.obsp["knn_graph"]`.
    If this key already exists, the cached graph is reused as is (and `n_neighbors` is then ignored).

    Args:
        adata: AnnData object.
        resolution: Resolution of the clustering.
        key_added: Name of the key of adata.obs where clusters will be saved.
        n_neighbors: Number of neighbors.

    Raises:
        ImportError: If `leidenalg` is not installed.
    """
    try:
        import leidenalg
    except ImportError as err:
        # Catch only the import failure (a bare `except` would also trap KeyboardInterrupt/SystemExit)
        raise ImportError(
            """To run leiden, you need to have 'leidenalg' installed. You can install the population discovery extra with "pip install 'scyan[discovery]'", or directly install leidenalg with "conda install -c conda-forge leidenalg"."""
        ) from err

    import igraph as ig
    from sklearn.neighbors import kneighbors_graph

    if "knn_graph" not in adata.obsp:
        adata.obsp["knn_graph"] = kneighbors_graph(
            adata.X, n_neighbors=n_neighbors, metric="euclidean", include_self=False
        )

    # TODO (improvement): add weights according to euclidean distance
    graph = ig.Graph.Weighted_Adjacency(adata.obsp["knn_graph"], mode="DIRECTED")

    partition = leidenalg.find_partition(
        graph,
        leidenalg.RBConfigurationVertexPartition,
        resolution_parameter=resolution,
    )
    # Cluster ids are stored as string categories
    adata.obs[key_added] = pd.Categorical([str(x) for x in partition.membership])

scyan.tools.subcluster(adata, population, markers=None, key='scyan_pop', resolution=0.2, size_ratio_th=0.02, min_cells_th=200, n_cells=100000)

Create sub-clusters within a given population, and filter out small clusters according to (i) a minimum number of cells and (ii) a minimum ratio of cells.

Info

After having run this method, you can analyze the results with scyan.plot.umap and scyan.plot.pops_expressions.

Parameters:

Name Type Description Default
adata AnnData

An AnnData object.

required
population str

Name of the population to target (one of adata.obs[key]).

required
markers Optional[List[str]]

Optional list of markers used to create subclusters. By default, uses the complete panel.

None
key str

Key to look for population in adata.obs. By default, uses the model predictions, but you can also choose a population level (if any), or other observations.

'scyan_pop'
resolution float

Resolution used for leiden clustering. Higher resolution leads to more clusters.

0.2
size_ratio_th float

(Only used if population is None): Minimum ratio of cells to be considered as a significant cluster (compared to the parent cluster).

0.02
min_cells_th int

(Only used if population is None): Minimum number of cells to be considered as a significant cluster.

200
n_cells int

Number of cells to be considered for the subclustering (to accelerate it when \(N\) is very high). If None, consider all cells.

100000
Source code in scyan/tools/representation.py
def subcluster(
    adata: AnnData,
    population: str,
    markers: Optional[List[str]] = None,
    key: str = "scyan_pop",
    resolution: float = 0.2,
    size_ratio_th: float = 0.02,
    min_cells_th: int = 200,
    n_cells: Optional[int] = 100_000,
) -> None:
    """Create sub-clusters within a given population, and filter out small clusters according to (i) a minimum number of cells and (ii) a minimum ratio of cells.

    !!! info
        After having run this method, you can analyze the results with [scyan.plot.umap][] and [scyan.plot.pops_expressions][].

    The raw leiden labels are saved in `adata.obs[f"leiden_{resolution}_{population}"]`, the filtered
    sub-clusters in `adata.obs[f"scyan_subcluster_{population}"]`, and the markers used are stored in
    `adata.uns` under the leiden key (so that an identical later call can skip the clustering).

    Args:
        adata: An `AnnData` object.
        population: Name of the population to target (one of `adata.obs[key]`).
        markers: Optional list of markers used to create subclusters. By default, uses the complete panel.
        key: Key to look for population in `adata.obs`. By default, uses the model predictions, but you can also choose a population level (if any), or other observations.
        resolution: Resolution used for leiden clustering. Higher resolution leads to more clusters.
        size_ratio_th: Minimum ratio of cells to be considered as a significant cluster (compared to the cells that were clustered).
        min_cells_th: Minimum number of cells to be considered as a significant cluster.
        n_cells: Number of cells to be considered for the subclustering (to accelerate it when $N$ is very high). If `None`, consider all cells.

    Raises:
        AssertionError: If every sub-cluster is below the size thresholds.
    """
    leiden_key = f"leiden_{resolution}_{population}"
    subcluster_key = f"scyan_subcluster_{population}"
    condition = adata.obs[key] == population
    markers = list(adata.var_names if markers is None else markers)

    # The stored value may not be a plain list (for instance a NumPy array); normalise it
    # so that `==` is a single boolean instead of an ambiguous element-wise comparison.
    previous_markers = list(adata.uns.get(leiden_key, []))

    if leiden_key in adata.obs and previous_markers == markers:
        log.info(
            "Found leiden labels with the same resolution. Skipping leiden clustering."
        )
        indices = np.where(~adata.obs[leiden_key].isna())[0]
        adata_sub = adata[indices, markers].copy()
    else:
        has_umap = _has_umap(adata)
        n_in_population = condition.sum()
        # `n_cells=None` means "use every cell of the population": comparing against None would raise.
        if n_cells is None or has_umap.all() or n_in_population <= n_cells:
            indices = _get_subset_indices(n_in_population, n_cells)
            indices = np.where(condition)[0][indices]
        else:
            # Prioritise cells that already have UMAP coordinates...
            indices = _get_subset_indices((condition & has_umap).sum(), n_cells)
            indices = np.where(condition & has_umap)[0][indices]

            # ...and complete with cells without UMAP coordinates if there is room left.
            k = len(indices)
            if k < n_cells:
                indices2 = _get_subset_indices((condition & ~has_umap).sum(), n_cells - k)
                indices2 = np.where(condition & ~has_umap)[0][indices2]
                indices = np.concatenate([indices, indices2])

        adata_sub = adata[indices, markers].copy()

        leiden(adata_sub, resolution, leiden_key)

    # Broadcast the labels of the sub-sampled cells to the full dataset (NaN elsewhere)
    series = pd.Series(index=np.arange(adata.n_obs), dtype=str)
    series[indices] = adata_sub.obs[leiden_key].values
    adata.obs[leiden_key] = series.values
    adata.obs[leiden_key] = adata.obs[leiden_key].astype("category")

    # A cluster is dropped when it is smaller than the larger of the two thresholds
    counts = adata_sub.obs[leiden_key].value_counts()
    remove = counts < max(counts.sum() * size_ratio_th, min_cells_th)

    assert (
        not remove.all()
    ), "All subclusters where filtered. Consider updating size_ratio_th and/or min_cells_th."

    adata_sub.obs.loc[
        np.isin(adata_sub.obs[leiden_key], remove[remove].index), leiden_key
    ] = np.nan

    series = pd.Series(index=np.arange(adata.n_obs), dtype=str)
    series[indices] = adata_sub.obs[leiden_key].values
    adata.obs[subcluster_key] = series.values
    adata.obs[subcluster_key] = adata.obs[subcluster_key].astype("category")

    # Remember which markers produced these labels, to allow skipping leiden next time
    adata.uns[leiden_key] = markers
    log.info(
        f"Subclusters created, you can now use:\n   - scyan.plot.umap(adata, color='{subcluster_key}') to show the clusters\n   - scyan.plot.pops_expressions(model, key='{subcluster_key}') to plot their expressions"
    )

scyan.tools.palette_level(table, population_index=0, level_index=1, hue_shift=0.4, alpha_l=0.25, step_l=0.15, alpha_s=0.3, step_s=0.4)

Computes a color palette that is grouped by the hierarchical main populations. It improves the UMAP readability when many populations are defined.

Info

Once such a color palette is defined, you can use it for plotting. For instance, try scyan.plot.umap(adata, color="scyan_pop", palette=palette), where palette is the one you created with this function.

Parameters:

Name Type Description Default
table DataFrame

Knowledge table provided to Scyan. It must be a multi-index DataFrame.

required
population_index Union[int, str]

Index or name of the level in table.index storing the low-level/children population names.

0
level_index Union[int, str]

Index or name of the level in table.index storing the main population names.

1
hue_shift float

Shift the hue values. The value must be a float in [0, 1].

0.4
alpha_l float

Lower it to have a larger lightness range of colors.

0.25
step_l float

Increase it to have more distinct colors (in term of lightness).

0.15
alpha_s float

Lower it to have a larger saturation range of colors.

0.3
step_s float

Increase it to have more distinct colors (in term of saturation).

0.4

Returns:

Type Description
Dict[str, Tuple[float]]

A dictionary whose keys are population names and values are RGB colors.

Source code in scyan/tools/colors.py
def palette_level(
    table: pd.DataFrame,
    population_index: Union[int, str] = 0,
    level_index: Union[int, str] = 1,
    hue_shift: float = 0.4,
    alpha_l: float = 0.25,
    step_l: float = 0.15,
    alpha_s: float = 0.3,
    step_s: float = 0.4,
) -> Dict[str, Tuple[float]]:
    """Build a color palette in which populations sharing the same main population get related colors. This makes UMAPs easier to read when many populations are defined.

    !!! info
        The returned palette can be passed to the plotting functions, e.g., `scyan.plot.umap(adata, color="scyan_pop", palette=palette)`, where `palette` is the output of this function.

    Args:
        table: Knowledge table provided to Scyan. It must be a multi-index DataFrame.
        population_index: Index or name of the level in `table.index` storing the low-level/children population names.
        level_index: Index or name of the level in `table.index` storing the main population names.
        hue_shift: Shift the hue values. The value must be a float in `[0, 1]`.
        alpha_l: Lower it to have a larger lightness range of colors.
        step_l: Increase it to have more distinct colors (in term of lightness).
        alpha_s: Lower it to have a larger saturation range of colors.
        step_s: Increase it to have more distinct colors (in term of saturation).

    Returns:
        A dictionary whose keys are population names and values are RGB colors.
    """
    assert isinstance(
        table.index, pd.MultiIndex
    ), f"The provided table has no multi-index. To work with hierarchical populations, consider reading https://mics-lab.github.io/scyan/tutorials/usage/#working-with-hierarchical-populations"

    children = table.index.get_level_values(population_index).values
    parents = table.index.get_level_values(level_index)
    parent_sizes = parents.value_counts()

    # One group of colors per main population, sized by its number of children
    make_groups = GroupPalette(alpha_l, step_l, alpha_s, step_s)
    color_groups = make_groups(parent_sizes.values, hue_shift)

    # Rank of each child among the children of the same main population
    parents_series = pd.Series(parents)
    ranks_in_group = parents_series.groupby(parents_series).cumcount().values

    palette = {}
    for child, parent, rank in zip(children, parents, ranks_in_group):
        group_position = parent_sizes.index.get_loc(parent)
        palette[child] = list(color_groups[group_position][rank])
    return palette

scyan.tools.cell_type_ratios(adata, groupby=None, normalize=True, key='scyan_pop', among=None)

Computes the ratio of cells per population. This ratio can be provided for each patient (or for any kind of 'group').

Parameters:

Name Type Description Default
adata AnnData

An AnnData object.

required
groupby Union[str, List[str], None]

Key(s) of adata.obs used to create groups (e.g. the patient ID).

None
normalize bool

If False, returns counts instead of ratios. If "%", uses percentages instead of ratios in [0, 1].

True
key str

Key of adata.obs containing the population names (or the values to count).

'scyan_pop'
among str

Key of adata.obs containing the parent population name. Typically, if using hierarchical populations, you can provide 'scyan_pop_level' with your level name. E.g., if the parent population of "T CD4 RM" is called "T cells" in adata.obs[among], then this function computes the 'T CD4 RM ratio among T cells'.

None

Returns:

Type Description
DataFrame

A DataFrame of ratios or counts (one row per group, one column per population). If normalize=True, then each row sums to 1 (for among=None).

Source code in scyan/tools/biomarkers.py
def cell_type_ratios(
    adata: AnnData,
    groupby: Union[str, List[str], None] = None,
    normalize: Union[bool, str] = True,
    key: str = "scyan_pop",
    among: Optional[str] = None,
) -> pd.DataFrame:
    """Computes the ratio of cells per population. This ratio can be provided for each patient (or for any kind of 'group').

    Args:
        adata: An `AnnData` object.
        groupby: Key(s) of `adata.obs` used to create groups (e.g. the patient ID).
        normalize: If `False`, returns counts instead of ratios. If `"%"`, uses percentages instead of ratios in `[0, 1]`. Must not be `False` when `among` is provided.
        key: Key of `adata.obs` containing the population names (or the values to count).
        among: Key of `adata.obs` containing the parent population name. Typically, if using hierarchical populations, you can provide `'scyan_pop_level'` with your level name. E.g., if the parent population of "T CD4 RM" is called "T cells" in `adata.obs[among]`, then this function computes the 'T CD4 RM ratio among T cells'.

    Returns:
        A DataFrame of ratios or counts (one row per group, one column per population). If `normalize=True`, then each row sums to 1 (for `among=None`).

    Raises:
        AssertionError: If `among` is provided while `normalize` is `False`, or if a population of `adata.obs[key]` has several parents in `adata.obs[among]`.
    """
    # Counts relative to a parent population are meaningless: `among` requires a normalization
    assert (
        among is None or normalize
    ), "If 'among' is not `None`, then normalize can't be `False`"

    use_percentage = normalize == "%"
    column_suffix = (
        ("percentage" if use_percentage else "ratio") if normalize else "count"
    )

    counts = _get_counts(adata, groupby, key, normalize)

    if among is None:
        counts.columns = [f"{name} {column_suffix}" for name in counts.columns]
        return counts.mul(100) if use_percentage else counts

    parents_count = _get_counts(adata, groupby, among, normalize)

    # Cross-table (parent x population) used to find the unique parent of each population
    df_parent = adata.obs.groupby(among)[key].apply(lambda s: s.value_counts()).unstack()
    assert (
        (df_parent > 0).sum(0) <= 1
    ).all(), f"Each population from adata.obs['{key}'] should have only one parent population in adata.obs['{among}']"
    to_parent_dict = dict(df_parent.idxmax())

    # Divide each population column by the column of its parent
    counts /= parents_count[[to_parent_dict[pop] for pop in counts.columns]].values
    counts.columns = [
        f"{pop} {column_suffix} among {to_parent_dict[pop]}" for pop in counts.columns
    ]
    return counts.mul(100) if use_percentage else counts

scyan.tools.mean_intensities(adata, groupby=None, layer=None, key='scyan_pop', unstack_join=' mean intensity on ', obsm=None, obsm_names=None)

Compute the Mean Metal Intensity (MMI) or Mean Fluorescence Intensity (MFI) per population. If needed, mean intensities can be computed per group (e.g., per patient) by providing the groupby argument.

Parameters:

Name Type Description Default
adata AnnData

An AnnData object.

required
groupby Union[str, List[str], None]

Key(s) of adata.obs used to create groups. For instance, "id" computes MMI per population for each ID. You can also provide something like ["group", "id"] to get MMI per group, and per patient inside each group.

None
layer Optional[str]

In which adata.layers we get expression intensities. By default, it uses adata.X.

None
key str

Key of adata.obs containing the population names.

'scyan_pop'
unstack_join Optional[str]

If None, keep the information grouped. Else, flattens the biomarkers into one series (or one row per group if groupby is a list) and uses unstack_join to join the names of the multi-level columns. For instance, ' expression on ' can be a good choice.

' mean intensity on '
obsm Optional[str]

In which adata.obsm we get expression intensities. By default, it uses adata.X. If not None then obsm_names is required too.

None
obsm_names Optional[List[str]]

Ordered list of names in adata.obsm[obsm] if obsm was provided.

None

Returns:

Type Description
DataFrame

A DataFrame of MFI. If groupby was a list, it is a multi-index dataframe.

Source code in scyan/tools/biomarkers.py
def mean_intensities(
    adata: AnnData,
    groupby: Union[str, List[str], None] = None,
    layer: Optional[str] = None,
    key: str = "scyan_pop",
    unstack_join: Optional[str] = " mean intensity on ",
    obsm: Optional[str] = None,
    obsm_names: Optional[List[str]] = None,
) -> pd.DataFrame:
    """Compute the Mean Metal Intensity (MMI) or Mean Fluorescence Intensity (MFI) of every population, optionally per group (e.g., per patient) when `groupby` is given.

    Args:
        adata: An `AnnData` object.
        groupby: Key(s) of `adata.obs` used to create groups. For instance, `"id"` computes MMI per population for each ID. You can also provide something like `["group", "id"]` to get MMI per group, and per patient inside each group.
        layer: In which `adata.layers` we get expression intensities. By default, it uses `adata.X`.
        key: Key of `adata.obs` containing the population names.
        unstack_join: If `None`, keep the information grouped. Else, flattens the biomarkers into one series (or one row per group if `groupby` is provided) and uses `unstack_join` to join the names of the multi-level columns. For instance, `' expression on '` can be a good choice.
        obsm: In which `adata.obsm` we get expression intensities. By default, it uses `adata.X`. If not `None` then `obsm_names` is required too. Cannot be combined with `layer`.
        obsm_names: Ordered list of names in `adata.obsm[obsm]` if `obsm` was provided.

    Returns:
        A DataFrame (or a Series when flattened without `groupby`) of mean intensities.
    """
    # The population key is always the innermost grouping level
    if groupby is None:
        group_keys = [key]
    elif isinstance(groupby, str):
        group_keys = [groupby, key]
    else:
        group_keys = [*groupby, key]

    if obsm is None:
        df = adata.to_df(layer)
    else:
        assert (
            layer is None
        ), "You must choose between 'obsm' and 'layer', do not use both."

        df = pd.DataFrame(data=adata.obsm[obsm], columns=obsm_names)

    # Attach the grouping columns by position (raw values, no index alignment)
    for group_key in group_keys:
        df[group_key] = adata.obs[group_key].values

    means = df.groupby(group_keys).mean()
    means = means.dropna(how="all")

    if means.values.min() < 0:
        log.warning(
            "The minimum expression value is negative. Are you sure you are using unscaled values? If not, you can use 'scyan.preprocess.unscale' and save the unscaled result in a 'adata.layers' of your choice (then use this layer argument in the current function). If you know what you are doing, or if you use flow cytometry data, you can ignore this warning."
        )

    if unstack_join is None:
        return means

    # Move the population level next to the marker names, then flatten the pairs into single labels
    flat = means.unstack(level=-1)
    is_series = isinstance(flat, pd.Series)
    pairs = flat.index.values if is_series else flat.columns.values
    labels = [unstack_join.join(pair).strip() for pair in pairs]

    if is_series:
        flat.index = labels
    else:
        flat.columns = labels
    return flat

scyan.tools.PolygonGatingUMAP

Class used to select cells on a UMAP using polygons.

Note

If used on a Jupyter Notebook, you should first run %matplotlib tk. After the selection, you can run %matplotlib inline to retrieve the default behavior.

# Usage example (`%matplotlib tk` is required for the cell selection on jupyter notebooks)
>>> %matplotlib tk
>>> selector = scyan.tools.PolygonGatingUMAP(adata)
>>> selector.select()         # select the cells

>>> sub_adata = selector.extract_adata() # on a notebook, this has to be on a new jupyter cell
Source code in scyan/tools/gating.py
class PolygonGatingUMAP:
    """Interactive selection of cells on a UMAP by drawing polygons.

    !!! note

        If used on a Jupyter Notebook, you should first run `%matplotlib tk`. After the selection, you can run `%matplotlib inline` to retrieve the default behavior.

    ```py
    # Usage example (`%matplotlib tk` is required for the cell selection on jupyter notebooks)
    >>> %matplotlib tk
    >>> selector = scyan.tools.PolygonGatingUMAP(adata)
    >>> selector.select()         # select the cells

    >>> sub_adata = selector.extract_adata() # on a notebook, this has to be on a new jupyter cell
    ```
    """

    def __init__(self, adata: AnnData) -> None:
        """
        Args:
            adata: An `AnnData` object. Its UMAP coordinates are read from `adata.obsm["X_umap"]`.
        """
        self.adata = adata
        # Mask of the cells considered to have UMAP coordinates (as given by `_has_umap`)
        self.has_umap = _has_umap(adata)
        self.x_umap = self.adata.obsm["X_umap"]

    def _selected_rows(self):
        """Positions (in `self.adata`) of the cells currently selected on the plot."""
        # The selector indexes the plotted cells only, so map back through the UMAP mask
        rows_with_umap = np.where(self.has_umap)[0]
        return rows_with_umap[self.selector.ind]

    def select(self, s: float = 0.05) -> None:
        """Show the UMAP in a window and let the user draw a polygon around cells.

        Args:
            s: Size of the cells on the plot.
        """
        ax = plt.subplots()[1]

        # Only the cells that have UMAP coordinates are displayed
        coords = self.x_umap[self.has_umap]
        points = ax.scatter(
            coords[:, 0], coords[:, 1], marker=".", rasterized=True, s=s
        )

        self.selector = _SelectFromCollection(ax, points, coords)

        log.info(
            f"Enclose cells within a polygon. Helper:\n    - Click on the plot to add a polygon vertex\n    - Press the 'esc' key to start a new polygon\n    - Try holding the 'ctrl' key to move a single vertex\n    - Once the polygon is finished and overlaid in red, you can close the window"
        )
        plt.show()

    def save_selection(self, key_added: str = "scyan_selected"):
        """Store the selection as a categorical column of `adata.obs` (values `"selected"` / `"unselected"`).

        Args:
            key_added: Column name used to save the selected cells in `adata.obs`.
        """
        self.adata.obs[key_added] = "unselected"
        column_position = self.adata.obs.columns.get_loc(key_added)
        self.adata.obs.iloc[self._selected_rows(), column_position] = "selected"
        self.adata.obs[key_added] = self.adata.obs[key_added].astype("category")

        self.selector.disconnect()
        n_selected = len(self.selector.ind)
        log.info(
            f"Selected {n_selected} cells and saved the selection in adata.obs['{key_added}']"
        )

    def extract_adata(self) -> AnnData:
        """Return the subset of `adata` made of the cells that were inside the polygon."""
        n_selected = len(self.selector.ind)
        log.info(f"Selected {n_selected} cells")
        self.selector.disconnect()

        return self.adata[self._selected_rows()]

__init__(adata)

Parameters:

Name Type Description Default
adata AnnData

An AnnData object.

required
Source code in scyan/tools/gating.py
def __init__(self, adata: AnnData) -> None:
    """
    Args:
        adata: An `AnnData` object. Its UMAP coordinates are read from `adata.obsm["X_umap"]`.
    """
    self.adata = adata
    # Mask of the cells considered to have UMAP coordinates (as given by `_has_umap`)
    self.has_umap = _has_umap(adata)
    self.x_umap = adata.obsm["X_umap"]

select(s=0.05)

Open a UMAP plot on which you can draw a polygon to select cells.

Parameters:

Name Type Description Default
s float

Size of the cells on the plot.

0.05
Source code in scyan/tools/gating.py
def select(self, s: float = 0.05) -> None:
    """Show the UMAP in a window and let the user draw a polygon around cells.

    Args:
        s: Size of the cells on the plot.
    """
    ax = plt.subplots()[1]

    # Only the cells that have UMAP coordinates are displayed
    coords = self.x_umap[self.has_umap]
    points = ax.scatter(coords[:, 0], coords[:, 1], marker=".", rasterized=True, s=s)

    self.selector = _SelectFromCollection(ax, points, coords)

    log.info(
        f"Enclose cells within a polygon. Helper:\n    - Click on the plot to add a polygon vertex\n    - Press the 'esc' key to start a new polygon\n    - Try holding the 'ctrl' key to move a single vertex\n    - Once the polygon is finished and overlaid in red, you can close the window"
    )
    plt.show()

save_selection(key_added='scyan_selected')

Save the selected cells in adata.obs[key_added].

Parameters:

Name Type Description Default
key_added str

Column name used to save the selected cells in adata.obs.

'scyan_selected'
Source code in scyan/tools/gating.py
def save_selection(self, key_added: str = "scyan_selected"):
    """Store the selection as a categorical column of `adata.obs` (values `"selected"` / `"unselected"`).

    Args:
        key_added: Column name used to save the selected cells in `adata.obs`.
    """
    self.adata.obs[key_added] = "unselected"
    column_position = self.adata.obs.columns.get_loc(key_added)

    # The selector indexes the plotted cells only, so map back through the UMAP mask
    rows_with_umap = np.where(self.has_umap)[0]
    selected_rows = rows_with_umap[self.selector.ind]

    self.adata.obs.iloc[selected_rows, column_position] = "selected"
    self.adata.obs[key_added] = self.adata.obs[key_added].astype("category")

    self.selector.disconnect()
    n_selected = len(self.selector.ind)
    log.info(
        f"Selected {n_selected} cells and saved the selection in adata.obs['{key_added}']"
    )

extract_adata()

Returns an AnnData object containing the cells that were inside the polygon.

Source code in scyan/tools/gating.py
def extract_adata(self) -> AnnData:
    """Return the subset of `adata` made of the cells that were inside the polygon."""
    n_selected = len(self.selector.ind)
    log.info(f"Selected {n_selected} cells")
    self.selector.disconnect()

    # The selector indexes the plotted cells only, so map back through the UMAP mask
    rows_with_umap = np.where(self.has_umap)[0]
    return self.adata[rows_with_umap[self.selector.ind]]

scyan.tools.PolygonGatingScatter

Class used to select cells on a scatterplot using polygons.

Note

If used on a Jupyter Notebook, you should first run %matplotlib tk. After the selection, you can run %matplotlib inline to retrieve the default behavior.

# Usage example (`%matplotlib tk` is required for the cell selection on jupyter notebooks)
>>> %matplotlib tk
>>> selector = scyan.tools.PolygonGatingScatter(adata)
>>> selector.select()         # select the cells

>>> sub_adata = selector.extract_adata() # on a notebook, this has to be on a new jupyter cell
Source code in scyan/tools/gating.py
class PolygonGatingScatter:
    """Interactive selection of cells on a scatterplot by drawing polygons.

    !!! note

        If used on a Jupyter Notebook, you should first run `%matplotlib tk`. After the selection, you can run `%matplotlib inline` to retrieve the default behavior.

    ```py
    # Usage example (`%matplotlib tk` is required for the cell selection on jupyter notebooks)
    >>> %matplotlib tk
    >>> selector = scyan.tools.PolygonGatingScatter(adata)
    >>> selector.select(x, y)     # select the cells (x and y are the names of the two axes)

    >>> sub_adata = selector.extract_adata() # on a notebook, this has to be on a new jupyter cell
    ```
    """

    def __init__(self, adata: AnnData) -> None:
        """
        Args:
            adata: An `AnnData` object.
        """
        self.adata = adata

    def select(
        self, x: str, y: str, s: float = 0.05, max_cells_display: int = 100_000
    ) -> None:
        """Show a scatter plot in a window and let the user draw a polygon around cells.

        Args:
            x: Name passed to `adata.obs_vector` to get the x-axis values.
            y: Name passed to `adata.obs_vector` to get the y-axis values.
            s: Size of the cells on the plot.
            max_cells_display: Maximum number of cells drawn on the plot (a random subset is drawn above this number). If `None`, all cells are drawn. The selector itself receives the coordinates of all cells.
        """
        ax = plt.subplots()[1]

        n_obs = self.adata.n_obs
        if max_cells_display is None or max_cells_display >= n_obs:
            displayed = np.arange(n_obs)
        else:
            # Too many cells: draw only a random subset of them
            displayed = np.random.choice(
                np.arange(n_obs), size=max_cells_display, replace=False
            )

        coords = np.stack(
            [self.adata.obs_vector(x), self.adata.obs_vector(y)], axis=1
        )

        points = ax.scatter(
            coords[displayed, 0],
            coords[displayed, 1],
            marker=".",
            rasterized=True,
            s=s,
        )

        self.selector = _SelectFromCollection(ax, points, coords)

        log.info(
            f"Enclose cells within a polygon. Helper:\n    - Click on the plot to add a polygon vertex\n    - Press the 'esc' key to start a new polygon\n    - Try holding the 'ctrl' key to move a single vertex\n    - Once the polygon is finished and overlaid in red, you can close the window"
        )
        plt.show()

    def save_selection(self, key_added: str = "scyan_selected"):
        """Store the selection as a categorical column of `adata.obs` (values `"selected"` / `"unselected"`).

        Args:
            key_added: Column name used to save the selected cells in `adata.obs`.
        """
        self.adata.obs[key_added] = "unselected"
        column_position = self.adata.obs.columns.get_loc(key_added)
        self.adata.obs.iloc[self.selector.ind, column_position] = "selected"
        self.adata.obs[key_added] = self.adata.obs[key_added].astype("category")

        self.selector.disconnect()
        n_selected = len(self.selector.ind)
        log.info(
            f"Selected {n_selected} cells and saved the selection in adata.obs['{key_added}']"
        )

    def extract_adata(self) -> AnnData:
        """Return the subset of `adata` made of the cells that were inside the polygon."""
        n_selected = len(self.selector.ind)
        log.info(f"Selected {n_selected} cells")
        self.selector.disconnect()

        return self.adata[self.selector.ind]

__init__(adata)

Parameters:

Name Type Description Default
adata AnnData

An AnnData object.

required
Source code in scyan/tools/gating.py
def __init__(self, adata: AnnData) -> None:
    """
    Args:
        adata: An `AnnData` object, kept as `self.adata` for the later selection steps.
    """
    self.adata = adata

select(x, y, s=0.05, max_cells_display=100000)

Open a scatter plot on which you can draw a polygon to select cells.

Parameters:

Name Type Description Default
x str

Column name of adata.obs used for the x-axis

required
y str

Column name of adata.obs used for the y-axis

required
s float

Size of the cells on the plot.

0.05
Source code in scyan/tools/gating.py
def select(
    self, x: str, y: str, s: float = 0.05, max_cells_display: int = 100_000
) -> None:
    """Show a scatter plot in a window and let the user draw a polygon around cells.

    Args:
        x: Name passed to `adata.obs_vector` to get the x-axis values.
        y: Name passed to `adata.obs_vector` to get the y-axis values.
        s: Size of the cells on the plot.
        max_cells_display: Maximum number of cells drawn on the plot (a random subset is drawn above this number). If `None`, all cells are drawn. The selector itself receives the coordinates of all cells.
    """
    ax = plt.subplots()[1]

    n_obs = self.adata.n_obs
    if max_cells_display is None or max_cells_display >= n_obs:
        displayed = np.arange(n_obs)
    else:
        # Too many cells: draw only a random subset of them
        displayed = np.random.choice(
            np.arange(n_obs), size=max_cells_display, replace=False
        )

    coords = np.stack([self.adata.obs_vector(x), self.adata.obs_vector(y)], axis=1)

    points = ax.scatter(
        coords[displayed, 0],
        coords[displayed, 1],
        marker=".",
        rasterized=True,
        s=s,
    )

    self.selector = _SelectFromCollection(ax, points, coords)

    log.info(
        f"Enclose cells within a polygon. Helper:\n    - Click on the plot to add a polygon vertex\n    - Press the 'esc' key to start a new polygon\n    - Try holding the 'ctrl' key to move a single vertex\n    - Once the polygon is finished and overlaid in red, you can close the window"
    )
    plt.show()

save_selection(key_added='scyan_selected')

Save the selected cells in adata.obs[key_added].

Parameters:

Name Type Description Default
key_added str

Column name used to save the selected cells in adata.obs.

'scyan_selected'
Source code in scyan/tools/gating.py
def save_selection(self, key_added: str = "scyan_selected"):
    """Store the selection as a categorical column of `adata.obs` (values `"selected"` / `"unselected"`).

    Args:
        key_added: Column name used to save the selected cells in `adata.obs`.
    """
    # Start from "nothing selected", then flag the rows reported by the selector
    self.adata.obs[key_added] = "unselected"
    column_position = self.adata.obs.columns.get_loc(key_added)
    self.adata.obs.iloc[self.selector.ind, column_position] = "selected"
    self.adata.obs[key_added] = self.adata.obs[key_added].astype("category")

    self.selector.disconnect()
    n_selected = len(self.selector.ind)
    log.info(
        f"Selected {n_selected} cells and saved the selection in adata.obs['{key_added}']"
    )

extract_adata()

Returns an AnnData object containing the cells that were inside the polygon.

Source code in scyan/tools/gating.py
def extract_adata(self) -> AnnData:
    """Return the subset of `adata` made of the cells that were inside the polygon."""
    n_selected = len(self.selector.ind)
    log.info(f"Selected {n_selected} cells")
    self.selector.disconnect()

    selected_rows = self.selector.ind
    return self.adata[selected_rows]