Skip to content

IO (read/write)

scyan.read_fcs(path, marker_regex='^cd|^hla|epcam|^ccr', exclude_markers=None, channel_suffix='S')

Read a FCS file and return an AnnData object.

Parameters:

Name Type Description Default
path str

Path to the FCS file that has to be read.

required
marker_regex str

Regex used to find which columns correspond to markers. It is matched against the lowercase column names. By default, it targets names that start with CD, HLA, or CCR, or that contain EPCAM. You can add names to the regex by adding the lowercase marker name after a new | in the string.

'^cd|^hla|epcam|^ccr'
exclude_markers Optional[List[str]]

Optional list of channel names that have to be considered as observations (i.e., inside adata.obs), among the ones that were automatically classified as markers (i.e., inside adata.var_names).

None
channel_suffix Optional[str]

Suffix for the channel naming convention, i.e. "S" for "PnS", or "N" for "PnN". If None, keep the raw names.

'S'

Returns:

Type Description
AnnData

AnnData object containing the FCS data.

Source code in scyan/_io.py
def read_fcs(
    path: str,
    marker_regex: str = "^cd|^hla|epcam|^ccr",
    exclude_markers: Optional[List[str]] = None,
    channel_suffix: Optional[str] = "S",
) -> AnnData:
    """Read a FCS file and return an `AnnData` object.

    If the FCS metadata contains a `$SPILLOVER` entry, the spillover matrix is saved in `adata.varp["spillover_matrix"]` when all the markers are found in it, else in `adata.uns["spillover_matrix"]`.

    Args:
        path: Path to the FCS file that has to be read.
        marker_regex: Regex used to find which columns correspond to markers. It is matched against the lowercase column names. By default, it targets names that start with `CD`, `HLA`, or `CCR`, or that contain `EPCAM`. You can add names to the regex by adding the lowercase marker name after a new `|` in the string.
        exclude_markers: Optional list of channel names that have to be considered as observations (i.e., inside `adata.obs`), among the ones that were automatically classified as markers (i.e., inside `adata.var_names`).
        channel_suffix: Suffix for the channel naming convention, i.e. `"S"` for "PnS", or `"N"` for "PnN". If `None`, keep the raw names (i.e., the "PnN" ones). The "PnN" name is also used for every channel whose name is missing or duplicated under the chosen convention.

    Returns:
        `AnnData` object containing the FCS data.
    """
    meta, data = fcsparser.parse(path)

    # With `channel_suffix=None` the key is never found, so every name is missing
    # and the "PnN" fallback below is used for all channels.
    names = pd.Series(
        [meta.get(f"$P{i + 1}{channel_suffix}") for i in range(data.shape[1])]
    )
    fallback_names = pd.Series([meta[f"$P{i + 1}N"] for i in range(data.shape[1])])

    # Missing or ambiguous (duplicated) names are replaced by their "PnN" name
    data.columns = np.where(
        names.isna() | names.duplicated(keep=False), fallback_names, names
    )

    exclude_markers = _check_exlude_markers(data, exclude_markers)
    is_marker = data.columns.str.lower().str.contains(marker_regex) & ~np.isin(
        data.columns, exclude_markers
    )

    adata = AnnData(
        X=data.loc[:, is_marker].values.astype(np.float32),
        var=pd.DataFrame(index=data.columns[is_marker]),
        obs=data.loc[:, ~is_marker],
    )

    if "$SPILLOVER" in meta:
        df_spillover = _read_spillover_matrix(meta["$SPILLOVER"])

        # The spillover matrix is looked up with the "PnN" names of the markers
        fallback_var_names = fallback_names[is_marker].values
        is_in = np.isin(fallback_var_names, df_spillover.index)

        if is_in.all():
            adata.varp["spillover_matrix"] = df_spillover.loc[
                fallback_var_names, fallback_var_names
            ]
        else:
            # `log.warning` rather than the deprecated `log.warn` alias
            log.warning(
                f"Missing var names inside spillover matrix: {fallback_var_names[~is_in]}. The spillover matrix will be saved in adata.uns instead of adata.varp"
            )
            adata.uns["spillover_matrix"] = df_spillover

    return adata

scyan.read_csv(path, marker_regex='^cd|^hla|epcam|^ccr', exclude_markers=None, **pandas_kwargs)

Read a CSV file and return an AnnData object.

Note

It tries to infer which columns are markers by matching the lowercase column names against marker_regex (by default: names that start with CD, HLA, or CCR, or that contain EPCAM). If it didn't select the right markers, you can help it by extending marker_regex or by providing exclude_markers.

Parameters:

Name Type Description Default
path str

Path to the CSV file that has to be read.

required
marker_regex str

Regex used to find which columns correspond to markers. It is matched against the lowercase column names. By default, it targets names that start with CD, HLA, or CCR, or that contain EPCAM. You can add names to the regex by adding the lowercase marker name after a new | in the string.

'^cd|^hla|epcam|^ccr'
exclude_markers Optional[List[str]]

Optional list of channel names that have to be considered as observations (i.e., inside adata.obs), among the ones that were automatically classified as markers (i.e., inside adata.var_names).

None
**pandas_kwargs int

Optional kwargs for pandas.read_csv(...).

{}

Returns:

Type Description
AnnData

AnnData object containing the CSV data.

Source code in scyan/_io.py
def read_csv(
    path: str,
    marker_regex: str = "^cd|^hla|epcam|^ccr",
    exclude_markers: Optional[List[str]] = None,
    **pandas_kwargs: int,
) -> AnnData:
    """Read a CSV file and return an `AnnData` object.

    !!! note
        It tries to infer which columns are markers by matching the lowercase column names against `marker_regex`. If it didn't select the right markers, you can help it by extending `marker_regex` or by providing `exclude_markers`.

    Args:
        path: Path to the CSV file that has to be read.
        marker_regex: Regex used to find which columns correspond to markers. It is matched against the lowercase column names. By default, it targets names that start with `CD`, `HLA`, or `CCR`, or that contain `EPCAM`. You can add names to the regex by adding the lowercase marker name after a new `|` in the string.
        exclude_markers: Optional list of channel names that have to be considered as observations (i.e., inside `adata.obs`), among the ones that were automatically classified as markers (i.e., inside `adata.var_names`).
        **pandas_kwargs: Optional kwargs for `pandas.read_csv(...)`.

    Returns:
        `AnnData` object containing the CSV data.
    """
    df = pd.read_csv(path, **pandas_kwargs)

    exclude_markers = _check_exlude_markers(df, exclude_markers)
    is_marker = df.columns.str.lower().str.contains(marker_regex) & ~np.isin(
        df.columns, exclude_markers
    )

    # Cast the intensities explicitly (as done in `read_fcs`) instead of relying
    # on the deprecated `dtype` argument of `AnnData`.
    return AnnData(
        X=df.loc[:, is_marker].values.astype(np.float32),
        var=pd.DataFrame(index=df.columns[is_marker]),
        obs=df.loc[:, ~is_marker],
    )

scyan.write_fcs(adata, path, layer=None, columns_to_numeric=None, **fcswrite_kwargs)

Based on an AnnData object, it writes a FCS file that contains (i) all the markers intensities, (ii) every numeric column of adata.obs, and (iii) all adata.obsm variables.

Note

As the FCS format doesn't support strings, some observations will not be kept in the FCS file.

Parameters:

Name Type Description Default
adata AnnData

AnnData object to save.

required
path str

Path to write the file.

required
layer Optional[str]

Name of the adata layer from which intensities will be extracted. If None, uses adata.X.

None
columns_to_numeric Optional[List]

List of non-numerical column names from adata.obs that should be kept, by transforming them into integers. Note that you don't need to list the numerical columns, that are written inside the FCS by default.

None
**fcswrite_kwargs int

Optional kwargs provided to fcswrite.write_fcs.

{}

Returns:

Type Description
Union[None, Dict]

If columns_to_numeric is None, returns nothing. Else, return a dict whose keys are the observation column names being transformed, and the values are ordered lists of the label encoded classes. E.g., {"batch": ["b1", "b2"]} means that the batch "b1" was encoded by 0, and "b2" by 1.

Source code in scyan/_io.py
def write_fcs(
    adata: AnnData,
    path: str,
    layer: Optional[str] = None,
    columns_to_numeric: Optional[List] = None,
    **fcswrite_kwargs: int,
) -> Union[None, Dict]:
    """Based on an `AnnData` object, it writes a FCS file that contains (i) all the markers intensities, (ii) every numeric column of `adata.obs`, and (iii) all `adata.obsm` variables.

    Boolean columns are written as integers, and non-numeric columns are converted with `pandas.to_numeric` whenever possible.

    !!! note
        As the FCS format doesn't support strings, some observations will not be kept in the FCS file.

    Args:
        adata: `AnnData` object to save.
        path: Path to write the file.
        layer: Name of the `adata` layer from which intensities will be extracted. If `None`, uses `adata.X`.
        columns_to_numeric: List of **non-numerical** column names from `adata.obs` that should be kept, by transforming them into integers. Note that you don't need to list the numerical columns, that are written inside the FCS by default.
        **fcswrite_kwargs: Optional kwargs provided to `fcswrite.write_fcs`.

    Returns:
        If `columns_to_numeric` is `None`, returns nothing. Else, return a dict whose keys are the observation column names being transformed, and the values are ordered lists of the label encoded classes. E.g., `{"batch": ["b1", "b2"]}` means that the batch `"b1"` was encoded by 0, and `"b2"` by 1.
    """
    df = _to_df(adata, layer)
    dict_classes = {}
    columns_removed = []

    # Index objects are immutable, so deleting columns from `df` below does not
    # disturb the iteration over the Index captured here.
    for column in df.columns:
        if df[column].dtype == "bool":
            df[column] = df[column].astype(int).values
            continue
        if is_numeric_dtype(df[column].dtype):
            continue

        try:
            df[column] = pd.to_numeric(df[column].values)
            continue
        except (ValueError, TypeError):
            # Not convertible to numbers: either label-encode or drop the column.
            # Only conversion errors are caught, so that e.g. KeyboardInterrupt
            # is not silently swallowed.
            pass

        if columns_to_numeric is not None and column in columns_to_numeric:
            # Lazy import: scikit-learn is only needed when encoding a column
            from sklearn.preprocessing import LabelEncoder

            le = LabelEncoder()
            df[column] = le.fit_transform(df[column].values)
            dict_classes[column] = list(le.classes_)
        else:
            del df[column]
            columns_removed.append(column)

    log.info(f"Found {len(df.columns)} features: {', '.join(df.columns)}.")
    if columns_removed:
        log.warning(
            f"FCS does not support strings, so the following columns where removed: {', '.join(columns_removed)}.\nIf you want to keep these str observations, use the 'columns_to_numeric' argument to encod them."
        )

    fcswrite.write_fcs(str(path), list(df.columns), df.values, **fcswrite_kwargs)

    if columns_to_numeric is not None:
        return dict_classes
    return None

scyan.write_csv(adata, path, layer=None)

Based on an AnnData object, it writes a CSV file that contains (i) all the markers intensities, (ii) every numeric column of adata.obs, and (iii) all adata.obsm variables.

Parameters:

Name Type Description Default
adata AnnData

AnnData object to save.

required
path str

Path to write the file.

required
layer Optional[str]

Name of the adata layer from which intensities will be extracted. If None, uses adata.X.

None
Source code in scyan/_io.py
def write_csv(
    adata: AnnData,
    path: str,
    layer: Optional[str] = None,
) -> None:
    """Based on an `AnnData` object, it writes a CSV file that contains (i) all the markers intensities, (ii) every numeric column of `adata.obs`, and (iii) all `adata.obsm` variables.

    Args:
        adata: `AnnData` object to save.
        path: Path to write the file.
        layer: Name of the `adata` layer from which intensities will be extracted. If `None`, uses `adata.X`.
    """
    # The written table is exactly the DataFrame built by `_to_df`; unlike
    # `write_fcs`, no column is converted or removed here.
    # NOTE(review): the "(ii) every numeric column" wording above mirrors
    # `write_fcs` — confirm against `_to_df` whether non-numeric columns are kept.
    _to_df(adata, layer).to_csv(path)