import numpy as np
from sklearn.decomposition import TruncatedSVD
import pandas as pd
import warnings
from pybedtools import BedTool
def remove_black_list_region(adata, black_list_path, f=0.2):
    """
    Remove regions that overlap (bedtools intersect -f {f}) with regions in the black list file.

    Parameters
    ----------
    adata
        AnnData object whose var contains "chrom", "start", "end" columns describing each region.
    black_list_path
        Path to the black list bed file.
    f
        Fraction of overlap when calling bedtools intersect.

    Returns
    -------
    None
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        feature_bed_df = adata.var[["chrom", "start", "end"]]
        feature_bed = BedTool.from_dataframe(feature_bed_df)
        black_list_bed = BedTool(black_list_path)
        black_feature = feature_bed.intersect(black_list_bed, f=f, wa=True)
        try:
            black_feature_index = (
                black_feature.to_dataframe().set_index(["chrom", "start", "end"]).index
            )
            black_feature_id = pd.Index(
                feature_bed_df.reset_index()
                .set_index(["chrom", "start", "end"])
                .loc[black_feature_index][feature_bed_df.index.name]
            )
            print(
                f"{black_feature_id.size} features removed due to overlapping"
                f" (bedtools intersect -f {f}) with black list regions."
            )
            adata._inplace_subset_var(~adata.var_names.isin(black_feature_id))
        except pd.errors.EmptyDataError:
            # nothing overlaps with the black list
            pass
    return
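
# A minimal usage sketch for remove_black_list_region (illustration only):
# the file names "peaks.h5ad" and "hg38-blacklist.v2.bed" are hypothetical,
# and adata.var is assumed to carry "chrom", "start", "end" columns.
def _example_remove_black_list():
    import anndata

    adata = anndata.read_h5ad("peaks.h5ad")  # hypothetical cell-by-region AnnData
    remove_black_list_region(adata, black_list_path="hg38-blacklist.v2.bed", f=0.2)
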
def binarize_matrix(adata, cutoff=0.95):
    """
    Binarize adata.X with adata.X > cutoff.

    Parameters
    ----------
    adata
        AnnData object whose X is a survival function matrix.
    cutoff
        Cutoff used to binarize the survival function.

    Returns
    -------
    None
    """
    adata.X = (adata.X > cutoff).astype(np.int8)
    return
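
# A tiny sketch of binarize_matrix on toy data; the matrix values below are
# assumptions chosen to show the effect of the default 0.95 cutoff.
def _example_binarize():
    import anndata
    from scipy import sparse

    adata = anndata.AnnData(X=sparse.csr_matrix(np.array([[0.99, 0.10], [0.50, 0.97]])))
    binarize_matrix(adata, cutoff=0.95)
    # adata.X is now int8 with ones where the survival function exceeded
    # the cutoff: [[1, 0], [0, 1]]
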
def filter_regions(adata, hypo_percent=0.5):
    """
    Filter regions based on the percentage of cells having non-zero scores.

    Parameters
    ----------
    adata
        AnnData object whose X is a binarized matrix.
    hypo_percent
        Minimum percentage of cells that must have a non-zero score in a region to keep it.

    Returns
    -------
    None
    """
    n_cell = int(adata.shape[0] * hypo_percent / 100)
    hypo_judge = (adata.X > 0).sum(axis=0).A1 > n_cell
    adata._inplace_subset_var(hypo_judge)
    print(f"{hypo_judge.sum()} regions remained, with # of non-zero cells > {n_cell}.")
    return
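
# A usage sketch for filter_regions on random toy data; the matrix shape,
# density, and seed below are assumptions for illustration.
def _example_filter_regions():
    import anndata
    from scipy import sparse

    x = sparse.random(1000, 50, density=0.05, format="csr", random_state=0)
    adata = anndata.AnnData(X=(x > 0).astype(np.int8))
    # keep regions that are non-zero in more than 0.5% of the 1000 cells,
    # i.e. in more than 5 cells
    filter_regions(adata, hypo_percent=0.5)
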
def tf_idf(data, scale_factor):
    """TF-IDF transform a sparse cell-by-region matrix."""
    col_sum = data.sum(axis=0).A1
    row_sum = data.sum(axis=1).A1
    # inverse document frequency: regions covered in fewer cells get larger weights
    idf = np.log(1 + data.shape[0] / col_sum)
    tf = data
    # term frequency: divide each non-zero entry by its row (cell) total;
    # repeat each row sum once per non-zero in that row to align with tf.data
    tf.data = tf.data / np.repeat(row_sum, tf.getnnz(axis=1))
    tf.data = np.log(tf.data * scale_factor + 1)
    tf = tf.multiply(idf)
    return tf
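
# A worked sketch of tf_idf on a tiny binary matrix (values assumed). Each
# non-zero entry becomes log(tf * scale_factor + 1) * log(1 + n_cells / col_sum).
def _example_tf_idf():
    from scipy import sparse

    data = sparse.csr_matrix(np.array([[1, 1, 0], [1, 0, 1]], dtype=np.int8))
    tf = tf_idf(data, scale_factor=100000)
    # col_sum = [2, 1, 1] over 2 cells, so idf = [log(2), log(3), log(3)];
    # every non-zero term frequency here is 1/2 before scaling
    print(tf.toarray())
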
def lsi(
    adata,
    scale_factor=100000,
    n_components=100,
    algorithm="arpack",
    obsm="X_pca",
    random_state=0,
    fit_size=None,
):
    """
    Run TF-IDF on the binarized adata.X, followed by TruncatedSVD, then scale
    the components by svd.singular_values_.

    Parameters
    ----------
    adata
        AnnData object whose X is a binarized matrix.
    scale_factor
        Scale factor applied to the term frequency before the log transform.
    n_components
        Number of components to compute; capped at min(n_rows, n_cols).
    algorithm
        SVD solver passed to TruncatedSVD.
    obsm
        Key in adata.obsm where the reduced matrix is stored.
    random_state
        Random seed used by TruncatedSVD and by the downsampling step.
    fit_size
        Ratio (if < 1) or absolute int value, used to downsample cells when
        fitting the SVD to speed up run time.

    Returns
    -------
    The fitted sklearn TruncatedSVD object.
    """
    # tf-idf
    data = adata.X.astype(np.int8).copy()
    tf = tf_idf(data, scale_factor)
    n_rows, n_cols = tf.shape
    n_components = min(n_rows, n_cols, n_components)
    svd = TruncatedSVD(
        n_components=n_components, algorithm=algorithm, random_state=random_state
    )
    if fit_size is None:
        # fit the SVD using all rows
        matrix_reduce = svd.fit_transform(tf)
    elif fit_size >= n_rows:
        # fit size is larger than the actual data size
        matrix_reduce = svd.fit_transform(tf)
    else:
        # fit the SVD on a random subset of rows to speed things up
        if fit_size < 1:
            # interpret fit_size as a ratio of total rows
            fit_size = max(int(n_rows * fit_size), n_components)
        use_cells = (
            pd.Series(range(n_rows))
            .sample(fit_size, random_state=random_state)
            .sort_index()
            .tolist()
        )
        svd.fit(tf.tocsr()[use_cells, :])
        matrix_reduce = svd.transform(tf)
    matrix_reduce = matrix_reduce / svd.singular_values_
    # "X_pca" is the default key many downstream scanpy steps look for, so we
    # use that name here for convenience; however, this is not PCA.
    adata.obsm[obsm] = matrix_reduce
    return svd
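
# An end-to-end sketch of the intended order of operations. The input file
# name, blacklist path, and parameter values are assumptions, and the scanpy
# calls at the end are just one common way to consume the result.
def _example_lsi_pipeline():
    import anndata
    import scanpy as sc

    adata = anndata.read_h5ad("cell_by_region.h5ad")  # hypothetical input
    binarize_matrix(adata, cutoff=0.95)
    filter_regions(adata, hypo_percent=0.5)
    remove_black_list_region(adata, black_list_path="hg38-blacklist.v2.bed", f=0.2)
    svd = lsi(adata, n_components=100, obsm="X_pca")
    # downstream scanpy steps read adata.obsm["X_pca"] by default
    sc.pp.neighbors(adata, use_rep="X_pca")
    sc.tl.leiden(adata)
    return svd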