Source code for ALLCools.clustering.feature_selection.feature_enrichment

import anndata
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from statsmodels.stats.multitest import multipletests


[docs]def _calculate_enrichment_score(raw_adata, labels): """ Enrichment score modified from :cite:p:`Zeisel2018` for normalized methylation fractions Assuming the methylation value is posterior frac calculated by MCDS.add_mc_frac) """ n_cells = raw_adata.shape[0] total_hypo_n = pd.Series(np.ravel((raw_adata.X < 1).sum(axis=0)).copy(), index=raw_adata.var_names) total_sum = pd.Series(np.ravel(raw_adata.X.sum(axis=0).copy()), index=raw_adata.var_names) enrichment = [] label_orders = [] for label, sub_df in raw_adata.obs.groupby(labels): in_cells = sub_df.shape[0] out_cells = n_cells - in_cells if in_cells < 2: continue label_orders.append(label) # hypo-methylated cells in the cluster in_hypo_n = pd.Series(np.ravel( (raw_adata[sub_df.index, :].X < 1).sum(axis=0)).copy(), index=raw_adata.var_names) in_hypo_frac = in_hypo_n / in_cells # hypo-methylated cells not in the cluster out_hypo_n = total_hypo_n - in_hypo_n out_hypo_frac = out_hypo_n / out_cells # mean of cells in the cluster in_mean = pd.Series(np.ravel( raw_adata[sub_df.index, :].X.mean(axis=0)).copy(), index=raw_adata.var_names) # mean of cells not in the cluster out_mean = (total_sum - in_mean * in_cells) / out_cells # finally, the enrichment score # In Zeisel et al., the frac and mean change on the same direction, # but in the mc frac case, larger fraction means smaller mean # so the fraction part is reversed e = ((in_hypo_frac + 0.1) / (out_hypo_frac + 0.1) * (out_mean + 0.01) / (in_mean + 0.01)) enrichment.append(e) # enrichment > 1, the gene is hypo-methylated in that cluster, the larger the better enrichment = pd.DataFrame(enrichment, index=label_orders).loc[:, raw_adata.var_names].copy() return enrichment
[docs]def _calculate_enrichment_score_cytograph(adata, labels): """ The original CEF algorithm from :cite:p:`Zeisel2018` for count based data (RNA, ATAC) """ n_cells = adata.shape[0] clusters = [] nnz = [] nnz_other = [] means = [] means_other = [] sizes = [] # Number of cells per cluster total_nnz = np.ravel((adata.X > 0).sum(axis=0)) total_sum = np.ravel(adata.X.sum(axis=0)) total_cells = adata.shape[0] for cluster, sub_df in adata.obs.groupby(labels): clusters.append(cluster) this_cells = sub_df.shape[0] sizes.append(this_cells) is_in_cluster = adata.obs_names.isin(sub_df.index) # Number of nonzero values per cluster and other cells this_nnz = np.ravel((adata[is_in_cluster, :].X > 0).sum(axis=0)) nnz.append(this_nnz) other_nnz = total_nnz - this_nnz nnz_other.append(other_nnz) # Mean value per cluster this_mean = np.ravel(adata[is_in_cluster, :].X.mean(axis=0)) means.append(this_mean) other_mean = (total_sum - this_mean * this_cells) / \ (total_cells - this_cells) means_other.append(other_mean) nnz = np.vstack(nnz).astype(np.float64) nnz_other = np.vstack(nnz_other).astype(np.float64) means = np.vstack(means) means_other = np.vstack(means_other) sizes = np.array(sizes) # turn nnz count into fractions _sizes = sizes[:, None] _sizes_other = n_cells - _sizes nnz /= _sizes nnz_other /= _sizes_other enrichment = ((nnz + 0.1) / (nnz_other + 0.1) * (means + 0.01) / (means_other + 0.01)) enrichment = pd.DataFrame(enrichment, index=clusters, columns=adata.var_names) return enrichment
[docs]def _plot_enrichment_result(qvals, enrichment, null_enrichment, alpha): """Make some plots for the p-value, q-value and # of CEFs distribution""" fig, (f_ax, q_ax, e_ax) = plt.subplots(ncols=3, figsize=(15, 5)) # plot feature with q_value pass cutoff ax = f_ax sns.histplot((qvals < alpha).sum(axis=0), ax=ax) ax.set_xlabel("# of clusters") ax.set_title(f"Features enriched in # of clusters (q<{alpha})") sns.despine(ax=ax) # plot q_value ax = q_ax sns.histplot(np.ravel(qvals), ax=ax, bins=50) ax.set(yticks=[], ylabel="", xlabel="q value", title="q value distribution") sns.despine(ax=ax, left=True) # plot enrichment distribution ax = e_ax plot_data = pd.DataFrame({ "enrichment": enrichment.unstack().reset_index(drop=True), "null": null_enrichment.unstack().reset_index(drop=True), }) plot_data = plot_data.unstack().reset_index() plot_data.columns = ["Type", "_", "Enrichment Score"] sns.histplot( data=plot_data, x="Enrichment Score", hue="Type", ax=ax, bins=100, binrange=enrichment.unstack().dropna().quantile((0.0, 0.95)).tolist(), ) ax.set(yticks=[], ylabel="", title="Enrichment score distribution") sns.despine(ax=ax, left=True) fig.tight_layout() return
[docs]def _aggregate_enrichment(adata, enrichment, top_n, alpha, qvals, cluster_col): """Aggregate enrichment results, calculate q values""" # calculate use_features without qvals use_features_no_q = [] for _, row in enrichment.iterrows(): use_features_no_q.append( row.sort_values(ascending=False)[:top_n].dropna()) use_features_no_q = pd.concat(use_features_no_q).index.unique() # mask non-sig enrichment scores, calculate again enrichment[qvals > alpha] = np.nan use_features = [] for _, row in enrichment.iterrows(): use_features.append(row.sort_values(ascending=False)[:top_n].dropna()) use_features = pd.concat(use_features).index.unique() print(f"Selected {use_features.size} unique features") if len(use_features) == 0: print(f"No features found significantly enriched, " f"use top enriched features that did not pass q<{alpha}") use_features = use_features_no_q adata.uns[f"{cluster_col}_feature_enrichment"] = { "qvals": np.array(qvals).T, "enrichment": np.array(enrichment).T, "cluster_order": enrichment.index.tolist(), } return use_features
[docs]def cluster_enriched_features(adata: anndata.AnnData, cluster_col: str, top_n=200, alpha=0.05, stat_plot=True, method="mc"): """ Calculate top Cluster Enriched Features (CEF) from per-cell normalized dataset. An post-clustering feature selection step adapted from :cite:p:`Zeisel2018,La_Manno2021` and their great [cytograph2](https://github.com/linnarsson-lab/cytograph2) package. For details about CEF calculation, read the methods of :cite:p:`Zeisel2018`. Note that in original paper, they look for cluster-specific highly expressed genes as CEFs; for methylation, we are looking for hypo-methylation as CEFs, so the score and test is reversed. Parameters ---------- adata adata containing per-cell normalized values. For methylation fraction, the value need to be 1-centered (1 means cell's average methylation), like those produced by :func:`ALLCools.mcds.mcds.MCDS.add_mc_frac` with `normalize_per_cell=True`. For RNA and ATAC, you can use per cell normalized counts. Do not log transform the data before running this function cluster_col The name of categorical variable in adata.obs top_n Select top N CEFs for each cluster alpha FDR corrected q-value cutoff stat_plot Whether making some summary plots for the CEF calculation method "mc" for methylation CEF (look for hypo-methylation), "rna" and "atac" for the RNA and ATAC or any count based data (use the original cytograph algorithm, look for higher value) Returns ------- Modify adata inplace, adding a dictionary in adata.uns called f"{cluster_col}_feature_enrichment" The dictionary contains "qvals" (np.ndarray cluster-by-feature enrichment score q-value) and "cluster_order" (cluster order of the "qvals") """ n_labels = adata.obs[cluster_col].unique().size print(f"Found {n_labels} clusters to compute feature enrichment score") if method == "mc": use_func = _calculate_enrichment_score elif method == "rna": use_func = _calculate_enrichment_score_cytograph elif method == "atac": use_func = _calculate_enrichment_score_cytograph else: use_func = _calculate_enrichment_score_cytograph print( f'method needs to be in ["mc", "rna", "atac"], got {method}. Using the algorithm for RNA.' ) if n_labels == 1: print("No clusters detected, returning all features") use_features = adata.var_names.copy() else: print("Computing enrichment score") enrichment = use_func(adata, adata.obs[cluster_col]) # shuffle cluster label to calculate null dist null_label = adata.obs[cluster_col].sample(n=adata.shape[0], replace=False) null_label.index = adata.obs_names null_enrichment = use_func(adata, null_label) print("Computing enrichment score FDR-corrected P values") # FDR correction per row, row is cluster qvals = np.zeros_like(enrichment) for row in range(enrichment.shape[0]): null_values = null_enrichment.iloc[row, :].sort_values() values = enrichment.iloc[row, :] pvals = 1 - np.searchsorted(null_values, values) / values.shape[0] (_, q, _, _) = multipletests(pvals, alpha, method="fdr_bh") qvals[row, :] = q qvals = pd.DataFrame(qvals, index=enrichment.index, columns=enrichment.columns) if stat_plot: _plot_enrichment_result( qvals=qvals, enrichment=enrichment, null_enrichment=null_enrichment, alpha=alpha, ) use_features = _aggregate_enrichment( adata=adata, enrichment=enrichment, top_n=top_n, alpha=alpha, qvals=qvals, cluster_col=cluster_col, ) # save the calculated results in the input adata adata.var[f"{cluster_col}_enriched_features"] = adata.var_names.isin( use_features) return