Differential Methylated Genes - Pairwise
Contents
Differential Methylated Genes - Pairwise¶
import pandas as pd
import anndata
from ALLCools.mcds import MCDS
from ALLCools.clustering import PairwiseDMG
Parameters¶
adata_path = '../step_by_step/100kb/adata.with_coords.h5ad'
cluster_col = 'L1'
# change this to the paths to your MCDS files
obs_dim = 'cell'
var_dim = 'geneslop2k'
# DMG
mc_type = 'CHN'
top_n = 1000
adj_p_cutoff = 1e-3
delta_rate_cutoff = 0.3
auroc_cutoff = 0.9
random_state = 0
n_jobs = 30
Load¶
adata = anndata.read_h5ad(adata_path)
cell_meta = adata.obs.copy()
cell_meta.index.name = obs_dim
gene_meta = pd.read_csv(f'GeneMetadata.csv.gz', index_col=0)
gene_mcds = MCDS.open(f'geneslop2k_frac.mcds', use_obs=cell_meta.index)
gene_mcds
<xarray.MCDS> Dimensions: (cell: 16985, geneslop2k: 41871, mc_type: 2) Coordinates: * cell (cell) <U10 '10E_M_207' '10E_M_338' ... '9J_M_2969' * geneslop2k (geneslop2k) <U21 'ENSMUSG00000102693.1' ... 'ENSMUS... geneslop2k_chrom (geneslop2k) <U5 dask.array<chunksize=(41871,), meta=np.ndarray> geneslop2k_cov_mean (geneslop2k) float64 dask.array<chunksize=(41871,), meta=np.ndarray> geneslop2k_end (geneslop2k) int64 dask.array<chunksize=(41871,), meta=np.ndarray> geneslop2k_start (geneslop2k) int64 dask.array<chunksize=(41871,), meta=np.ndarray> * mc_type (mc_type) <U3 'CGN' 'CHN' strand_type <U4 'both' Data variables: geneslop2k_da_frac (cell, geneslop2k, mc_type) float32 dask.array<chunksize=(3397, 2463, 2), meta=np.ndarray> Attributes: obs_dim: cell var_dim: geneslop2k
Pairwise DMG¶
pwdmg = PairwiseDMG(max_cell_per_group=1000,
top_n=top_n,
adj_p_cutoff=adj_p_cutoff,
delta_rate_cutoff=delta_rate_cutoff,
auroc_cutoff=auroc_cutoff,
random_state=random_state,
n_jobs=n_jobs)
pwdmg.fit_predict(x=gene_mcds[f'{var_dim}_da_frac'].sel(mc_type=mc_type),
var_dim=var_dim,
groups=cell_meta[cluster_col])
Generating cluster AnnData files
Computing pairwise DMG
820 pairwise DMGs
... storing 'groups' as categorical
... storing 'groups' as categorical
pwdmg.dmg_table.to_hdf(f'{cluster_col}.PairwiseDMG.{mc_type}.hdf', key='data')
pwdmg.dmg_table.head()
Aggregating Cluster DMG¶
Weighted total AUROC aggregated from the pairwise comparisons.
Aggregate Pairwise Comparisons¶
cluster_dmgs = pwdmg.aggregate_pairwise_dmg(adata, groupby=cluster_col)
# save all the DMGs
with pd.HDFStore(f'{cluster_col}.ClusterRankedPWDMG.{mc_type}.hdf') as hdf:
for cluster, dmgs in cluster_dmgs.items():
hdf[cluster] = dmgs[dmgs > 0.0001]