Differential Methylated Genes - Pairwise

import pandas as pd
import anndata
from ALLCools.mcds import MCDS
from ALLCools.clustering import PairwiseDMG

Parameters

adata_path = '../step_by_step/100kb/adata.with_coords.h5ad'
cluster_col = 'L1'

# change this to the paths to your MCDS files
obs_dim = 'cell'
var_dim = 'geneslop2k'

# DMG
mc_type = 'CHN'
top_n = 1000
adj_p_cutoff = 1e-3
delta_rate_cutoff = 0.3
auroc_cutoff = 0.9
random_state = 0
n_jobs = 30

Load

adata = anndata.read_h5ad(adata_path)

cell_meta = adata.obs.copy()
cell_meta.index.name = obs_dim

gene_meta = pd.read_csv(f'GeneMetadata.csv.gz', index_col=0)

gene_mcds = MCDS.open(f'geneslop2k_frac.mcds', use_obs=cell_meta.index)
gene_mcds
<xarray.MCDS>
Dimensions:              (cell: 16985, geneslop2k: 41871, mc_type: 2)
Coordinates:
  * cell                 (cell) <U10 '10E_M_207' '10E_M_338' ... '9J_M_2969'
  * geneslop2k           (geneslop2k) <U21 'ENSMUSG00000102693.1' ... 'ENSMUS...
    geneslop2k_chrom     (geneslop2k) <U5 dask.array<chunksize=(41871,), meta=np.ndarray>
    geneslop2k_cov_mean  (geneslop2k) float64 dask.array<chunksize=(41871,), meta=np.ndarray>
    geneslop2k_end       (geneslop2k) int64 dask.array<chunksize=(41871,), meta=np.ndarray>
    geneslop2k_start     (geneslop2k) int64 dask.array<chunksize=(41871,), meta=np.ndarray>
  * mc_type              (mc_type) <U3 'CGN' 'CHN'
    strand_type          <U4 'both'
Data variables:
    geneslop2k_da_frac   (cell, geneslop2k, mc_type) float32 dask.array<chunksize=(3397, 2463, 2), meta=np.ndarray>
Attributes:
    obs_dim:  cell
    var_dim:  geneslop2k

Pairwise DMG

pwdmg = PairwiseDMG(max_cell_per_group=1000,
                    top_n=top_n,
                    adj_p_cutoff=adj_p_cutoff,
                    delta_rate_cutoff=delta_rate_cutoff,
                    auroc_cutoff=auroc_cutoff,
                    random_state=random_state,
                    n_jobs=n_jobs)
pwdmg.fit_predict(x=gene_mcds[f'{var_dim}_da_frac'].sel(mc_type=mc_type), 
                  var_dim=var_dim,
                  groups=cell_meta[cluster_col])
Generating cluster AnnData files
Computing pairwise DMG
820 pairwise DMGs
... storing 'groups' as categorical
... storing 'groups' as categorical
pwdmg.dmg_table.to_hdf(f'{cluster_col}.PairwiseDMG.{mc_type}.hdf', key='data')
pwdmg.dmg_table.head()

Aggregating Cluster DMG

Weighted total AUROC aggregated from the pairwise comparisons.

Aggregate Pairwise Comparisons

cluster_dmgs = pwdmg.aggregate_pairwise_dmg(adata, groupby=cluster_col)
# save all the DMGs
with pd.HDFStore(f'{cluster_col}.ClusterRankedPWDMG.{mc_type}.hdf') as hdf:
    for cluster, dmgs in cluster_dmgs.items():
        hdf[cluster] = dmgs[dmgs > 0.0001]