# Differentially Methylated Genes - Pairwise

In [1]:
import pandas as pd
import anndata
from ALLCools.mcds import MCDS
from ALLCools.clustering import PairwiseDMG

## Parameters

In [2]:
# --- Inputs ---
# AnnData file holding the cells (and their cluster labels in .obs).
adata_path = '../step_by_step/100kb/adata.with_coords.h5ad'
# Column of adata.obs whose values define the groups to compare.
cluster_col = 'L1'

# --- MCDS dimension names ---
# obs_dim becomes the name of the cell metadata index;
# var_dim is passed to the DMG call and is also the prefix of the
# '<var_dim>_da_frac' data variable that is read from the MCDS.
obs_dim = 'cell'
var_dim = 'geneslop2k'

# --- Pairwise DMG settings (forwarded to PairwiseDMG) ---
# Value selected along the MCDS 'mc_type' coordinate.
mc_type = 'CHN'
# NOTE(review): the cutoffs below are handed to PairwiseDMG unchanged;
# see the ALLCools documentation for their exact meaning.
top_n = 1000
adj_p_cutoff = 1e-3
delta_rate_cutoff = 0.3
auroc_cutoff = 0.9
# Seed and worker count for the DMG computation.
random_state = 0
n_jobs = 30

## Load

In [4]:
# Load the cell-level AnnData; its .obs table supplies the cluster labels.
adata = anndata.read_h5ad(adata_path)

# Take a copy of the cell metadata and name its index after the MCDS
# observation dimension, leaving adata.obs itself unmodified.
cell_meta = adata.obs.copy()
cell_meta.index.name = obs_dim

# Gene metadata table, indexed by its first column.
# NOTE(review): gene_meta is not referenced by the cells visible here —
# presumably it is used for gene annotation further on; confirm.
gene_meta = pd.read_csv('GeneMetadata.csv.gz', index_col=0)

# Open the gene-level MCDS, passing the cell IDs from cell_meta as use_obs.
# Plain string literals are used because these paths contain no placeholders.
gene_mcds = MCDS.open('geneslop2k_frac.mcds', use_obs=cell_meta.index)
gene_mcds

Unnamed: 0,Array,Chunk
Bytes,817.79 kiB,817.79 kiB
Shape,"(41871,)","(41871,)"
Count,2 Tasks,1 Chunks
Type,numpy.ndarray,
"Array Chunk Bytes 817.79 kiB 817.79 kiB Shape (41871,) (41871,) Count 2 Tasks 1 Chunks Type numpy.ndarray",41871  1,

Unnamed: 0,Array,Chunk
Bytes,817.79 kiB,817.79 kiB
Shape,"(41871,)","(41871,)"
Count,2 Tasks,1 Chunks
Type,numpy.ndarray,

Unnamed: 0,Array,Chunk
Bytes,327.12 kiB,327.12 kiB
Shape,"(41871,)","(41871,)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 327.12 kiB 327.12 kiB Shape (41871,) (41871,) Count 2 Tasks 1 Chunks Type float64 numpy.ndarray",41871  1,

Unnamed: 0,Array,Chunk
Bytes,327.12 kiB,327.12 kiB
Shape,"(41871,)","(41871,)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,327.12 kiB,327.12 kiB
Shape,"(41871,)","(41871,)"
Count,2 Tasks,1 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 327.12 kiB 327.12 kiB Shape (41871,) (41871,) Count 2 Tasks 1 Chunks Type int64 numpy.ndarray",41871  1,

Unnamed: 0,Array,Chunk
Bytes,327.12 kiB,327.12 kiB
Shape,"(41871,)","(41871,)"
Count,2 Tasks,1 Chunks
Type,int64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,327.12 kiB,327.12 kiB
Shape,"(41871,)","(41871,)"
Count,2 Tasks,1 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 327.12 kiB 327.12 kiB Shape (41871,) (41871,) Count 2 Tasks 1 Chunks Type int64 numpy.ndarray",41871  1,

Unnamed: 0,Array,Chunk
Bytes,327.12 kiB,327.12 kiB
Shape,"(41871,)","(41871,)"
Count,2 Tasks,1 Chunks
Type,int64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,5.30 GiB,63.83 MiB
Shape,"(16985, 41871, 2)","(3397, 2463, 2)"
Count,86 Tasks,85 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 5.30 GiB 63.83 MiB Shape (16985, 41871, 2) (3397, 2463, 2) Count 86 Tasks 85 Chunks Type float32 numpy.ndarray",2  41871  16985,

Unnamed: 0,Array,Chunk
Bytes,5.30 GiB,63.83 MiB
Shape,"(16985, 41871, 2)","(3397, 2463, 2)"
Count,86 Tasks,85 Chunks
Type,float32,numpy.ndarray


## Pairwise DMG

In [5]:
# Gather the DMG settings from the parameter cell in one place, then
# build the pairwise DMG caller from them.
pairwise_dmg_kwargs = dict(
    max_cell_per_group=1000,
    top_n=top_n,
    adj_p_cutoff=adj_p_cutoff,
    delta_rate_cutoff=delta_rate_cutoff,
    auroc_cutoff=auroc_cutoff,
    random_state=random_state,
    n_jobs=n_jobs,
)
pwdmg = PairwiseDMG(**pairwise_dmg_kwargs)

In [8]:
# Select the '<var_dim>_da_frac' data variable and keep only the requested
# mc_type; the per-cell group labels come from the chosen cluster column.
gene_frac = gene_mcds[f'{var_dim}_da_frac'].sel(mc_type=mc_type)
cluster_labels = cell_meta[cluster_col]

pwdmg.fit_predict(x=gene_frac, var_dim=var_dim, groups=cluster_labels)

Generating cluster AnnData files
Computing pairwise DMG
820 pairwise DMGs


... storing 'groups' as categorical
... storing 'groups' as categorical


In [9]:
# Write the full pairwise DMG table to HDF5 under the key 'data',
# then preview its first rows.
pairwise_dmg_path = f'{cluster_col}.PairwiseDMG.{mc_type}.hdf'
pwdmg.dmg_table.to_hdf(pairwise_dmg_path, key='data')
pwdmg.dmg_table.head()

## Aggregating Cluster DMG

Each cluster's DMGs are scored by a weighted total AUROC aggregated from the pairwise comparisons.

### Aggregate Pairwise Comparisons

In [10]:
# Aggregate the pairwise results per group of `cluster_col`; the result is
# iterated below as (cluster, DMG values) pairs.
cluster_dmgs = pwdmg.aggregate_pairwise_dmg(
    adata,
    groupby=cluster_col,
)

In [12]:
# Store every cluster's DMGs in one HDF5 file, one key per cluster,
# keeping only the entries whose value is greater than 0.0001.
cluster_dmg_path = f'{cluster_col}.ClusterRankedPWDMG.{mc_type}.hdf'
with pd.HDFStore(cluster_dmg_path) as store:
    for cluster_name, ranked in cluster_dmgs.items():
        store.put(cluster_name, ranked[ranked > 0.0001])
