Calculate Highly Variable Features And Get mC Fraction AnnData

Purpose

The purpose of this step is to select highly variable features (HVF) and generate cell-by-feature methylation fraction matrix for clustering. The highly variable features are selected by comparing feature’s normalized dispersion among cells.

Input

  • Filtered cell metadata;

  • MCDS files;

  • Feature list from basic feature filtering

Output

  • cell-by-HVF methylation fraction matrix stored in AnnData format, e.g., mCH adata and mCG adata.

Import

import pathlib
import pandas as pd
import dask
from ALLCools.mcds import MCDS

Parameters

# If True, will load all data into memory.
# Computation will be much faster, but also very memory intensive, only use this for small number of cells (<10,000)
load = True

# change this to the path to your filtered metadata
metadata_path = 'CellMetadata.PassQC.csv.gz'

# change this to the paths to your MCDS files
mcds_path = '../../../data/Brain/snmC-seq2/Liu2021Nature.mcds'

# Feature list after basic filter
feature_path = 'FeatureList.BasicFilter.txt'

# Dimension name used to do clustering
obs_dim = 'cell'  # observation
var_dim = 'chrom100k'  # feature

# HVF method:
# SVR: regression based
# Bins: normalize dispersion per bin
hvf_method = 'SVR'
mch_pattern = 'CHN'
mcg_pattern = 'CGN'
n_top_feature = 20000

# Downsample cells
downsample = 20000

Load Data

Metadata

metadata = pd.read_csv(metadata_path, index_col=0)
total_cells = metadata.shape[0]
print(f'Metadata of {total_cells} cells')
Metadata of 16985 cells
metadata.head()
AllcPath mCCCFrac mCGFrac mCGFracAdj mCHFrac mCHFracAdj FinalReads InputReads MappedReads DissectionRegion BamFilteringRate MappingRate Plate Col384 Row384 FANSDate Slice Sample
10E_M_0 /gale/raidix/rdx-4/mapping/10E/CEMBA190625-10E... 0.008198 0.822633 0.821166 0.041640 0.033718 1626504.0 4407752 2892347.0 10E 0.562347 0.656195 CEMBA190625-10E-1 0 0 190625 10 10E_190625
10E_M_1 /gale/raidix/rdx-4/mapping/10E/CEMBA190625-10E... 0.006019 0.743035 0.741479 0.024127 0.018218 2009998.0 5524084 3657352.0 10E 0.549577 0.662074 CEMBA190625-10E-1 0 1 190625 10 10E_190625
10E_M_10 /gale/raidix/rdx-4/mapping/10E/CEMBA190625-10E... 0.006569 0.750172 0.748520 0.027665 0.021235 1383636.0 3455260 2172987.0 10E 0.636744 0.628892 CEMBA190625-10E-1 19 0 190625 10 10E_190625
10E_M_101 /gale/raidix/rdx-4/mapping/10E/CEMBA190625-10E... 0.006353 0.760898 0.759369 0.026547 0.020323 2474670.0 7245482 4778768.0 10E 0.517847 0.659551 CEMBA190625-10E-1 18 3 190625 10 10E_190625
10E_M_102 /gale/raidix/rdx-4/mapping/10E/CEMBA190625-10E... 0.005409 0.752980 0.751637 0.019497 0.014164 2430290.0 7004754 4609570.0 10E 0.527227 0.658063 CEMBA190625-10E-1 19 2 190625 10 10E_190625
use_features = pd.read_csv(feature_path, header=None, index_col=0).index
use_features.name = var_dim

MCDS

total_mcds = MCDS.open(mcds_path,
                       var_dim=var_dim,
                       use_obs=metadata.index).sel({var_dim: use_features})

Add mC Rate

total_mcds.add_mc_rate(var_dim=var_dim,
                       normalize_per_cell=True,
                       clip_norm_value=10)

total_mcds
<xarray.MCDS>
Dimensions:              (cell: 16985, chrom100k: 24045, count_type: 2, mc_type: 2)
Coordinates:
  * cell                 (cell) <U10 '10E_M_207' '10E_M_338' ... '9J_M_2969'
  * chrom100k            (chrom100k) int64 30 31 32 33 ... 26335 26336 26337
    chrom100k_bin_end    (chrom100k) int64 dask.array<chunksize=(24045,), meta=np.ndarray>
    chrom100k_bin_start  (chrom100k) int64 dask.array<chunksize=(24045,), meta=np.ndarray>
    chrom100k_chrom      (chrom100k) <U5 dask.array<chunksize=(24045,), meta=np.ndarray>
  * count_type           (count_type) <U3 'mc' 'cov'
  * mc_type              (mc_type) <U3 'CGN' 'CHN'
    strand_type          <U4 'both'
Data variables:
    chrom100k_da         (cell, chrom100k, mc_type, count_type) uint16 dask.array<chunksize=(3397, 2364, 2, 2), meta=np.ndarray>
    chrom100k_da_frac    (cell, chrom100k, mc_type) float64 dask.array<chunksize=(3397, 2364, 2), meta=np.ndarray>
Attributes:
    obs_dim:  cell
    var_dim:  chrom100k

If downsample

if downsample and total_cells > downsample:
    # make a downsampled mcds
    print(f'Downsample cells to {downsample} to calculate HVF.')
    downsample_cell_ids = metadata.sample(downsample, random_state=0).index
    mcds = total_mcds.sel(
        {obs_dim: total_mcds.get_index(obs_dim).isin(downsample_cell_ids)})
else:
    mcds = total_mcds
if load and (mcds.get_index('cell').size <= 20000):
    # load the relevant data so the computation can be fater, watch out memory!
    mcds[f'{var_dim}_da_frac'].load()
/home/hanliu/miniconda3/envs/allcools_new/lib/python3.8/site-packages/dask/core.py:119: RuntimeWarning: invalid value encountered in true_divide
  return func(*(_execute_task(a, cache) for a in args))

The RuntimeWarning is expected (due to cov == 0). You can ignore it.

Highly Variable Feature

mCH

if hvf_method == 'SVR':
    # use SVR based method
    mch_hvf = mcds.calculate_hvf_svr(var_dim=var_dim,
                                     mc_type=mch_pattern,
                                     n_top_feature=n_top_feature,
                                     plot=True)
else:
    # use bin based method
    mch_hvf = mcds.calculate_hvf(var_dim=var_dim,
                                 mc_type=mch_pattern,
                                 min_mean=0,
                                 max_mean=5,
                                 n_top_feature=n_top_feature,
                                 bin_min_features=5,
                                 mean_binsize=0.05,
                                 cov_binsize=100)
Fitting SVR with gamma 0.0416, predicting feature dispersion using mc_frac_mean and cov_mean.
Total Feature Number:     24045
Highly Variable Feature:  20000 (83.2%)

Save AnnData

total_mcds.coords[f'{var_dim}_{mch_pattern}_feature_select'] = mcds.coords[
    f'{var_dim}_{mch_pattern}_feature_select']
mch_adata = total_mcds.get_adata(mc_type=mch_pattern,
                           var_dim=var_dim,
                           select_hvf=True)

mch_adata.write_h5ad(f'mCH.HVF.h5ad')

mch_adata
/home/hanliu/miniconda3/envs/allcools_new/lib/python3.8/site-packages/anndata/_core/anndata.py:1228: FutureWarning:

The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Reordering categories will always return a new Categorical object.

... storing 'chrom' as categorical
AnnData object with n_obs × n_vars = 16985 × 20000
    var: 'bin_end', 'bin_start', 'chrom', 'CHN_mean', 'CHN_dispersion', 'CHN_cov', 'CHN_score', 'CHN_feature_select'

mCG

if hvf_method == 'SVR':
    # use SVR based method
    mcg_hvf = mcds.calculate_hvf_svr(var_dim=var_dim,
                                     mc_type=mcg_pattern,
                                     n_top_feature=n_top_feature,
                                     plot=True)
else:
    # use bin based method
    mcg_hvf = mcds.calculate_hvf(var_dim=var_dim,
                                 mc_type=mcg_pattern,
                                 min_mean=0,
                                 max_mean=5,
                                 n_top_feature=n_top_feature,
                                 bin_min_features=5,
                                 mean_binsize=0.02,
                                 cov_binsize=20)
Fitting SVR with gamma 0.0416, predicting feature dispersion using mc_frac_mean and cov_mean.
Total Feature Number:     24045
Highly Variable Feature:  20000 (83.2%)

Save AnnData

total_mcds.coords[f'{var_dim}_{mch_pattern}_feature_select'] = mcds.coords[
    f'{var_dim}_{mch_pattern}_feature_select']
mcg_adata = total_mcds.get_adata(mc_type=mcg_pattern,
                                 var_dim=var_dim,
                                 select_hvf=True)

mcg_adata.write_h5ad(f'mCG.HVF.h5ad')

mcg_adata
/home/hanliu/miniconda3/envs/allcools_new/lib/python3.8/site-packages/anndata/_core/anndata.py:1228: FutureWarning:

The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Reordering categories will always return a new Categorical object.

... storing 'chrom' as categorical
AnnData object with n_obs × n_vars = 16985 × 20000
    var: 'bin_end', 'bin_start', 'chrom', 'CHN_mean', 'CHN_dispersion', 'CHN_cov', 'CHN_score', 'CHN_feature_select', 'CGN_mean', 'CGN_dispersion', 'CGN_cov', 'CGN_score', 'CGN_feature_select'