Calculate Highly Variable Features And Get mC Fraction AnnData¶
Purpose¶
The purpose of this step is to select highly variable features (HVF) and generate a cell-by-feature methylation fraction matrix for clustering. Highly variable features are selected by comparing each feature's normalized dispersion across cells.
Input¶
Filtered cell metadata;
MCDS files;
Feature list from basic feature filtering
Output¶
Cell-by-HVF methylation fraction matrices stored in AnnData format, e.g., the mCH adata and the mCG adata.
Import¶
import pathlib
import pandas as pd
import dask
from ALLCools.mcds import MCDS
Parameters¶
# If True, load all data into memory.
# Computation will be much faster but also very memory intensive; only use this for a small number of cells (<10,000).
load = True
# change this to the path to your filtered metadata
metadata_path = 'CellMetadata.PassQC.csv.gz'
# change this to the paths to your MCDS files
mcds_path = '../../../data/Brain/snmC-seq2/Liu2021Nature.mcds'
# Feature list after basic filter
feature_path = 'FeatureList.BasicFilter.txt'
# Dimension name used to do clustering
obs_dim = 'cell' # observation
var_dim = 'chrom100k' # feature
# HVF method:
# SVR: regression based
# Bins: normalize dispersion per bin
hvf_method = 'SVR'
mch_pattern = 'CHN'
mcg_pattern = 'CGN'
n_top_feature = 20000
# Downsample cells
downsample = 20000
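Before running the rest of the notebook, it can help to confirm that the input paths set above are reachable. The check below is optional and not part of the original workflow; adjust it if you pass a glob pattern or a list of MCDS paths.
# Optional sanity check: make sure the inputs defined above exist.
for name, path in [('metadata', metadata_path),
                   ('MCDS', mcds_path),
                   ('feature list', feature_path)]:
    if not pathlib.Path(path).exists():
        raise FileNotFoundError(f'{name} not found at {path}')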
Load Data¶
Metadata¶
metadata = pd.read_csv(metadata_path, index_col=0)
total_cells = metadata.shape[0]
print(f'Metadata of {total_cells} cells')
Metadata of 16985 cells
metadata.head()
| | AllcPath | mCCCFrac | mCGFrac | mCGFracAdj | mCHFrac | mCHFracAdj | FinalReads | InputReads | MappedReads | DissectionRegion | BamFilteringRate | MappingRate | Plate | Col384 | Row384 | FANSDate | Slice | Sample |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 10E_M_0 | /gale/raidix/rdx-4/mapping/10E/CEMBA190625-10E... | 0.008198 | 0.822633 | 0.821166 | 0.041640 | 0.033718 | 1626504.0 | 4407752 | 2892347.0 | 10E | 0.562347 | 0.656195 | CEMBA190625-10E-1 | 0 | 0 | 190625 | 10 | 10E_190625 |
| 10E_M_1 | /gale/raidix/rdx-4/mapping/10E/CEMBA190625-10E... | 0.006019 | 0.743035 | 0.741479 | 0.024127 | 0.018218 | 2009998.0 | 5524084 | 3657352.0 | 10E | 0.549577 | 0.662074 | CEMBA190625-10E-1 | 0 | 1 | 190625 | 10 | 10E_190625 |
| 10E_M_10 | /gale/raidix/rdx-4/mapping/10E/CEMBA190625-10E... | 0.006569 | 0.750172 | 0.748520 | 0.027665 | 0.021235 | 1383636.0 | 3455260 | 2172987.0 | 10E | 0.636744 | 0.628892 | CEMBA190625-10E-1 | 19 | 0 | 190625 | 10 | 10E_190625 |
| 10E_M_101 | /gale/raidix/rdx-4/mapping/10E/CEMBA190625-10E... | 0.006353 | 0.760898 | 0.759369 | 0.026547 | 0.020323 | 2474670.0 | 7245482 | 4778768.0 | 10E | 0.517847 | 0.659551 | CEMBA190625-10E-1 | 18 | 3 | 190625 | 10 | 10E_190625 |
| 10E_M_102 | /gale/raidix/rdx-4/mapping/10E/CEMBA190625-10E... | 0.005409 | 0.752980 | 0.751637 | 0.019497 | 0.014164 | 2430290.0 | 7004754 | 4609570.0 | 10E | 0.527227 | 0.658063 | CEMBA190625-10E-1 | 19 | 2 | 190625 | 10 | 10E_190625 |
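As an optional extra check (not part of the original workflow), you can confirm that cell IDs are unique and glance at the global methylation levels; the column names below are taken from the table above.
# Optional: unique cell IDs and a quick summary of global mC levels.
assert metadata.index.is_unique
print(metadata[['mCHFrac', 'mCGFrac', 'mCCCFrac']].describe())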
use_features = pd.read_csv(feature_path, header=None, index_col=0).index
use_features.name = var_dim
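It is also worth confirming how many features survived the basic filter; a one-line, optional check:
print(f'{use_features.size} features passed the basic filter')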
MCDS¶
total_mcds = MCDS.open(mcds_path,
                       var_dim=var_dim,
                       use_obs=metadata.index).sel({var_dim: use_features})
Add mC Rate¶
total_mcds.add_mc_rate(var_dim=var_dim,
                       normalize_per_cell=True,
                       clip_norm_value=10)
total_mcds
<xarray.MCDS>
Dimensions:              (cell: 16985, chrom100k: 24045, count_type: 2, mc_type: 2)
Coordinates:
  * cell                 (cell) <U10 '10E_M_207' '10E_M_338' ... '9J_M_2969'
  * chrom100k            (chrom100k) int64 30 31 32 33 ... 26335 26336 26337
    chrom100k_bin_end    (chrom100k) int64 dask.array<chunksize=(24045,), meta=np.ndarray>
    chrom100k_bin_start  (chrom100k) int64 dask.array<chunksize=(24045,), meta=np.ndarray>
    chrom100k_chrom      (chrom100k) <U5 dask.array<chunksize=(24045,), meta=np.ndarray>
  * count_type           (count_type) <U3 'mc' 'cov'
  * mc_type              (mc_type) <U3 'CGN' 'CHN'
    strand_type          <U4 'both'
Data variables:
    chrom100k_da         (cell, chrom100k, mc_type, count_type) uint16 dask.array<chunksize=(3397, 2364, 2, 2), meta=np.ndarray>
    chrom100k_da_frac    (cell, chrom100k, mc_type) float64 dask.array<chunksize=(3397, 2364, 2), meta=np.ndarray>
Attributes:
    obs_dim:   cell
    var_dim:   chrom100k
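Conceptually, the `add_mc_rate` call above turns raw counts into per-cell normalized mC fractions: fraction = mc / cov, scaled per cell and clipped at `clip_norm_value`. The toy sketch below only illustrates this general idea and is not ALLCools' implementation; in particular, using the cell's mean fraction as the normalizer is an assumption for illustration.
# Toy illustration only (NOT ALLCools' exact code): fraction, per-cell
# normalization, and clipping, mirroring normalize_per_cell=True, clip_norm_value=10.
import numpy as np

mc = np.array([[1., 5., 0.], [2., 0., 4.]])       # mC counts, shape (cell, feature)
cov = np.array([[10., 10., 0.], [10., 5., 10.]])  # coverage counts

frac = mc / cov                                         # 0/0 yields NaN (same RuntimeWarning as noted below)
cell_overall = np.nanmean(frac, axis=1, keepdims=True)  # assumed per-cell normalizer for illustration
norm_frac = np.clip(frac / cell_overall, None, 10)      # clip extreme values at clip_norm_value
print(norm_frac)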
If downsample¶
if downsample and total_cells > downsample:
    # make a downsampled mcds
    print(f'Downsample cells to {downsample} to calculate HVF.')
    downsample_cell_ids = metadata.sample(downsample, random_state=0).index
    mcds = total_mcds.sel(
        {obs_dim: total_mcds.get_index(obs_dim).isin(downsample_cell_ids)})
else:
    mcds = total_mcds
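Before loading the fraction matrix into memory in the next cell, you can estimate its in-memory footprint. This optional check is not part of the original workflow.
# Optional: rough in-memory size of the fraction matrix.
size_gb = mcds[f'{var_dim}_da_frac'].nbytes / 1e9
print(f'~{size_gb:.1f} GB if fully loaded into memory')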
if load and (mcds.get_index('cell').size <= 20000):
    # load the relevant data so the computation can be faster; watch out for memory!
    mcds[f'{var_dim}_da_frac'].load()
/home/hanliu/miniconda3/envs/allcools_new/lib/python3.8/site-packages/dask/core.py:119: RuntimeWarning: invalid value encountered in true_divide
return func(*(_execute_task(a, cache) for a in args))
The RuntimeWarning is expected (due to cov == 0). You can ignore it.
Highly Variable Feature¶
mCH¶
if hvf_method == 'SVR':
    # use SVR based method
    mch_hvf = mcds.calculate_hvf_svr(var_dim=var_dim,
                                     mc_type=mch_pattern,
                                     n_top_feature=n_top_feature,
                                     plot=True)
else:
    # use bin based method
    mch_hvf = mcds.calculate_hvf(var_dim=var_dim,
                                 mc_type=mch_pattern,
                                 min_mean=0,
                                 max_mean=5,
                                 n_top_feature=n_top_feature,
                                 bin_min_features=5,
                                 mean_binsize=0.05,
                                 cov_binsize=100)
Fitting SVR with gamma 0.0416, predicting feature dispersion using mc_frac_mean and cov_mean.
Total Feature Number: 24045
Highly Variable Feature: 20000 (83.2%)
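The HVF step stores a boolean selection mask as a coordinate on the dataset; the same coordinate is copied into `total_mcds` in the next cell. An optional way to inspect it (variable names here are just for illustration):
# Optional: count the features flagged as highly variable for mCH.
mch_select = mcds.coords[f'{var_dim}_{mch_pattern}_feature_select'].to_pandas()
print(f'{int(mch_select.sum())} / {mch_select.size} features selected for {mch_pattern}')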
Save AnnData¶
total_mcds.coords[f'{var_dim}_{mch_pattern}_feature_select'] = mcds.coords[
    f'{var_dim}_{mch_pattern}_feature_select']
mch_adata = total_mcds.get_adata(mc_type=mch_pattern,
                                 var_dim=var_dim,
                                 select_hvf=True)
mch_adata.write_h5ad('mCH.HVF.h5ad')
mch_adata
/home/hanliu/miniconda3/envs/allcools_new/lib/python3.8/site-packages/anndata/_core/anndata.py:1228: FutureWarning:
The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Reordering categories will always return a new Categorical object.
... storing 'chrom' as categorical
AnnData object with n_obs × n_vars = 16985 × 20000
var: 'bin_end', 'bin_start', 'chrom', 'CHN_mean', 'CHN_dispersion', 'CHN_cov', 'CHN_score', 'CHN_feature_select'
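To confirm the file written above is usable downstream, it can be read back with `anndata.read_h5ad`; a minimal, optional usage sketch (the `adata_check` name is just for illustration):
import anndata

# Optional: read the saved mCH HVF matrix back and confirm its shape.
adata_check = anndata.read_h5ad('mCH.HVF.h5ad')
print(adata_check.shape)  # expected: (number of cells, n_top_feature)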
mCG¶
if hvf_method == 'SVR':
    # use SVR based method
    mcg_hvf = mcds.calculate_hvf_svr(var_dim=var_dim,
                                     mc_type=mcg_pattern,
                                     n_top_feature=n_top_feature,
                                     plot=True)
else:
    # use bin based method
    mcg_hvf = mcds.calculate_hvf(var_dim=var_dim,
                                 mc_type=mcg_pattern,
                                 min_mean=0,
                                 max_mean=5,
                                 n_top_feature=n_top_feature,
                                 bin_min_features=5,
                                 mean_binsize=0.02,
                                 cov_binsize=20)
Fitting SVR with gamma 0.0416, predicting feature dispersion using mc_frac_mean and cov_mean.
Total Feature Number: 24045
Highly Variable Feature: 20000 (83.2%)
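Because the mCH and mCG HVFs are selected independently, their overlap is not guaranteed. An optional check using the boolean coordinates created by the two HVF steps above (the `ch_sel` and `cg_sel` names are just for illustration):
# Optional: overlap between the mCH and mCG highly variable feature sets.
ch_sel = mcds.coords[f'{var_dim}_{mch_pattern}_feature_select'].to_pandas()
cg_sel = mcds.coords[f'{var_dim}_{mcg_pattern}_feature_select'].to_pandas()
print(f'{int((ch_sel & cg_sel).sum())} features are highly variable in both mCH and mCG')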
Save AnnData¶
total_mcds.coords[f'{var_dim}_{mcg_pattern}_feature_select'] = mcds.coords[
    f'{var_dim}_{mcg_pattern}_feature_select']
mcg_adata = total_mcds.get_adata(mc_type=mcg_pattern,
                                 var_dim=var_dim,
                                 select_hvf=True)
mcg_adata.write_h5ad('mCG.HVF.h5ad')
mcg_adata
/home/hanliu/miniconda3/envs/allcools_new/lib/python3.8/site-packages/anndata/_core/anndata.py:1228: FutureWarning:
The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Reordering categories will always return a new Categorical object.
... storing 'chrom' as categorical
AnnData object with n_obs × n_vars = 16985 × 20000
var: 'bin_end', 'bin_start', 'chrom', 'CHN_mean', 'CHN_dispersion', 'CHN_cov', 'CHN_score', 'CHN_feature_select', 'CGN_mean', 'CGN_dispersion', 'CGN_cov', 'CGN_score', 'CGN_feature_select'
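Downstream clustering typically uses both files together, so it is worth confirming that they describe the same cells in the same order. The sketch below is an optional sanity check, not part of the original notebook, and the variable names are just for illustration.
import anndata

# Optional: confirm the two outputs share identical cell ordering.
mch_check = anndata.read_h5ad('mCH.HVF.h5ad')
mcg_check = anndata.read_h5ad('mCG.HVF.h5ad')
assert (mch_check.obs_names == mcg_check.obs_names).all()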