Perform Cross-Modality Integration

import anndata
import scanpy as sc
import matplotlib.pyplot as plt

from harmonypy import run_harmony
from ALLCools.clustering import significant_pc_test
from ALLCools.plot import *

Load Datasets and select shared HVFs

smart_n_genes = 5000

smart_adata = anndata.read_h5ad('SMART.TotalAdata.norm_log1p.h5ad')
smart_hvf = smart_adata.var['dispersions_norm'].sort_values(ascending=False).dropna()[:smart_n_genes].index
tenx_n_genes = 5000

tenx_adata = anndata.read_h5ad('10X.PseudoCellAdata.norm_log1p.h5ad')
tenx_hvf = tenx_adata.var['dispersions_norm'].sort_values(ascending=False).dropna()[:tenx_n_genes].index
atac_n_genes = 5000

atac_adata = anndata.read_h5ad('ATAC.PseudoCellAdata.norm_log1p.h5ad')
atac_hvf = atac_adata.var['dispersions_norm'].sort_values(ascending=False).dropna()[:atac_n_genes].index
snmc_adata = anndata.read_h5ad('../input/snmC.Neuron.gene_CHN_fracs.h5ad')
snm3c_adata = anndata.read_h5ad('../input/snm3C.Neuron.gene_CHN_fracs.h5ad')
hvfs = atac_hvf.intersection(smart_hvf).intersection(tenx_hvf).intersection(
    snmc_adata.var_names).intersection(snm3c_adata.var_names)
with open('hvgs.txt', 'w') as f:
    f.write('\n'.join(hvfs))
print(hvfs.size, 'genes occured in all datasets.')
618 genes occured in all datasets.

Apply HVF selection and scale features

tenx_adata = tenx_adata[:, hvfs].copy()
sc.pp.scale(tenx_adata)
tenx_adata
AnnData object with n_obs × n_vars = 3679 × 618
    obs: 'n_cells', 'SubType', 'n_counts'
    var: 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'hvg'
smart_adata = smart_adata[:, hvfs].copy()
sc.pp.scale(smart_adata)
smart_adata
AnnData object with n_obs × n_vars = 4051 × 618
    obs: 'MajorType', 'SubType', 'n_genes', 'n_counts'
    var: 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'hvg'
    layers: 'raw'
atac_adata = atac_adata[:, hvfs].copy()
sc.pp.scale(atac_adata)
atac_adata
AnnData object with n_obs × n_vars = 5026 × 618
    obs: 'n_cells', 'SubType', 'n_counts'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'hvg'
snmc_adata = snmc_adata[:, hvfs].copy()
sc.pp.scale(snmc_adata)
snmc_adata.X *= -1
snmc_adata
AnnData object with n_obs × n_vars = 8717 × 618
    obs: 'MajorType', 'SubType'
    var: 'mean', 'std'
snm3c_adata = snm3c_adata[:, hvfs].copy()
sc.pp.scale(snm3c_adata)
snm3c_adata.X *= -1
snm3c_adata
AnnData object with n_obs × n_vars = 3097 × 618
    obs: 'MajorType', 'SubType'
    var: 'mean', 'std'

Concatenate All Datasets

adata = smart_adata.concatenate([
    snmc_adata,
    tenx_adata,
    snm3c_adata,
    atac_adata,
],
                                index_unique='-')
adata.obs['batch'] = adata.obs['batch'].map({
    '0': 'SMART',
    '1': 'snmC',
    '2': '10X',
    '3': 'snm3C',
    '4': 'snATAC',
})
adata
AnnData object with n_obs × n_vars = 24570 × 618
    obs: 'MajorType', 'SubType', 'n_genes', 'n_counts', 'n_cells', 'batch'
    var: 'n_cells-0', 'highly_variable-0', 'means-0', 'dispersions-0', 'dispersions_norm-0', 'mean-0', 'std-0', 'mean-1', 'std-1', 'n_cells-2', 'highly_variable-2', 'means-2', 'dispersions-2', 'dispersions_norm-2', 'mean-2', 'std-2', 'mean-3', 'std-3', 'highly_variable-4', 'means-4', 'dispersions-4', 'dispersions_norm-4', 'mean-4', 'std-4'

PCA and Harmony Correction

sc.pp.pca(adata, n_comps=100)
sc.pl.pca_variance_ratio(adata)
n_components = significant_pc_test(adata, p_cutoff=0.2)
../../../../_images/04-integration_18_0.png
19 components passed P cutoff of 0.2.
Changing adata.obsm['X_pca'] from shape (24570, 100) to (24570, 19)
ho = run_harmony(adata.obsm['X_pca'],
                 meta_data=adata.obs,
                 vars_use='batch',
                 random_state=0,
                 nclust=100,
                 max_iter_harmony=20)
adata.obsm['X_pca'] = ho.Z_corr.T
2021-10-04 00:21:29,806 - harmonypy - INFO - Iteration 1 of 20
2021-10-04 00:21:42,253 - harmonypy - INFO - Iteration 2 of 20
2021-10-04 00:21:53,561 - harmonypy - INFO - Iteration 3 of 20
2021-10-04 00:22:05,244 - harmonypy - INFO - Iteration 4 of 20
2021-10-04 00:22:16,515 - harmonypy - INFO - Converged after 4 iterations
_ = plot_decomp_scatters(adata,
                         n_components=n_components,
                         hue='batch',
                         palette='tab10')
Red axis labels are used PCs
../../../../_images/04-integration_20_1.png

Co-Clustering

sc.pp.neighbors(adata, n_neighbors=25)
sc.tl.leiden(adata, resolution=1.5)
sc.tl.umap(adata)
adata.obs['umap_0'] = adata.obsm['X_umap'][:, 0]
adata.obs['umap_1'] = adata.obsm['X_umap'][:, 1]

Plot results

Plot co-embedding and co-clusters

fig, axes = plt.subplots(figsize=(8, 4), dpi=250, ncols=2)

ax = axes[0]
categorical_scatter(ax=ax,
                    data=adata,
                    hue='batch',
                    show_legend=True,
                    max_points=None,
                    s=1)

ax = axes[1]
categorical_scatter(ax=ax,
                    data=adata,
                    hue='leiden',
                    show_legend=True,
                    max_points=None,
                    s=1)
../../../../_images/04-integration_25_0.png

Plot Individual Datasets and Intra-dataset labels

All the cluster labels below were identified independently within each individual dataset.

snmC-seq2 Clusters

fig, axes = plt.subplots(figsize=(8, 4), dpi=250, ncols=2)

data = adata.obs[adata.obs['batch'] == 'snmC'].copy()
ax = axes[0]
categorical_scatter(ax=ax,
                    data=data,
                    hue='MajorType',
                    show_legend=False,
                    max_points=None,
                    text_anno='MajorType',
                    s=3)

ax = axes[1]
categorical_scatter(ax=ax,
                    data=data,
                    hue='leiden',
                    show_legend=False,
                    max_points=None,
                    text_anno='leiden',
                    s=3)
../../../../_images/04-integration_28_0.png

snm3C-seq Clusters

fig, axes = plt.subplots(figsize=(8, 4), dpi=250, ncols=2)

data = adata.obs[adata.obs['batch'] == 'snm3C'].copy()
ax = axes[0]
categorical_scatter(ax=ax,
                    data=data,
                    hue='MajorType',
                    show_legend=False,
                    max_points=None,
                    text_anno='MajorType',
                    s=3)

ax = axes[1]
categorical_scatter(ax=ax,
                    data=data,
                    hue='leiden',
                    show_legend=False,
                    max_points=None,
                    text_anno='leiden',
                    s=3)
../../../../_images/04-integration_30_0.png

10X snRNA-seq Clusters

fig, axes = plt.subplots(figsize=(8, 4), dpi=250, ncols=2)

data = adata.obs[adata.obs['batch'] == '10X'].copy()
data['MajorType'] = data['SubType'].str.split('_').str[-1]

ax = axes[0]
categorical_scatter(ax=ax,
                    data=data,
                    hue='MajorType',
                    show_legend=False,
                    max_points=None,
                    text_anno='MajorType',
                    s=3)

ax = axes[1]
categorical_scatter(ax=ax,
                    data=data,
                    hue='leiden',
                    show_legend=False,
                    max_points=None,
                    text_anno='leiden',
                    s=3)
../../../../_images/04-integration_32_0.png

SMART-seq Clusters

fig, axes = plt.subplots(figsize=(8, 4), dpi=250, ncols=2)

data = adata.obs[adata.obs['batch'] == 'SMART'].copy()

ax = axes[0]
categorical_scatter(ax=ax,
                    data=data,
                    hue='MajorType',
                    show_legend=False,
                    max_points=None,
                    text_anno='MajorType',
                    s=3)

ax = axes[1]
categorical_scatter(ax=ax,
                    data=data,
                    hue='leiden',
                    show_legend=False,
                    max_points=None,
                    text_anno='leiden',
                    s=3)
../../../../_images/04-integration_34_0.png

snATAC-seq Clusters

fig, axes = plt.subplots(figsize=(8, 4), dpi=250, ncols=2)

data = adata.obs[adata.obs['batch'] == 'snATAC'].copy()
data['MajorType'] = data['SubType'].apply(lambda i: '.'.join(i.split('.')[:2]))

ax = axes[0]
categorical_scatter(ax=ax,
                    data=data,
                    hue='MajorType',
                    show_legend=False,
                    max_points=None,
                    text_anno='MajorType',
                    s=3)

ax = axes[1]
categorical_scatter(ax=ax,
                    data=data,
                    hue='leiden',
                    show_legend=False,
                    max_points=None,
                    text_anno='leiden',
                    s=3)
../../../../_images/04-integration_36_0.png