{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Differential Methylated Genes - Pairwise" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2022-02-16T03:53:04.027240Z", "start_time": "2022-02-16T03:53:02.027970Z" } }, "outputs": [], "source": [ "import pandas as pd\n", "import anndata\n", "from ALLCools.mcds import MCDS\n", "from ALLCools.clustering import PairwiseDMG" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Parameters" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2022-02-16T03:53:10.918193Z", "start_time": "2022-02-16T03:53:10.915252Z" }, "tags": [ "parameters" ] }, "outputs": [], "source": [ "adata_path = '../step_by_step/100kb/adata.with_coords.h5ad'\n", "cluster_col = 'L1'\n", "\n", "# change this to the paths to your MCDS files\n", "obs_dim = 'cell'\n", "var_dim = 'geneslop2k'\n", "\n", "# DMG\n", "mc_type = 'CHN'\n", "top_n = 1000\n", "adj_p_cutoff = 1e-3\n", "delta_rate_cutoff = 0.3\n", "auroc_cutoff = 0.9\n", "random_state = 0\n", "n_jobs = 30" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2022-02-16T03:53:30.717105Z", "start_time": "2022-02-16T03:53:30.196886Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
<xarray.MCDS>\n",
       "Dimensions:              (cell: 16985, geneslop2k: 41871, mc_type: 2)\n",
       "Coordinates:\n",
       "  * cell                 (cell) <U10 '10E_M_207' '10E_M_338' ... '9J_M_2969'\n",
       "  * geneslop2k           (geneslop2k) <U21 'ENSMUSG00000102693.1' ... 'ENSMUS...\n",
       "    geneslop2k_chrom     (geneslop2k) <U5 dask.array<chunksize=(41871,), meta=np.ndarray>\n",
       "    geneslop2k_cov_mean  (geneslop2k) float64 dask.array<chunksize=(41871,), meta=np.ndarray>\n",
       "    geneslop2k_end       (geneslop2k) int64 dask.array<chunksize=(41871,), meta=np.ndarray>\n",
       "    geneslop2k_start     (geneslop2k) int64 dask.array<chunksize=(41871,), meta=np.ndarray>\n",
       "  * mc_type              (mc_type) <U3 'CGN' 'CHN'\n",
       "    strand_type          <U4 'both'\n",
       "Data variables:\n",
       "    geneslop2k_da_frac   (cell, geneslop2k, mc_type) float32 dask.array<chunksize=(3397, 2463, 2), meta=np.ndarray>\n",
       "Attributes:\n",
       "    obs_dim:  cell\n",
       "    var_dim:  geneslop2k
" ], "text/plain": [ "\n", "Dimensions: (cell: 16985, geneslop2k: 41871, mc_type: 2)\n", "Coordinates:\n", " * cell (cell) \n", " geneslop2k_cov_mean (geneslop2k) float64 dask.array\n", " geneslop2k_end (geneslop2k) int64 dask.array\n", " geneslop2k_start (geneslop2k) int64 dask.array\n", " * mc_type (mc_type) \n", "Attributes:\n", " obs_dim: cell\n", " var_dim: geneslop2k" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "adata = anndata.read_h5ad(adata_path)\n", "\n", "cell_meta = adata.obs.copy()\n", "cell_meta.index.name = obs_dim\n", "\n", "gene_meta = pd.read_csv(f'GeneMetadata.csv.gz', index_col=0)\n", "\n", "gene_mcds = MCDS.open(f'geneslop2k_frac.mcds', use_obs=cell_meta.index)\n", "gene_mcds" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Pairwise DMG" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "ExecuteTime": { "end_time": "2022-02-16T03:53:32.676820Z", "start_time": "2022-02-16T03:53:32.673935Z" } }, "outputs": [], "source": [ "pwdmg = PairwiseDMG(max_cell_per_group=1000,\n", " top_n=top_n,\n", " adj_p_cutoff=adj_p_cutoff,\n", " delta_rate_cutoff=delta_rate_cutoff,\n", " auroc_cutoff=auroc_cutoff,\n", " random_state=random_state,\n", " n_jobs=n_jobs)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "ExecuteTime": { "end_time": "2022-02-16T04:01:07.495193Z", "start_time": "2022-02-16T03:55:00.310372Z" }, "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Generating cluster AnnData files\n", "Computing pairwise DMG\n", "820 pairwise DMGs\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "... storing 'groups' as categorical\n", "... storing 'groups' as categorical\n" ] } ], "source": [ "pwdmg.fit_predict(x=gene_mcds[f'{var_dim}_da_frac'].sel(mc_type=mc_type), \n", " var_dim=var_dim,\n", " groups=cell_meta[cluster_col])" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "ExecuteTime": { "end_time": "2022-02-16T04:01:08.080274Z", "start_time": "2022-02-16T04:01:07.497457Z" } }, "outputs": [], "source": [ "pwdmg.dmg_table.to_hdf(f'{cluster_col}.PairwiseDMG.{mc_type}.hdf', key='data')\n", "pwdmg.dmg_table.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Aggregating Cluster DMG\n", "\n", "Weighted total AUROC aggregated from the pairwise comparisons." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Aggregate Pairwise Comparisons" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "ExecuteTime": { "end_time": "2022-02-16T04:01:08.983316Z", "start_time": "2022-02-16T04:01:08.081362Z" } }, "outputs": [], "source": [ "cluster_dmgs = pwdmg.aggregate_pairwise_dmg(adata, groupby=cluster_col)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "ExecuteTime": { "end_time": "2022-02-16T04:14:18.139563Z", "start_time": "2022-02-16T04:14:17.928962Z" }, "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "# save all the DMGs\n", "with pd.HDFStore(f'{cluster_col}.ClusterRankedPWDMG.{mc_type}.hdf') as hdf:\n", " for cluster, dmgs in cluster_dmgs.items():\n", " hdf[cluster] = dmgs[dmgs > 0.0001]\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "hide_input": false, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": true, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": {}, "toc_section_display": true, "toc_window_display": true } }, "nbformat": 4, "nbformat_minor": 4 }