{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Cluster Differentially Methylated Genes" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Import" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2022-02-16T03:38:08.524584Z", "start_time": "2022-02-16T03:38:06.124950Z" } }, "outputs": [], "source": [ "import pandas as pd\n", "from ALLCools.clustering import one_vs_rest_dmg" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Parameters" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "ExecuteTime": { "end_time": "2022-02-16T03:39:54.199362Z", "start_time": "2022-02-16T03:39:54.195979Z" } }, "outputs": [], "source": [ "mcds_paths = 'geneslop2k_frac.mcds'\n", "cell_meta_path = '../step_by_step/100kb/L1.ClusteringResults.csv.gz'\n", "cluster_col = 'L1'\n", "\n", "obs_dim = 'cell'\n", "var_dim = 'geneslop2k'\n", "mc_type = 'CHN'\n", "\n", "top_n = 1000\n", "auroc_cutoff = 0.8\n", "adj_p_cutoff = 0.001\n", "fc_cutoff = 0.8\n", "max_cluster_cells = 2000\n", "max_other_fold = 5\n", "cpu = 10" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2022-02-16T03:38:10.251633Z", "start_time": "2022-02-16T03:38:10.100034Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AllcPathmCCCFracmCGFracmCGFracAdjmCHFracmCHFracAdjFinalReadsInputReadsMappedReadsDissectionRegion...SampleleidenmCHFrac.1tsne_0tsne_1L1L1_probaCellTypeAnnoumap_0umap_1
10E_M_0/gale/raidix/rdx-4/mapping/10E/CEMBA190625-10E...0.0081980.8226330.8211660.0416400.0337181626504.044077522892347.010E...10E_190625130.04164057.602540-5.024663c110.864367MGE-Sst5.2887349.726882
10E_M_1/gale/raidix/rdx-4/mapping/10E/CEMBA190625-10E...0.0060190.7430350.7414790.0241270.0182182009998.055240843657352.010E...10E_190625110.024127-45.191850-11.135287c70.669400CA3-3.7023487.514084
10E_M_10/gale/raidix/rdx-4/mapping/10E/CEMBA190625-10E...0.0065690.7501720.7485200.0276650.0212351383636.034552602172987.010E...10E_190625110.027665-46.905564-8.491459c70.787267CA3-2.7975697.604081
10E_M_101/gale/raidix/rdx-4/mapping/10E/CEMBA190625-10E...0.0063530.7608980.7593690.0265470.0203232474670.072454824778768.010E...10E_190625110.026547-53.480022-1.604433c70.526933CA3-0.3108488.465321
10E_M_102/gale/raidix/rdx-4/mapping/10E/CEMBA190625-10E...0.0054090.7529800.7516370.0194970.0141642430290.070047544609570.010E...10E_19062570.019497-25.96799013.813133c300.924000CA10.252257-3.450731
\n", "

5 rows × 27 columns

\n", "
" ], "text/plain": [ " AllcPath mCCCFrac \\\n", "10E_M_0 /gale/raidix/rdx-4/mapping/10E/CEMBA190625-10E... 0.008198 \n", "10E_M_1 /gale/raidix/rdx-4/mapping/10E/CEMBA190625-10E... 0.006019 \n", "10E_M_10 /gale/raidix/rdx-4/mapping/10E/CEMBA190625-10E... 0.006569 \n", "10E_M_101 /gale/raidix/rdx-4/mapping/10E/CEMBA190625-10E... 0.006353 \n", "10E_M_102 /gale/raidix/rdx-4/mapping/10E/CEMBA190625-10E... 0.005409 \n", "\n", " mCGFrac mCGFracAdj mCHFrac mCHFracAdj FinalReads InputReads \\\n", "10E_M_0 0.822633 0.821166 0.041640 0.033718 1626504.0 4407752 \n", "10E_M_1 0.743035 0.741479 0.024127 0.018218 2009998.0 5524084 \n", "10E_M_10 0.750172 0.748520 0.027665 0.021235 1383636.0 3455260 \n", "10E_M_101 0.760898 0.759369 0.026547 0.020323 2474670.0 7245482 \n", "10E_M_102 0.752980 0.751637 0.019497 0.014164 2430290.0 7004754 \n", "\n", " MappedReads DissectionRegion ... Sample leiden mCHFrac.1 \\\n", "10E_M_0 2892347.0 10E ... 10E_190625 13 0.041640 \n", "10E_M_1 3657352.0 10E ... 10E_190625 11 0.024127 \n", "10E_M_10 2172987.0 10E ... 10E_190625 11 0.027665 \n", "10E_M_101 4778768.0 10E ... 10E_190625 11 0.026547 \n", "10E_M_102 4609570.0 10E ... 
10E_190625 7 0.019497 \n", "\n", " tsne_0 tsne_1 L1 L1_proba CellTypeAnno umap_0 \\\n", "10E_M_0 57.602540 -5.024663 c11 0.864367 MGE-Sst 5.288734 \n", "10E_M_1 -45.191850 -11.135287 c7 0.669400 CA3 -3.702348 \n", "10E_M_10 -46.905564 -8.491459 c7 0.787267 CA3 -2.797569 \n", "10E_M_101 -53.480022 -1.604433 c7 0.526933 CA3 -0.310848 \n", "10E_M_102 -25.967990 13.813133 c30 0.924000 CA1 0.252257 \n", "\n", " umap_1 \n", "10E_M_0 9.726882 \n", "10E_M_1 7.514084 \n", "10E_M_10 7.604081 \n", "10E_M_101 8.465321 \n", "10E_M_102 -3.450731 \n", "\n", "[5 rows x 27 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Read the per-cell metadata table from the path configured in the\n", "# Parameters cell (cell_meta_path); the first CSV column becomes the index.\n", "cell_meta = pd.read_csv(cell_meta_path, index_col=0)\n", "cell_meta.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Calculate DMG" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "ExecuteTime": { "end_time": "2022-02-16T03:42:10.661960Z", "start_time": "2022-02-16T03:39:56.597735Z" }, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Calculating cluster c0 DMGs.\n", "Calculating cluster c1 DMGs.\n", "Calculating cluster c10 DMGs.\n", "Calculating cluster c11 DMGs.\n", "Calculating cluster c12 DMGs.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/home/hanliu/miniconda3/envs/allcools_new/lib/python3.8/site-packages/xarray/core/indexing.py:1227: PerformanceWarning: Slicing is producing a large chunk. To accept the large\n", "chunk and silence this warning, set the option\n", " >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):\n", " ... array[indexer]\n", "\n", "To avoid creating the large chunks, set the option\n", " >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):\n", " ... 
array[indexer]\n", " return self.array[key]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Calculating cluster c13 DMGs.Calculating cluster c14 DMGs.\n", "Calculating cluster c15 DMGs.\n", "Calculating cluster c16 DMGs.\n", "Calculating cluster c17 DMGs.\n", "Calculating cluster c18 DMGs.\n", "c17 Finished.\n", "Calculating cluster c19 DMGs.\n", "c16 Finished.\n", "Calculating cluster c2 DMGs.\n", "c15 Finished.\n", "Calculating cluster c20 DMGs.\n", "c14 Finished.\n", "Calculating cluster c21 DMGs.\n", "c13 Finished.\n", "Calculating cluster c22 DMGs.\n", "c12 Finished.\n", "Calculating cluster c23 DMGs.\n", "c11 Finished.\n", "Calculating cluster c24 DMGs.\n", "c10 Finished.\n", "Calculating cluster c25 DMGs.\n", "c18 Finished.\n", "Calculating cluster c26 DMGs.\n", "c21 Finished.\n", "Calculating cluster c27 DMGs.\n", "c19 Finished.\n", "Calculating cluster c28 DMGs.\n", "c22 Finished.\n", "Calculating cluster c29 DMGs.\n", "c20 Finished.\n", "Calculating cluster c3 DMGs.\n", "c24 Finished.\n", "Calculating cluster c30 DMGs.\n", "c23 Finished.\n", "Calculating cluster c31 DMGs.\n", "c25 Finished.\n", "Calculating cluster c32 DMGs.\n", "Calculating cluster c33 DMGs.\n", "c27 Finished.\n", "c26 Finished.\n", "Calculating cluster c34 DMGs.\n", "c29 Finished.\n", "Calculating cluster c35 DMGs.\n", "c28 Finished.\n", "Calculating cluster c36 DMGs.\n", "c1 Finished.\n", "Calculating cluster c37 DMGs.\n", "c30 Finished.\n", "Calculating cluster c38 DMGs.\n", "c31 Finished.\n", "Calculating cluster c39 DMGs.\n", "c33 Finished.\n", "Calculating cluster c4 DMGs.\n", "c34 Finished.\n", "Calculating cluster c40 DMGs.\n", "c32 Finished.\n", "Calculating cluster c5 DMGs.\n", "c37 Finished.\n", "Calculating cluster c6 DMGs.\n", "c35 Finished.\n", "Calculating cluster c7 DMGs.\n", "c36 Finished.\n", "Calculating cluster c8 DMGs.\n", "c38 Finished.\n", "Calculating cluster c9 DMGs.\n", "c39 Finished.\n", "c40 Finished.\n", "c2 Finished.\n", "c3 
Finished.\n", "c0 Finished.\n", "c7 Finished.\n", "c9 Finished.\n", "c8 Finished.\n", "c6 Finished.\n", "c5 Finished.\n", "c4 Finished.\n" ] } ], "source": [ "# Run one_vs_rest_dmg on the cell metadata, grouping cells by `cluster_col`;\n", "# every option is forwarded unchanged from the Parameters cell.\n", "dmg_table = one_vs_rest_dmg(\n", "    cell_meta,\n", "    group=cluster_col,\n", "    mcds_paths=mcds_paths,\n", "    obs_dim=obs_dim,\n", "    var_dim=var_dim,\n", "    mc_type=mc_type,\n", "    top_n=top_n,\n", "    adj_p_cutoff=adj_p_cutoff,\n", "    fc_cutoff=fc_cutoff,\n", "    auroc_cutoff=auroc_cutoff,\n", "    max_cluster_cells=max_cluster_cells,\n", "    max_other_fold=max_other_fold,\n", "    cpu=cpu,\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Save" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "ExecuteTime": { "end_time": "2022-02-16T03:47:55.142943Z", "start_time": "2022-02-16T03:47:55.043092Z" } }, "outputs": [], "source": [ "# Write the DMG table to '<cluster_col>.OneVsRestDMG.hdf' under the HDF key 'data'.\n", "dmg_table.to_hdf(f'{cluster_col}.OneVsRestDMG.hdf', key='data')" ] } ], "metadata": { "hide_input": false, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": true, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": {}, "toc_section_display": true, "toc_window_display": true } }, "nbformat": 4, "nbformat_minor": 4 }