{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Annotate RegionDS\n", "\n", "After getting an {{ RegionDS }} from DMR calling or any other genome region sets, we can annotate the regions with other epigenomic profiles or genomic features stored in the BigWig or BED format.\n", "\n", "For example, in this section, we will annotate the DMR RegionDS with chromatin accessibility profiles and some general genomic features." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Import" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2022-01-09T23:39:53.936192Z", "start_time": "2022-01-09T23:39:52.372477Z" } }, "outputs": [], "source": [ "import pandas as pd\n", "import pathlib\n", "from ALLCools.mcds import RegionDS" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Open RegionDS" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2022-01-09T23:39:54.082631Z", "start_time": "2022-01-09T23:39:53.938643Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Using dmr as region_dim\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
<xarray.RegionDS>\n",
       "Dimensions:      (count_type: 2, dmr: 132, sample: 20)\n",
       "Coordinates:\n",
       "  * count_type   (count_type) <U3 'mc' 'cov'\n",
       "  * dmr          (dmr) <U9 'chr1-0' 'chr1-1' ... 'chr19-122' 'chr19-123'\n",
       "    dmr_chrom    (dmr) <U5 'chr1' 'chr1' 'chr1' ... 'chr19' 'chr19' 'chr19'\n",
       "    dmr_end      (dmr) int64 10002172 10003542 10003967 ... 5099203 5099952\n",
       "    dmr_length   (dmr) int64 2 305 54 2 2 2 2 ... 589 924 632 842 195 399 335\n",
       "    dmr_ndms     (dmr) int64 1 7 2 1 1 1 1 13 3 2 1 ... 2 1 2 7 13 19 9 9 3 6 13\n",
       "    dmr_start    (dmr) int64 10002170 10003237 10003913 ... 5098804 5099617\n",
       "  * sample       (sample) <U18 'snm3C_ASC' 'snm3C_CA1' ... 'snmC_ODC' 'snmC_OPC'\n",
       "Data variables:\n",
       "    dmr_da       (sample, dmr, count_type) uint32 4294967295 ... 4294967295\n",
       "    dmr_da_frac  (sample, dmr) float32 1.0 1.0 1.0 1.0 1.0 ... 1.0 1.0 1.0 1.0\n",
       "    dmr_state    (sample, dmr) int8 -1 0 0 1 -1 -1 -1 0 0 ... 0 0 0 0 0 0 0 0 0\n",
       "Attributes:\n",
       "    region_dim:          dmr\n",
       "    region_ds_location:  /home/hanliu/pkg/ALLCools_pycharm/docs/allcools/clus...\n",
       "    chrom_size_path:     /home/hanliu/pkg/ALLCools_pycharm/docs/allcools/clus...
" ], "text/plain": [ "\n", "Dimensions: (count_type: 2, dmr: 132, sample: 20)\n", "Coordinates:\n", " * count_type (count_type) ` method of RegionDS can help scan the regions on each BigWig file, resulting a new data variable stored in the RegionDS" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2022-01-09T23:39:54.422026Z", "start_time": "2022-01-09T23:39:54.413204Z" } }, "outputs": [ { "data": { "text/plain": [ "CA23 ../../data/HIPBulk/atac_bulk/HIP_snATAC_CA23.bw\n", "CGE ../../data/HIPBulk/atac_bulk/HIP_snATAC_CGE.bw\n", "ASC ../../data/HIPBulk/atac_bulk/HIP_snATAC_ASC.bw\n", "MGE ../../data/HIPBulk/atac_bulk/HIP_snATAC_MGE.bw\n", "CA1 ../../data/HIPBulk/atac_bulk/HIP_snATAC_CA1.bw\n", "ODC ../../data/HIPBulk/atac_bulk/HIP_snATAC_ODC.bw\n", "MGC ../../data/HIPBulk/atac_bulk/HIP_snATAC_MGC.bw\n", "NonN ../../data/HIPBulk/atac_bulk/HIP_snATAC_NonN.bw\n", "OPC ../../data/HIPBulk/atac_bulk/HIP_snATAC_OPC.bw\n", "DG ../../data/HIPBulk/atac_bulk/HIP_snATAC_DG.bw\n", "dtype: object" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# prepare the bigwig tab-separated table, first column is cluster name, second column is BigWig path\n", "bigwig_dir = '../../data/HIPBulk/atac_bulk/'\n", "bigwigs = pd.Series({\n", " p.name.split('.')[0].split('_')[-1]: str(p)\n", " for p in pathlib.Path(bigwig_dir).glob('HIP_snATAC_*.bw')\n", "})\n", "bigwigs.to_csv('test_bigwig.csv', header=False)\n", "bigwigs" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2022-01-09T23:39:55.515554Z", "start_time": "2022-01-09T23:39:55.016526Z" }, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Use chunk size 1\n" ] } ], "source": [ "dmr_ds.annotate_by_bigwigs(slop=250,\n", " bigwig_table='test_bigwig.csv',\n", " dim='snATAC',\n", " cpu=30)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "ExecuteTime": { "end_time": "2022-01-09T23:39:56.027011Z", "start_time": "2022-01-09T23:39:56.006752Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
<xarray.DataArray 'dmr_snATAC_da' (dmr: 132, snATAC: 10)>\n",
       "dask.array<open_dataset-03cdf5fb902771957a3508f7758e6aeedmr_snATAC_da, shape=(132, 10), dtype=float32, chunksize=(132, 1), chunktype=numpy.ndarray>\n",
       "Coordinates:\n",
       "  * dmr         (dmr) <U9 'chr1-0' 'chr1-1' 'chr1-2' ... 'chr19-122' 'chr19-123'\n",
       "    dmr_chrom   (dmr) <U5 'chr1' 'chr1' 'chr1' ... 'chr19' 'chr19' 'chr19'\n",
       "    dmr_end     (dmr) int64 10002172 10003542 10003967 ... 5099203 5099952\n",
       "    dmr_length  (dmr) int64 2 305 54 2 2 2 2 440 ... 589 924 632 842 195 399 335\n",
       "    dmr_ndms    (dmr) int64 1 7 2 1 1 1 1 13 3 2 1 ... 2 1 2 7 13 19 9 9 3 6 13\n",
       "    dmr_start   (dmr) int64 10002170 10003237 10003913 ... 5098804 5099617\n",
       "  * snATAC      (snATAC) <U4 'CA23' 'CGE' 'ASC' 'MGE' ... 'NonN' 'OPC' 'DG'
" ], "text/plain": [ "\n", "dask.array\n", "Coordinates:\n", " * dmr (dmr) ` method of RegionDS can help scan the regions on each BigWig file, resulting a new data variable stored in the RegionDS. The output dataset is a boolean matrix recording whether a DMR is overlapping with one feature." ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "ExecuteTime": { "end_time": "2022-01-09T23:39:57.017084Z", "start_time": "2022-01-09T23:39:57.008444Z" }, "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "CGI ../../data/genome/genome_feature/CGI.merge.sor...\n", "CGI_Shore ../../data/genome/genome_feature/CGI_Shore.mer...\n", "blacklist ../../data/genome/genome_feature/blacklist.mm1...\n", "gene.all ../../data/genome/genome_feature/gene.all.merg...\n", "gene.lincRNA ../../data/genome/genome_feature/gene.lincRNA....\n", "intron.all ../../data/genome/genome_feature/intron.all.me...\n", "intron.first ../../data/genome/genome_feature/intron.first....\n", "intron.protein_coding ../../data/genome/genome_feature/intron.protei...\n", "promoter.all ../../data/genome/genome_feature/promoter.all....\n", "splicing_site_slop100 ../../data/genome/genome_feature/splicing_site...\n", "exon.all ../../data/genome/genome_feature/exon.all.merg...\n", "exon.first ../../data/genome/genome_feature/exon.first.me...\n", "stop_codon.all ../../data/genome/genome_feature/stop_codon.al...\n", "start_codon.all ../../data/genome/genome_feature/start_codon.a...\n", "exon.protein_coding ../../data/genome/genome_feature/exon.protein_...\n", "transcript.all ../../data/genome/genome_feature/transcript.al...\n", "TSS.all ../../data/genome/genome_feature/TSS.all.merge...\n", "TSS.protein_coding ../../data/genome/genome_feature/TSS.protein_c...\n", "UTR3.all ../../data/genome/genome_feature/UTR3.all.merg...\n", "UTR3.protein_coding ../../data/genome/genome_feature/UTR3.protein_...\n", "UTR5.all ../../data/genome/genome_feature/UTR5.all.merg...\n", "UTR5.protein_coding ../../data/genome/genome_feature/UTR5.protein_...\n", "gene.protein_coding ../../data/genome/genome_feature/gene.protein_...\n", "promoter.protein_coding ../../data/genome/genome_feature/promoter.prot...\n", "transcript.protein_coding ../../data/genome/genome_feature/transcript.pr...\n", "dtype: object" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "genome_feature_dir = '../../data/genome/genome_feature/'\n", "genome_feature_beds = {\n", " '.'.join(p.name.split('.')[:-4]): str(p)\n", " for p in pathlib.Path(genome_feature_dir).glob('*.bed.gz')\n", "}\n", "\n", "beds = pd.Series(genome_feature_beds)\n", "beds.to_csv('test_genome_featue_bed.csv', header=False)\n", "beds" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "ExecuteTime": { "end_time": "2022-01-09T23:39:59.574381Z", "start_time": "2022-01-09T23:39:57.457022Z" }, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Use chunk size 1\n" ] } ], "source": [ "dmr_ds.annotate_by_beds(slop=250,\n", " bed_table='test_genome_featue_bed.csv',\n", " dim='genome-features',\n", " bed_sorted=False,\n", " cpu=30)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "ExecuteTime": { "end_time": "2022-01-09T23:39:59.595558Z", "start_time": "2022-01-09T23:39:59.576313Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
<xarray.DataArray 'dmr_genome-features_da' (dmr: 132, genome-features: 25)>\n",
       "dask.array<open_dataset-50fa93a20cd098fbd6ef8798079cfba4dmr_genome-features_da, shape=(132, 25), dtype=bool, chunksize=(132, 1), chunktype=numpy.ndarray>\n",
       "Coordinates:\n",
       "  * dmr              (dmr) <U9 'chr1-0' 'chr1-1' ... 'chr19-122' 'chr19-123'\n",
       "    dmr_chrom        (dmr) <U5 'chr1' 'chr1' 'chr1' ... 'chr19' 'chr19' 'chr19'\n",
       "    dmr_end          (dmr) int64 10002172 10003542 10003967 ... 5099203 5099952\n",
       "    dmr_length       (dmr) int64 2 305 54 2 2 2 2 ... 924 632 842 195 399 335\n",
       "    dmr_ndms         (dmr) int64 1 7 2 1 1 1 1 13 3 2 ... 1 2 7 13 19 9 9 3 6 13\n",
       "    dmr_start        (dmr) int64 10002170 10003237 10003913 ... 5098804 5099617\n",
       "  * genome-features  (genome-features) <U25 'CGI' ... 'transcript.protein_cod...
" ], "text/plain": [ "\n", "dask.array\n", "Coordinates:\n", " * dmr (dmr) \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
<xarray.RegionDS>\n",
       "Dimensions:        (count_type: 2, dmr: 2, sample: 20, snATAC: 10)\n",
       "Coordinates:\n",
       "  * count_type     (count_type) <U3 'mc' 'cov'\n",
       "  * dmr            (dmr) <U9 'chr1-0' 'chr1-1'\n",
       "    dmr_chrom      (dmr) <U5 'chr1' 'chr1'\n",
       "    dmr_end        (dmr) int64 10002172 10003542\n",
       "    dmr_length     (dmr) int64 2 305\n",
       "    dmr_ndms       (dmr) int64 1 7\n",
       "    dmr_start      (dmr) int64 10002170 10003237\n",
       "  * sample         (sample) <U18 'snm3C_ASC' 'snm3C_CA1' ... 'snmC_OPC'\n",
       "  * snATAC         (snATAC) <U4 'CA23' 'CGE' 'ASC' 'MGE' ... 'NonN' 'OPC' 'DG'\n",
       "Data variables:\n",
       "    dmr_da         (sample, dmr, count_type) uint32 4294967295 ... 4294967295\n",
       "    dmr_da_frac    (sample, dmr) float32 1.0 1.0 1.0 1.0 1.0 ... 1.0 1.0 1.0 1.0\n",
       "    dmr_state      (sample, dmr) int8 -1 0 0 1 0 1 -1 0 -1 ... 0 1 0 0 1 0 0 0\n",
       "    dmr_snATAC_da  (dmr, snATAC) float32 0.09983 0.02372 ... 0.02372 0.03385\n",
       "Attributes:\n",
       "    region_dim:          dmr\n",
       "    region_ds_location:  /home/hanliu/pkg/ALLCools_pycharm/docs/allcools/clus...\n",
       "    chrom_size_path:     /home/hanliu/pkg/ALLCools_pycharm/docs/allcools/clus...
" ], "text/plain": [ "\n", "Dimensions: (count_type: 2, dmr: 2, sample: 20, snATAC: 10)\n", "Coordinates:\n", " * count_type (count_type)