Source code for ALLCools.sandbox.motif.motif_scan

import pathlib
from collections import defaultdict
from concurrent.futures import ProcessPoolExecutor, as_completed

import msgpack
import pandas as pd
import xarray as xr

from ..._open import open_allc
from ...utilities import parse_mc_pattern


[docs]def _count_data_to_xarray(total_data: dict, allc_series): """total_data: key is ALLC path, value is series of motif cov/mc count sum from all regions of that ALLC""" total_data = pd.DataFrame(total_data).reset_index() dir_data_list = [] directions = [] for direction, direction_df in total_data.groupby('level_0'): data_list = [] mc_patterns = [] for mc_pattern, pattern_df in direction_df.groupby('level_1'): pattern_df = pattern_df.iloc[:, 2:].set_index('level_2') pattern_df.index.name = 'motif' pattern_df.columns.name = 'cell' data_list.append(xr.DataArray(pattern_df)) mc_patterns.append(mc_pattern) direction_da: xr.DataArray = xr.concat(data_list, dim='mc_type') direction_da.coords['mc_type'] = mc_patterns dir_data_list.append(direction_da) directions.append(direction) total_da: xr.DataArray = xr.concat(dir_data_list, dim='relative_strand') direction_transfer = {1: 'forward', 0: 'reverse'} total_da.coords['relative_strand'] = [direction_transfer[i] for i in directions] total_da.coords['cell'] = allc_series.index.tolist() return total_da
[docs]def _count_single_cmotif_bin_on_multiple_allc(cmotif_dict_path, allc_paths, region, count_binary, context_to_pattern): with open(cmotif_dict_path, 'rb') as f: c_motif_dict = msgpack.unpackb(f.read(), raw=True, use_list=False) mc_records = {} cov_records = {} for allc_path in allc_paths: with open_allc(allc_path, region=region) as allc: motif_mc_records = defaultdict(int) motif_cov_records = defaultdict(int) for line in allc: _, pos, _, context, mc, cov, _ = line.split('\t') pos = int(pos) try: motifs = c_motif_dict[pos] mc = int(mc) cov = int(cov) if count_binary: # only for single cell, either methylated or unmethylated for (rf_strand, motif_id) in motifs: if mc > 0 and mc != cov: # ambiguous mc and cov, just skip this continue motif_mc_records[ (rf_strand, context_to_pattern[context], motif_id)] += 0 if mc == 0 else 1 motif_cov_records[(rf_strand, context_to_pattern[context], motif_id)] += 1 else: for (rf_strand, motif_id) in motifs: motif_mc_records[(rf_strand, context_to_pattern[context], motif_id)] += mc motif_cov_records[(rf_strand, context_to_pattern[context], motif_id)] += cov except KeyError: continue mc_records[allc_path] = pd.Series(motif_mc_records) cov_records[allc_path] = pd.Series(motif_cov_records) return mc_records, cov_records
[docs]def allc_motif_scan(allc_table, output_path, mc_contexts, c_motif_dir, count_binary=True, cpu=1): """\ Scan a list of ALLC files using a C-Motif database. C-Motif Database, can be generated via 'allcools generate-cmotif-database' Save the integrated multi-dimensional array into netCDF4 format using xarray. Parameters ---------- allc_table {allc_table_doc} mc_contexts c_motif_dir A directory contains list of .msg files, each file records a dict, position is key, value is tuple of motif direction and id output_path count_binary Only use this for single cell allc, instead of sum mC or cov directly, will transfer mC and cov into [0, 1] when there is not ambiguity. cpu {cpu_basic_doc} Returns ------- """ if isinstance(allc_table, str): allc_series = pd.read_csv(allc_table, header=None, index_col=0, squeeze=True, sep='\t') if not isinstance(allc_series, pd.Series): raise ValueError('allc_table malformed, should only have 2 columns, 1. file_uid, 2. file_path') else: allc_series = allc_table if allc_series.index.duplicated().sum() != 0: raise ValueError('allc_table file uid have duplicates (1st column)') c_motif_dir = pathlib.Path(c_motif_dir).absolute() allc_paths = allc_series.tolist() context_to_pattern = {} for pattern in mc_contexts: for context in parse_mc_pattern(pattern): if context in context_to_pattern: raise ValueError('Do not support overlapped pattern') else: context_to_pattern[context] = pattern lookup_table = pd.read_hdf(c_motif_dir / 'LOOKUP_TABLE.hdf') # TODO make this function more memory efficient region_mc_records = [] region_cov_records = [] with ProcessPoolExecutor(cpu) as executor: future_dict = {} for _, (chrom, start, end, file_name) in lookup_table.iterrows(): region = f'{chrom}:{start}-{end}' cmotif_dict_path = str(c_motif_dir / file_name) future = executor.submit(_count_single_cmotif_bin_on_multiple_allc, cmotif_dict_path=cmotif_dict_path, allc_paths=allc_paths, region=region, count_binary=count_binary, context_to_pattern=context_to_pattern) future_dict[future] = region for future in as_completed(future_dict): region = future_dict[future] print(region) mc_records, cov_records = future.result() # the regions are not in order, # but this actually doesn't matter because we will sum them all together region_mc_records.append(mc_records) region_cov_records.append(cov_records) total_mc_data = {} total_cov_data = {} # aggregate regions for each ALLC for allc_path in allc_paths: # mc allc_mc_list = [mc_records[allc_path] for mc_records in region_mc_records] total_mc_data[allc_path] = pd.DataFrame(allc_mc_list).sum(axis=0) # cov allc_cov_list = [cov_records[allc_path] for cov_records in region_cov_records] total_cov_data[allc_path] = pd.DataFrame(allc_cov_list).sum(axis=0) # aggregate all ALLC total_mc_da = _count_data_to_xarray(total_mc_data, allc_series) total_cov_da = _count_data_to_xarray(total_cov_data, allc_series) total_da: xr.DataArray = xr.concat([total_mc_da, total_cov_da], dim='count_type') total_da.coords['count_type'] = ['mc', 'cov'] with open(c_motif_dir / 'MOTIF_NAMES.msg', 'rb') as f: motif_names = msgpack.unpackb(f.read(), raw=False, use_list=False) motif_names = pd.Series({v: k for k, v in motif_names.items()}) motif_names.index.name = 'motif' total_da.coords['motif_names'] = motif_names total_da = total_da.set_index(motif='motif_names') total_da.to_netcdf(output_path) return