Source code for ALLCools.clustering.doublets.coverage_doublets

import pandas as pd
from collections import defaultdict
import pysam
import pathlib
import json
import numpy as np
from scipy.stats import poisson
from statsmodels.stats.multitest import multipletests
from concurrent.futures import ProcessPoolExecutor, as_completed


def _calculate_cell_record(allc_path, output_path, cov_cutoff=2, resolution=100):
    """Count the high coverage bins for each cell, save results to json"""
    allc_path = str(allc_path)
    output_path = str(output_path)
    cov_high_cutoff = int(cov_cutoff * 2)

    cell_records = {}
    i = 0
    with pysam.TabixFile(allc_path) as allc:
        for i, line in enumerate(allc.fetch()):
            chrom, pos, *_, cov, _ = line.split('\t')
            cov = int(cov)
            if cov_cutoff < cov <= cov_high_cutoff:
                bin_id = int(pos) // resolution
                try:
                    cell_records[chrom][bin_id] += 1
                except KeyError:
                    cell_records[chrom] = defaultdict(int)
                    cell_records[chrom][bin_id] += 1
    cell_total_c = i + 1

    cell_records = {
        chrom: list(values.keys())
        for chrom, values in cell_records.items()
    }

    # final output to disk
    total_records = {'total_c': cell_total_c, 'bins': cell_records}
    with open(output_path, 'w') as f:
        json.dump(total_records, f)
    return output_path

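# Illustrative note (not part of the original module): with the default
# cov_cutoff=2 and resolution=100, a cytosine at position 1234 covered 3 times
# satisfies 2 < 3 <= 4, so it falls into bin_id = 1234 // 100 = 12 and that bin
# is recorded for the cell. The JSON written above therefore has the shape
# {"total_c": <number of ALLC lines>, "bins": {"chr1": [12, 57, ...], ...}}.
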
def calculate_blacklist_region(region_records, alpha=0.01):
    """Collect highly covered regions whose region-wise Poisson FDR p-value < alpha"""
    # calculate region poisson mu
    sum_of_bin = 0
    n_bin = 0
    for chrom, chrom_values in region_records.items():
        sum_of_bin += sum(chrom_values.values())
        n_bin += len(chrom_values)
    mu = sum_of_bin / n_bin

    # calculate region FDR p cutoff
    total_p = []
    for chrom, chrom_values in region_records.items():
        chrom_values = pd.Series(chrom_values)
        p_values = poisson.sf(chrom_values.values, mu)
        total_p.append(p_values)
    total_p = np.concatenate(total_p)
    judge, *_ = multipletests(total_p, alpha=alpha, method='fdr_bh')
    p_max = total_p[judge].max()
    del total_p, judge

    # calculate region blacklist
    final_blacklist = {}
    for chrom, chrom_values in region_records.items():
        chrom_values = pd.Series(chrom_values)
        p_values = poisson.sf(chrom_values.values, mu)
        final_blacklist[chrom] = list(chrom_values[p_values < p_max].index)
    return final_blacklist

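# Illustrative note (not part of the original module): region_records maps each
# chromosome to {bin_id: number_of_cells_with_that_high_coverage_bin}. A single
# Poisson rate mu is fit over all bins, each bin gets a survival-function
# p-value, and Benjamini-Hochberg at `alpha` determines the largest rejected
# p-value (p_max); bins below that threshold form the per-chromosome blacklist,
# e.g. {"chr1": [12, 903, ...], "chr2": [...]}.
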
def _calculate_cell_final_values(output_path, region_blacklist):
    """Calculate the final cell values while removing blacklisted bins"""
    with open(output_path) as f:
        cell_record = json.load(f)
    total_n = 0
    for chrom, bins in cell_record['bins'].items():
        total_n += len(set(bins) - region_blacklist[chrom])
    return total_n, cell_record['total_c']

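# Illustrative note (not part of the original module): for one cell the return
# value is (TotalHCB, TotalC), where TotalHCB counts the cell's high coverage
# bins after subtracting the blacklist, e.g. len({12, 57} - {12}) == 1.
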
def coverage_doublets(allc_dict: dict,
                      resolution: int = 100,
                      cov_cutoff=2,
                      region_alpha=0.01,
                      tmp_dir='doublets_temp_dir',
                      cpu=1,
                      keep_tmp=False):
    """
    Quantify cell high coverage bins for doublet evaluation

    Parameters
    ----------
    allc_dict
        dict with cell_id as key and allc_path as value
    resolution
        genome bin resolution (bp) used for quantification
    cov_cutoff
        coverage cutoff; sites with cov_cutoff < cov <= 2 * cov_cutoff are counted
    region_alpha
        FDR-adjusted P-value cutoff for the region blacklist
    tmp_dir
        temporary directory to save intermediate results
    cpu
        number of CPUs to use
    keep_tmp
        whether to keep tmp_dir for debugging

    Returns
    -------
    pd.DataFrame indexed by cell_id with columns TotalHCB, TotalC and HCBRatio
    """
    tmp_dir = pathlib.Path(tmp_dir)
    tmp_dir.mkdir(exist_ok=True)

    # count each cell and collect the region-wise sum at the same time
    region_records = {}

    def _sum_region(p):
        with open(p) as cr:
            cell_record = json.load(cr)
        for chrom, chrom_bins in cell_record['bins'].items():
            if chrom not in region_records:
                region_records[chrom] = defaultdict(int)
            for bin_id in chrom_bins:
                region_records[chrom][bin_id] += 1
        return

    cell_paths = {}
    with ProcessPoolExecutor(cpu) as exe:
        futures = {}
        for cell_id, path in allc_dict.items():
            output_path = f"{tmp_dir}/{cell_id}.json"
            if pathlib.Path(output_path).exists():
                # cell record already exists, directly quantify region records
                _sum_region(output_path)
                cell_paths[cell_id] = output_path
                continue
            future = exe.submit(_calculate_cell_record,
                                allc_path=path,
                                output_path=output_path,
                                resolution=resolution,
                                cov_cutoff=cov_cutoff)
            futures[future] = cell_id

        # while calculating the cell records, also summarize the region records
        for future in as_completed(futures):
            cell_id = futures[future]
            output_path = future.result()
            cell_paths[cell_id] = output_path
            _sum_region(output_path)

    # calculate the dataset-specific region blacklist
    region_blacklist = calculate_blacklist_region(region_records, alpha=region_alpha)
    with open(f'{tmp_dir}/region_blacklist.json', 'w') as f:
        json.dump(region_blacklist, f)
    # list to set; json.dump does not support sets
    region_blacklist = {k: set(v) for k, v in region_blacklist.items()}

    # calculate final per-cell stats
    total_values = []
    cells = []
    for cell_id, output_path in cell_paths.items():
        cell_values = _calculate_cell_final_values(output_path, region_blacklist)
        total_values.append(cell_values)
        cells.append(cell_id)
    total_values = pd.DataFrame(total_values,
                                index=cells,
                                columns=['TotalHCB', 'TotalC'])
    # this value has no specific mathematical meaning; it is only used for plotting
    total_values['HCBRatio'] = total_values['TotalHCB'] * resolution / total_values['TotalC']

    if not keep_tmp:
        # os.rmdir fails on a non-empty directory, so remove the whole tree
        import shutil
        shutil.rmtree(tmp_dir)
    return total_values

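# A minimal usage sketch (not part of the original module). The cell IDs and
# ALLC paths below are hypothetical placeholders; real ALLC files must be
# bgzipped and tabix-indexed so that pysam.TabixFile can open them.
if __name__ == '__main__':
    example_allc_dict = {
        'cell_1': 'allc/cell_1.allc.tsv.gz',  # hypothetical path
        'cell_2': 'allc/cell_2.allc.tsv.gz',  # hypothetical path
    }
    doublet_stats = coverage_doublets(example_allc_dict,
                                      resolution=100,
                                      cov_cutoff=2,
                                      region_alpha=0.01,
                                      cpu=2,
                                      keep_tmp=False)
    # doublet_stats is a DataFrame indexed by cell_id with TotalHCB, TotalC and
    # HCBRatio columns, intended for doublet evaluation and plotting.
    print(doublet_stats.head())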