Source code for ALLCools.table_to_allc

import subprocess

import numpy as np
import pandas as pd
import pysam

from ALLCools.utilities import reverse_complement, parse_chrom_size
from ._doc import *


[docs]def mode_mc_cov(table, mc, cov): mc = table[mc].astype(int) cov = table[cov].astype(int) frac = np.round(mc / cov).astype(int) values = pd.DataFrame({'mc': mc, 'cov': cov, 'frac': frac}) values = values[['mc', 'cov', 'frac']] return values
[docs]def mode_mc_uc(table, mc, uc): mc = table[mc].astype(int) cov = mc + table[uc].astype(int) frac = np.round(mc / cov).astype(int) values = pd.DataFrame({'mc': mc, 'cov': cov, 'frac': frac}) values = values[['mc', 'cov', 'frac']] return values
[docs]def mode_uc_cov(table, uc, cov): cov = table[cov].astype(int) mc = cov - table[uc].astype(int) frac = np.round(mc / cov).astype(int) values = pd.DataFrame({'mc': mc, 'cov': cov, 'frac': frac}) values = values[['mc', 'cov', 'frac']] return values
[docs]def mode_mc_frac_cov(table, mc_frac, cov): frac = table[mc_frac].astype(float) cov = table[cov].astype(int) mc = np.round(cov * frac).astype(int) frac = np.round(frac).astype(int) values = pd.DataFrame({'mc': mc, 'cov': cov, 'frac': frac}) values = values[['mc', 'cov', 'frac']] return values
[docs]def mode_mc_frac_mc(table, mc_frac, mc): frac = table[mc_frac].astype(float) mc = table[mc].astype(int) cov = np.round(mc / frac).astype(int) frac = np.round(frac).astype(int) values = pd.DataFrame({'mc': mc, 'cov': cov, 'frac': frac}) values = values[['mc', 'cov', 'frac']] return values
[docs]def mode_mc_frac_uc(table, mc_frac, uc): frac = table[mc_frac].astype(float) uc = table[uc].astype(int) cov = np.round(uc / (1 - frac)).astype(int) mc = cov - uc frac = np.round(frac).astype(int) values = pd.DataFrame({'mc': mc, 'cov': cov, 'frac': frac}) values = values[['mc', 'cov', 'frac']] return values
[docs]def mode_mc_frac_pseudo_count(table, mc_frac, pseudo_count): frac = table[mc_frac].astype(float) mc = np.round(pseudo_count * frac).astype(int) frac = np.round(frac).astype(int) values = pd.DataFrame({'mc': mc, 'frac': frac}) values['cov'] = pseudo_count values = values[['mc', 'cov', 'frac']] return values
[docs]def get_strand(chrom, pos, fasta): base = fasta.fetch(chrom, pos - 1, pos).lower() return '+' if base == 'c' else '-'
[docs]def get_context(chrom, pos, strand, fasta, num_upstream_bases, num_downstream_bases): if strand == '+': context = fasta.fetch(chrom, pos - 1 - num_upstream_bases, pos + num_downstream_bases).upper() else: context = reverse_complement( fasta.fetch(chrom, pos - 1 - num_downstream_bases, pos + num_upstream_bases).upper()) return context
[docs]def get_strand_and_context(table, chrom, pos, strand, context, fasta_path, num_upstream_bases, num_downstream_bases): if strand is not None and context is not None: return table.iloc[:, [chrom, pos, strand, context]] with pysam.FastaFile(fasta_path) as fasta: if strand is None: strand = table.shape[1] table[strand] = table.apply( lambda row: get_strand(row[chrom], row[pos], fasta), axis=1) if context is None: context = table.shape[1] table[context] = table.apply( lambda row: get_context(row[chrom], row[pos], row[ strand], fasta, num_upstream_bases, num_downstream_bases), axis=1) return table.iloc[:, [chrom, pos, strand, context]]
[docs]def dataframe_to_allc(table, add_chr=False, chrom=0, pos=1, strand=None, context=None, fasta_path=None, chrom_sizes=None, mc=None, uc=None, cov=None, mc_frac=None, pseudo_count=1, num_upstream_bases=0, num_downstream_bases=2): # change columns to int table.columns = list(range(table.shape[1])) # check genome coords if (chrom is None) or (pos is None): raise ValueError('Must provide chrom and pos') if (strand is None) or (context is None): if fasta_path is None: raise ValueError('Must provide fasta_path if strand or ' 'context is None, need to use the genome ' 'FASTA to parse strand or context.') if chrom_sizes is not None: chroms = set(parse_chrom_size(chrom_sizes).keys()) elif fasta_path is not None: with pysam.FastaFile(fasta_path) as fasta: chroms = set(fasta.references) else: chroms = None # check allc values if (mc is not None) and (cov is not None): value_conversion_func = mode_mc_cov mode = 'mc+cov' elif (mc is not None) and (uc is not None): value_conversion_func = mode_mc_uc mode = 'mc+uc' elif (uc is not None) and (cov is not None): value_conversion_func = mode_uc_cov mode = 'uc+cov' elif (mc_frac is not None) and (cov is not None): value_conversion_func = mode_mc_frac_cov mode = 'mc_frac+cov' elif (mc_frac is not None) and (mc is not None): value_conversion_func = mode_mc_frac_mc mode = 'mc_frac+mc' elif (mc_frac is not None) and (uc is not None): value_conversion_func = mode_mc_frac_uc mode = 'mc_frac+uc' elif (mc_frac is not None) and (pseudo_count is not None): value_conversion_func = mode_mc_frac_pseudo_count mode = 'mc_frac+pseudo_count' else: modes = [ 'mc+cov', 'mc+uc', 'uc+cov', 'mc_frac+cov', 'mc_frac+mc', 'mc_frac+uc', 'mc_frac+pseudo_count' ] raise ValueError(f'Need to provide one of these combinations in minimum: {modes}') # print(f'Using mode {mode} to get cytosine base counts.') # add chr to chrom names or not, user specify if add_chr: table[chrom] = 'chr' + table[chrom].astype(str) # select chroms if chroms is not None: table = table[table[chrom].isin(chroms)].copy() # get chrom, pos, strand, context, if needed, parse from fasta genome_coords = get_strand_and_context( table=table, chrom=chrom, pos=pos, strand=strand, context=context, fasta_path=fasta_path, num_upstream_bases=num_upstream_bases, num_downstream_bases=num_downstream_bases) # calculate mc, cov, frac (last col) if mode == 'mc+cov': values = value_conversion_func(table, mc, cov) elif mode == 'mc+uc': values = value_conversion_func(table, mc, uc) elif mode == 'uc+cov': values = value_conversion_func(table, uc, cov) elif mode == 'mc_frac+cov': values = value_conversion_func(table, mc_frac, cov) elif mode == 'mc_frac+mc': values = value_conversion_func(table, mc_frac, mc) elif mode == 'mc_frac+uc': values = value_conversion_func(table, mc_frac, uc) elif mode == 'mc_frac+pseudo_count': values = value_conversion_func(table, mc_frac, pseudo_count) else: # should have deal with error above raise ValueError # final allc table allc = pd.concat([genome_coords, values], axis=1) allc.columns = ['chrom', 'pos', 'strand', 'context', 'mc', 'cov', 'p'] return allc
@doc_params( table_to_allc_doc=table_to_allc_doc, input_path_doc=table_to_allc_input_path, output_prefix_doc=table_to_allc_output_prefix, sep_doc=table_to_allc_sep, header_doc=table_to_allc_header, chunk_size_doc=table_to_allc_chunk_size, chrom_doc=table_to_allc_chrom, pos_doc=table_to_allc_pos, strand_doc=table_to_allc_strand, context_doc=table_to_allc_context, mc_doc=table_to_allc_mc, uc_doc=table_to_allc_uc, cov_doc=table_to_allc_cov, mc_frac_doc=table_to_allc_mc_frac, pseudo_count_doc=table_to_allc_pseudo_count, fasta_path_doc=table_to_allc_fasta_path, num_upstream_bases_doc=table_to_allc_num_upstream_bases, num_downstream_bases_doc=table_to_allc_num_downstream_bases, add_chr_doc=table_to_allc_add_chr, sort_doc=table_to_allc_sort
[docs]) def table_to_allc( input_path, output_prefix, sep='\t', header=None, chunk_size=100000, chrom=0, pos=1, strand=None, context=None, mc=None, uc=None, cov=None, mc_frac=None, pseudo_count=1, fasta_path=None, num_upstream_bases=0, num_downstream_bases=2, add_chr=False, sort=True ): """\ {table_to_allc_doc} Parameters ---------- input_path {table_to_allc_input_path} output_prefix {table_to_allc_output_prefix} sep {table_to_allc_sep} header {table_to_allc_header} chunk_size {table_to_allc_chunk_size} chrom {table_to_allc_chrom} pos {table_to_allc_pos} strand {table_to_allc_strand} context {table_to_allc_context} mc {table_to_allc_mc} uc {table_to_allc_uc} cov {table_to_allc_cov} mc_frac {table_to_allc_mc_frac} pseudo_count {table_to_allc_pseudo_count} fasta_path {table_to_allc_fasta_path} num_upstream_bases {table_to_allc_num_upstream_bases} num_downstream_bases {table_to_allc_num_downstream_bases} add_chr {table_to_allc_add_chr} sort {table_to_allc_sort} Returns ------- output path of the converted ALLC file. """ unzip_path = f'{output_prefix}.allc.tsv' zip_path = f'{output_prefix}.allc.tsv.gz' with open(unzip_path, 'w') as out_f: chunks = pd.read_csv(input_path, chunksize=chunk_size, sep=sep, header=header) for chunk in chunks: # convert table chunks to allc allc_chunk = dataframe_to_allc( table=chunk, add_chr=add_chr, chrom=chrom, pos=pos, strand=strand, context=context, fasta_path=fasta_path, mc=mc, uc=uc, cov=cov, mc_frac=mc_frac, pseudo_count=pseudo_count, num_upstream_bases=num_upstream_bases, num_downstream_bases=num_downstream_bases) out_f.write(allc_chunk.to_csv(sep='\t', index=False, header=False)) # sort, bgzip, tabix allc try: if sort: subprocess.run( f'sort -k1,1 -k2,2n {unzip_path} -S 10G | ' # sort f'bgzip -c > {zip_path} && ' # bgzip f'tabix -f -b 2 -e 2 -s 1 {zip_path} && ' # tabix f'rm -f {unzip_path}', # remove uncompressed file shell=True, check=True, encoding='utf8', stderr=subprocess.PIPE) else: subprocess.run( f'bgzip -f {unzip_path} && ' # bgzip f'tabix -f -b 2 -e 2 -s 1 {zip_path}', # tabix shell=True, check=True, encoding='utf8', stderr=subprocess.PIPE) except subprocess.CalledProcessError as e: print(e.stderr) raise e return zip_path