Source code for ALLCools.sandbox.motif.cmotif

import pathlib
import subprocess
from collections import defaultdict

import msgpack
import pandas as pd

from .fimo import _scan_motif_over_fasta, _aggregate_motif_beds
from ..dmr.get_fasta import get_fasta
from ..._bam_to_allc import _get_chromosome_sequence_upper, _read_faidx


[docs]def _generate_c_motif_database(motif_bed, fasta_path, output_dir, bin_size=10000000): fai_path = f'{fasta_path}.fai' fai_df = _read_faidx(fai_path) output_dir = pathlib.Path(output_dir).absolute() output_dir.mkdir() cur_name_id = 0 motif_names = {} chrom_file_paths = [] with open(motif_bed) as f: cur_chrom = '' full_chrom_seq = '' record_dict = defaultdict(list) for line in f: chrom, start, end, name, strand = line.strip('\n').split('\t') try: motif_int_id = motif_names[name] except KeyError: motif_names[name] = cur_name_id motif_int_id = cur_name_id cur_name_id += 1 start = int(start) end = int(end) if chrom != cur_chrom: try: full_chrom_seq = _get_chromosome_sequence_upper(fasta_path, fai_df, chrom) except KeyError: continue # dump records and init if cur_chrom != '': file_path = output_dir / f'{cur_chrom}.c_motif.msg' chrom_file_paths.append(file_path) with open(file_path, 'wb') as chr_f: msgpack.dump(record_dict, chr_f) cur_chrom = chrom print(chrom) record_dict = defaultdict(list) motif = full_chrom_seq[start:end] if strand == '+': c_prefix = 1 # for forward strand g_prefix = 0 # for reverse strand else: c_prefix = 0 g_prefix = 1 for i_base, base in enumerate(motif): pos = start + i_base + 1 if base == 'C': record_dict[pos].append((c_prefix, motif_int_id)) elif base == 'G': record_dict[pos].append((g_prefix, motif_int_id)) # dump records and init if cur_chrom != '': file_path = output_dir / f'{cur_chrom}.c_motif.msg' chrom_file_paths.append(file_path) with open(file_path, 'wb') as chr_f: msgpack.dump(record_dict, chr_f) with open(output_dir / 'MOTIF_NAMES.msg', 'wb') as f: msgpack.dump(motif_names, f) # split msg into chrom bins bin_paths = [] for path in chrom_file_paths: bin_dict = defaultdict(dict) chrom = path.name.split('.')[0] print(chrom) with open(path, 'rb') as f: d = msgpack.unpackb(f.read(), raw=True, use_list=False) for k, v in d.items(): nbin = k // bin_size bin_dict[nbin][k] = v for nbin, data in bin_dict.items(): bin_path = output_dir / f'{chrom}.{int(nbin) * bin_size}-{int(nbin + 1) * bin_size}.c_motif.msg' bin_paths.append(bin_path) with open(bin_path, 'wb') as f: msgpack.dump(data, f) subprocess.run(['rm', '-f', path]) # make lookup table records = [] for path in bin_paths: record = {'file_name': path.name, 'chrom': path.name.split('.')[0], 'start': int(path.name.split('.')[1].split('-')[0]), 'end': int(path.name.split('.')[1].split('-')[1])} records.append(record) df = pd.DataFrame(records)[['chrom', 'start', 'end', 'file_name']] df.to_hdf(output_dir / 'LOOKUP_TABLE.hdf', key='data') return
[docs]def generate_cmotif_database(bed_file_paths, reference_fasta, motif_files, output_dir, slop_b=None, chrom_size_path=None, cpu=1, sort_mem_gbs=5, path_to_fimo='', raw_score_thresh=8., raw_p_value_thresh=2e-4, top_n=300000, cmotif_bin_size=10000000): """\ Generate lookup table for motifs all the cytosines belongs to. BED files are used to limit cytosine scan in certain regions. Scanning motif over whole genome is very noisy, better scan it in some functional part of genome. The result files will be in the output Parameters ---------- bed_file_paths Paths of bed files. Multiple bed will be merged to get a final region set. The motif scan will only happen on the regions defined in these bed files. reference_fasta FASTA file path of the genome to scan motif_files MEME motif files that contains all the motif information. output_dir Output directory of C-Motif database slop_b Whether add slop to both ends of bed files. chrom_size_path {chrom_size_path_doc} Needed if slop_b is not None cpu {cpu_basic_doc} sort_mem_gbs Maximum memory usage in GBs when sort bed files path_to_fimo Path to fimo executable, if fimo is not in PATH raw_score_thresh Threshold of raw motif match likelihood score, see fimo doc for more info. raw_p_value_thresh Threshold of raw motif match P-value, see fimo doc for more info. top_n If too much motif found, will order them by likelihood score and keep top matches. cmotif_bin_size Bin size of single file in C-Motif database. No impact on results, better keep the default. Returns ------- """ try: if path_to_fimo != '': path_to_fimo = path_to_fimo.rstrip('/') + '/' subprocess.run([f'{path_to_fimo}fimo', '--version'], check=True) except subprocess.CalledProcessError as e: print('Test fimo failed. Make sure fimo (MEME suite) is installed and path_to_fimo set correctly.') raise e temp_paths = [] output_dir = pathlib.Path(output_dir).absolute() bed_fasta_path = str(output_dir) + '.fa' temp_paths.append(bed_fasta_path) # step 1: Merge all bed files, get fasta file from the merged bed file. get_fasta(bed_file_paths=bed_file_paths, fasta_path=reference_fasta, output_path=bed_fasta_path, slop_b=slop_b, chrom_size_path=chrom_size_path, cpu=cpu, sort_mem_gbs=sort_mem_gbs) # step 2: scan motif over bed_fasta motif_scan_dir = str(output_dir) + '.motif_scan' temp_paths.append(motif_scan_dir) _scan_motif_over_fasta( meme_motif_file=motif_files, fasta_path=bed_fasta_path, cpu=cpu, output_dir=motif_scan_dir, path_to_fimo=path_to_fimo, raw_score_thresh=raw_score_thresh, raw_p_value_thresh=raw_p_value_thresh, top_n=top_n, is_genome_fasta=False) # step 3: aggregate motif bed total_motif_bed = str(output_dir) + '.total_motif.bed' lookup_table = pd.read_hdf(f'{motif_scan_dir}/LOOKUP_TABLE.hdf') bed_file_paths = lookup_table['output_file_path'].tolist() if len(bed_file_paths) == 0: print('None of the motif have any match in the current setting. Nothing to continue.' 'Does the region to small or the threshold too stringent?') return _aggregate_motif_beds(bed_file_paths=bed_file_paths, output_path=total_motif_bed, cpu=cpu, sort_mem_gbs=sort_mem_gbs) # step 4: generate cmotif dataset _generate_c_motif_database(motif_bed=total_motif_bed, fasta_path=reference_fasta, output_dir=output_dir, bin_size=cmotif_bin_size) # step 5: cleaning for path in temp_paths: subprocess.run(['rm', '-rf', path]) # not delete this file, but gzip it. subprocess.run(['bgzip', total_motif_bed]) # save motif files into output_dir motif_file_dir = output_dir / 'MOTIF_FILE' motif_file_dir.mkdir(exist_ok=True) for motif_file in motif_files: new_path = motif_file_dir / pathlib.Path(motif_file).name subprocess.run(['cp', motif_file, new_path], check=True) return