import pathlib
import shutil
import subprocess
from collections import defaultdict

import msgpack
import pandas as pd

from .fimo import _scan_motif_over_fasta, _aggregate_motif_beds
from ..dmr.get_fasta import get_fasta
from ..._bam_to_allc import _get_chromosome_sequence_upper, _read_faidx
def _generate_c_motif_database(motif_bed, fasta_path, output_dir, bin_size=10000000):
    """Build the on-disk C-Motif database from an aggregated motif BED file.

    For every cytosine covered by a motif match, record which motif it belongs
    to and whether the C sits on the forward strand of the match (code 1 for a
    'C' base on a '+' match or a 'G' base on a '-' match) or the reverse
    (code 0).  Records are first written per chromosome, then re-split into
    fixed-size genomic bins, and a lookup table maps bins to file names.

    Parameters
    ----------
    motif_bed
        BED file with columns: chrom, start, end, motif name, strand.
        Assumed grouped/sorted by chromosome, so each chromosome's records can
        be dumped once the next chromosome starts.
    fasta_path
        Genome FASTA path; ``{fasta_path}.fai`` index must exist alongside it.
    output_dir
        Output directory; created here, must not already exist (fail fast).
    bin_size
        Genomic bin size (bp) of each database file.
    """
    fai_path = f'{fasta_path}.fai'
    fai_df = _read_faidx(fai_path)
    output_dir = pathlib.Path(output_dir).absolute()
    output_dir.mkdir()  # deliberately no exist_ok: refuse to overwrite a database

    cur_name_id = 0
    motif_names = {}  # motif name -> compact integer id stored in the records
    chrom_file_paths = []
    # Chroms absent from the FASTA index: cache the miss so each one is looked
    # up only once instead of once per BED line of that chromosome.
    missing_chroms = set()
    with open(motif_bed) as f:
        cur_chrom = ''
        full_chrom_seq = ''
        # 1-based position -> list of (strand_code, motif_int_id)
        record_dict = defaultdict(list)
        for line in f:
            chrom, start, end, name, strand = line.strip('\n').split('\t')
            try:
                motif_int_id = motif_names[name]
            except KeyError:
                motif_names[name] = cur_name_id
                motif_int_id = cur_name_id
                cur_name_id += 1
            start = int(start)
            end = int(end)
            if chrom != cur_chrom:
                if chrom in missing_chroms:
                    continue
                try:
                    full_chrom_seq = _get_chromosome_sequence_upper(fasta_path, fai_df, chrom)
                except KeyError:
                    # chrom not in the FASTA index; skip all its lines
                    missing_chroms.add(chrom)
                    continue
                # dump the finished chromosome's records before starting a new one
                if cur_chrom != '':
                    file_path = output_dir / f'{cur_chrom}.c_motif.msg'
                    chrom_file_paths.append(file_path)
                    with open(file_path, 'wb') as chr_f:
                        msgpack.dump(record_dict, chr_f)
                cur_chrom = chrom
                print(chrom)
                record_dict = defaultdict(list)
            motif = full_chrom_seq[start:end]
            if strand == '+':
                c_prefix = 1  # C lies on the forward strand of the match
                g_prefix = 0  # G marks a C on the reverse-complement strand
            else:
                c_prefix = 0
                g_prefix = 1
            for i_base, base in enumerate(motif):
                pos = start + i_base + 1  # BED is 0-based; store 1-based positions
                if base == 'C':
                    record_dict[pos].append((c_prefix, motif_int_id))
                elif base == 'G':
                    record_dict[pos].append((g_prefix, motif_int_id))
        # dump the records of the last chromosome seen
        if cur_chrom != '':
            file_path = output_dir / f'{cur_chrom}.c_motif.msg'
            chrom_file_paths.append(file_path)
            with open(file_path, 'wb') as chr_f:
                msgpack.dump(record_dict, chr_f)
    with open(output_dir / 'MOTIF_NAMES.msg', 'wb') as f:
        msgpack.dump(motif_names, f)

    # split each per-chromosome msg file into fixed-size genomic bins
    bin_paths = []
    for path in chrom_file_paths:
        bin_dict = defaultdict(dict)
        chrom = path.name.split('.')[0]
        print(chrom)
        with open(path, 'rb') as f:
            d = msgpack.unpackb(f.read(), raw=True, use_list=False)
        for k, v in d.items():
            nbin = k // bin_size
            bin_dict[nbin][k] = v
        for nbin, data in bin_dict.items():
            bin_path = output_dir / f'{chrom}.{int(nbin) * bin_size}-{int(nbin + 1) * bin_size}.c_motif.msg'
            bin_paths.append(bin_path)
            with open(bin_path, 'wb') as f:
                msgpack.dump(data, f)
        path.unlink()  # whole-chromosome file superseded by its bins

    # lookup table mapping genomic intervals to database file names
    records = []
    for path in bin_paths:
        chrom_part, coord_part = path.name.split('.')[:2]
        bin_start, bin_end = coord_part.split('-')
        records.append({'file_name': path.name,
                        'chrom': chrom_part,
                        'start': int(bin_start),
                        'end': int(bin_end)})
    df = pd.DataFrame(records)[['chrom', 'start', 'end', 'file_name']]
    df.to_hdf(output_dir / 'LOOKUP_TABLE.hdf', key='data')
    return
def generate_cmotif_database(bed_file_paths,
                             reference_fasta,
                             motif_files,
                             output_dir,
                             slop_b=None,
                             chrom_size_path=None,
                             cpu=1,
                             sort_mem_gbs=5,
                             path_to_fimo='',
                             raw_score_thresh=8.,
                             raw_p_value_thresh=2e-4,
                             top_n=300000,
                             cmotif_bin_size=10000000):
    """\
    Generate a lookup table of the motifs each cytosine belongs to.
    BED files are used to limit the cytosine scan to certain regions.
    Scanning motifs over the whole genome is very noisy; it is better to scan
    within some functional part of the genome.
    The result files will be in the output directory.

    Parameters
    ----------
    bed_file_paths
        Paths of bed files. Multiple beds will be merged to get a final region set.
        The motif scan will only happen on the regions defined in these bed files.
    reference_fasta
        FASTA file path of the genome to scan
    motif_files
        MEME motif files that contain all the motif information.
    output_dir
        Output directory of the C-Motif database
    slop_b
        Whether to add slop to both ends of bed files.
    chrom_size_path
        {chrom_size_path_doc}
        Needed if slop_b is not None
    cpu
        {cpu_basic_doc}
    sort_mem_gbs
        Maximum memory usage in GBs when sorting bed files
    path_to_fimo
        Path to the fimo executable, if fimo is not in PATH
    raw_score_thresh
        Threshold of raw motif match likelihood score, see fimo doc for more info.
    raw_p_value_thresh
        Threshold of raw motif match P-value, see fimo doc for more info.
    top_n
        If too many motifs are found, order them by likelihood score and keep top matches.
    cmotif_bin_size
        Bin size of a single file in the C-Motif database. No impact on results,
        better keep the default.

    Returns
    -------
    """
    # sanity-check fimo first: a missing executable raises FileNotFoundError,
    # a broken one raises CalledProcessError — report both the same way
    try:
        if path_to_fimo != '':
            path_to_fimo = path_to_fimo.rstrip('/') + '/'
        subprocess.run([f'{path_to_fimo}fimo', '--version'], check=True)
    except (subprocess.CalledProcessError, FileNotFoundError) as e:
        print('Test fimo failed. Make sure fimo (MEME suite) is installed and path_to_fimo set correctly.')
        raise e

    temp_paths = []
    output_dir = pathlib.Path(output_dir).absolute()
    bed_fasta_path = str(output_dir) + '.fa'
    temp_paths.append(bed_fasta_path)

    # step 1: Merge all bed files, get a fasta file from the merged bed file.
    get_fasta(bed_file_paths=bed_file_paths,
              fasta_path=reference_fasta,
              output_path=bed_fasta_path,
              slop_b=slop_b,
              chrom_size_path=chrom_size_path,
              cpu=cpu, sort_mem_gbs=sort_mem_gbs)

    # step 2: scan motif over bed_fasta
    motif_scan_dir = str(output_dir) + '.motif_scan'
    temp_paths.append(motif_scan_dir)
    _scan_motif_over_fasta(
        meme_motif_file=motif_files,
        fasta_path=bed_fasta_path,
        cpu=cpu,
        output_dir=motif_scan_dir,
        path_to_fimo=path_to_fimo,
        raw_score_thresh=raw_score_thresh,
        raw_p_value_thresh=raw_p_value_thresh,
        top_n=top_n,
        is_genome_fasta=False)

    # step 3: aggregate motif bed
    total_motif_bed = str(output_dir) + '.total_motif.bed'
    lookup_table = pd.read_hdf(f'{motif_scan_dir}/LOOKUP_TABLE.hdf')
    bed_file_paths = lookup_table['output_file_path'].tolist()
    if len(bed_file_paths) == 0:
        print('None of the motifs have any match in the current setting. Nothing to continue. '
              'Is the region too small or the threshold too stringent?')
        return
    _aggregate_motif_beds(bed_file_paths=bed_file_paths,
                          output_path=total_motif_bed,
                          cpu=cpu,
                          sort_mem_gbs=sort_mem_gbs)

    # step 4: generate cmotif dataset
    _generate_c_motif_database(motif_bed=total_motif_bed,
                               fasta_path=reference_fasta,
                               output_dir=output_dir,
                               bin_size=cmotif_bin_size)

    # step 5: cleaning — temp_paths holds both files (the merged fasta) and
    # directories (the motif-scan dir); remove whichever exists
    for path in temp_paths:
        path = pathlib.Path(path)
        if path.is_dir():
            shutil.rmtree(path)
        elif path.exists():
            path.unlink()
    # not deleting this file, but gzipping it (best-effort, as before)
    subprocess.run(['bgzip', total_motif_bed])
    # save motif files into output_dir
    motif_file_dir = output_dir / 'MOTIF_FILE'
    motif_file_dir.mkdir(exist_ok=True)
    for motif_file in motif_files:
        new_path = motif_file_dir / pathlib.Path(motif_file).name
        shutil.copy(motif_file, new_path)
    return