import re
import numpy as np
import pandas as pd
import ALLCools
from .motifs import Motif, MotifSet
# First entry of the ALLCools package search path; used as the package root.
PACKAGE_DIR = ALLCools.__path__[0]

# Directory holding the default motif set files. Note the trailing slash:
# callers that append "/<file>" end up with a harmless double slash.
DEFAULT_MOTIF_DIR = f"{PACKAGE_DIR}/motif/default_motif_set/"

# Matches the run of digits that immediately follows the literal "nsites= "
# (exactly one space after the equals sign).
N_SITES_PATTERN = re.compile(r"(?<=nsites= )\d+")

# Matches a single space or tab; used to tokenize matrix rows.
SPACE_PATTERN = re.compile(r"[ \t]")
def parse_motif(lines, alphabet, background):
    """Build a ``Motif`` from the text lines of a single MEME motif record.

    Parameters
    ----------
    lines
        Lines of one motif record. ``lines[0]`` is the motif header; its
        first five characters (the ``MOTIF`` keyword) are dropped and the
        remainder is used as the motif name. ``lines[1]`` must contain a
        ``":"``; the text after it is searched for ``nsites= <int>``.
        The remaining lines are read as rows of whitespace-separated
        numbers; rows that cannot be parsed as floats are ignored.
    alphabet
        Iterable of letters. The i-th letter receives the i-th matrix column.
    background
        Stored unchanged on the returned motif as ``background``.

    Returns
    -------
    Motif
        Motif whose counts are the matrix values multiplied by the number
        of sites and rounded to integers, with ``pseudocounts`` set to 0.5.

    Raises
    ------
    IndexError
        If ``lines`` has fewer than two entries, the second line has no
        ``":"``, or the parsed matrix has fewer columns than ``alphabet``
        has letters (including the case where no numeric row was found).
    """
    # Fallback site count when "nsites=" is absent. 20 is the default given
    # by the MEME motif format documentation. A NaN placeholder cannot be
    # used here: it would turn every count into garbage on the integer cast.
    default_n_sites = 20

    motif_name = lines[0][5:].strip()
    attributes = lines[1].split(":")[1].strip()
    n_sites_match = N_SITES_PATTERN.search(attributes)
    n_sites = int(n_sites_match[0]) if n_sites_match is not None else default_n_sites

    rows = []
    for raw_line in lines[2:]:
        tokens = [
            token.strip()
            for token in SPACE_PATTERN.split(raw_line.strip())
            if token.strip() != ""
        ]
        if not tokens:
            # Nothing to parse; an empty row would make the matrix ragged.
            continue
        try:
            rows.append([float(token) for token in tokens])
        except ValueError:
            # some additional rows that are not PFM
            continue

    matrix = np.array(rows)
    count_matrix = np.round(matrix * n_sites).astype(int)
    try:
        counts = {
            base: list(count_matrix[:, i]) for i, base in enumerate(alphabet)
        }
    except IndexError as err:
        raise IndexError(
            f"Motif {motif_name!r}: parsed matrix of shape {count_matrix.shape} "
            f"does not provide a column for every letter of the alphabet. "
            f"Motif lines: {lines!r}"
        ) from err

    _motif = Motif(alphabet=alphabet, counts=counts)
    _motif.background = background
    _motif.name = motif_name
    _motif.pseudocounts = 0.5
    return _motif
def _add_unique_motif(total_motifs, motif_lines, alphabet, background, meme_path):
    """Parse one motif record and store it in ``total_motifs`` by name."""
    motif = parse_motif(motif_lines, alphabet=alphabet, background=background)
    if motif.name in total_motifs:
        raise ValueError(f"Duplicated motif name {motif.name!r} in {meme_path}")
    total_motifs[motif.name] = motif


def parse_meme_database(meme_path, meta_path):
    """Read a MEME-format motif file plus its metadata table into a ``MotifSet``.

    Every line before the first line starting with ``MOTIF`` (case-insensitive)
    is treated as header and handed to ``parse_header_lines`` to obtain the
    alphabet and background. Each following ``MOTIF`` line starts a new motif
    record; blank lines inside records are dropped.

    Parameters
    ----------
    meme_path
        Path of the MEME-format text file.
    meta_path
        Path of a CSV file, read with its first column as the index and
        passed to ``MotifSet`` as ``meta_table``.

    Returns
    -------
    MotifSet
        Built from all parsed motifs, in file order, and the metadata table.

    Raises
    ------
    ValueError
        If two motifs in the file share the same name.
    """
    total_motifs = {}
    in_header = True
    header_lines = []
    motif_lines = []
    alphabet, background = None, None

    with open(meme_path) as f:
        for line in f:
            starts_motif = line[:5].upper() == "MOTIF"

            if in_header:
                if not starts_motif:
                    header_lines.append(line)
                    continue
                # First MOTIF line ends the header; parse it exactly once.
                # NOTE(review): parse_header_lines is neither defined nor
                # imported in the visible part of this module - confirm it
                # is provided elsewhere in the file.
                alphabet, background = parse_header_lines(header_lines)
                in_header = False

            if line.strip() == "":
                continue

            if starts_motif:
                # A new record begins: flush the previous one, if any.
                if motif_lines:
                    _add_unique_motif(
                        total_motifs, motif_lines, alphabet, background, meme_path
                    )
                motif_lines = [line]
            else:
                motif_lines.append(line)

    # last motif (needs at least the MOTIF line and one more line)
    if len(motif_lines) > 1:
        _add_unique_motif(total_motifs, motif_lines, alphabet, background, meme_path)

    meta_table = pd.read_csv(meta_path, index_col=0)
    motif_set = MotifSet(list(total_motifs.values()), meta_table=meta_table)
    return motif_set
def get_default_motif_set(database="three_databases"):
    """Load the motif set shipped in ``DEFAULT_MOTIF_DIR``.

    Parameters
    ----------
    database
        Name of the bundled database. Only ``"three_databases"`` is
        supported; it loads the ``JASPAR2018HOCOMOCOv11Jolma2013`` MEME file,
        its metadata CSV and its thresholds CSV.

    Returns
    -------
    MotifSet
        The parsed motif set, with ``thresholds`` set to a dict built from
        the thresholds CSV (first column as keys, second column as values).

    Raises
    ------
    NotImplementedError
        If ``database`` is anything other than ``"three_databases"``.
    """
    if database != "three_databases":
        # TODO: allow user create motif set by providing meme and metadata (optional)
        raise NotImplementedError(
            f"Unknown motif database {database!r}; "
            f"only 'three_databases' is available."
        )

    file_prefix = f"{DEFAULT_MOTIF_DIR}/JASPAR2018HOCOMOCOv11Jolma2013"
    motif_set = parse_meme_database(
        f"{file_prefix}.meme",
        f"{file_prefix}.metadata.csv",
    )

    # default thresholds, fnr/fpr = 1000
    # read_csv's ``squeeze=True`` keyword was removed in pandas 2.0, so the
    # single data column is squeezed into a Series explicitly instead.
    threshold_table = pd.read_csv(
        f"{file_prefix}.thresholds.csv",
        header=None,
        index_col=0,
    )
    motif_set.thresholds = threshold_table.squeeze("columns").to_dict()
    return motif_set