import xarray as xr
import numpy as np
import pandas as pd
from ..mcds import RegionDS
from ..mcds.utilities import write_ordered_chunks, update_dataset_config
import subprocess
from concurrent.futures import ProcessPoolExecutor, as_completed
def _call_dmr_single_chrom(
    dms_dir,
    output_dir,
    chrom,
    p_value_cutoff=0.001,
    frac_delta_cutoff=0.2,
    max_dist=250,
    residual_quantile=0.6,
    corr_cutoff=0.3,
    dms_ratio=0.8,
):
    """Call DMR for a single chromosome; see :func:`call_dmr` for parameter doc.

    Returns
    -------
    str or None
        Path of the per-chromosome zarr store written under ``output_dir``,
        or None when no significant DMS remain on this chromosome.
    """
    ds = RegionDS.open(dms_dir, region_dim="dms")
    # open DMS dataset and select single chromosome, significant, large delta DMS
    p_value_judge = ds.coords["dms_p-values"] < p_value_cutoff
    frac_delta = ds["dms_da_frac"].max(dim="sample") - ds["dms_da_frac"].min(
        dim="sample"
    )
    frac_delta_judge = frac_delta > frac_delta_cutoff
    chrom_judge = ds["dms_chrom"] == chrom
    # has to use load(scheduler='sync') when this function is called from multiprocess
    final_judge = (
        (p_value_judge & frac_delta_judge & chrom_judge)
        .load(scheduler="sync")
        .to_pandas()
    )
    if final_judge.sum() == 0:
        # No DMS remain on this chromosome
        return None
    use_dms = final_judge[final_judge].index
    ds = ds.sel(dms=use_dms).load(scheduler="sync")

    # step 1: combine raw DMR windows based on distance
    # dist_judge[i] is True when DMS i+1 is within max_dist of DMS i
    dms_dist = ds["dms_pos"][1:].values - ds["dms_pos"][:-1].values
    dist_judge = dms_dist < max_dist
    cur_dmr = 0
    dmr_ids = [0]
    for within_dist in dist_judge:
        if not within_dist:
            # distance gap: start a new raw DMR window
            cur_dmr += 1
        dmr_ids.append(cur_dmr)
    dmr_ids = pd.Series(dmr_ids, index=ds.get_index("dms"))

    # step 2: determine correlation between adjacent DMS
    data = ds["dms_da_frac"].transpose("dms", "sample").to_pandas()
    a = data.iloc[:-1, :].reset_index(drop=True)
    b = data.iloc[1:, :].reset_index(drop=True)
    # index of corr means the correlation of that DMS with the previous DMS
    # regardless of the genome distance
    # fill na value with 1, tend to not to split DMR due to nan
    corrs = a.fillna(1).corrwith(b.fillna(1), axis=1, method="pearson")
    corrs.index = data.iloc[1:, :].index.copy()

    # step 3: recalculate DMR windows based on both distance and correlation
    # within each distance-based window, split whenever the correlation with
    # the previous DMS drops to corr_cutoff or below
    raw_dmr_table = pd.DataFrame({"dmr": dmr_ids, "corr": corrs})
    dmr_dict = {}
    cur_dmr_id = 0
    for dmr_id, sub_df in raw_dmr_table.groupby("dmr"):
        dmr_dict[sub_df.index[0]] = cur_dmr_id
        if sub_df.shape[0] > 1:
            for dms_id, corr in sub_df["corr"][1:].items():
                if corr > corr_cutoff:
                    dmr_dict[dms_id] = cur_dmr_id
                else:
                    cur_dmr_id += 1
                    dmr_dict[dms_id] = cur_dmr_id
        cur_dmr_id += 1
    dmr_ids = pd.Series(dmr_dict)
    dmr_ids.index.name = "dms"
    ds.coords["dmr"] = dmr_ids

    # step 4: determine sample hypo or hyper in each DMS and DMR based on residual
    if residual_quantile < 0.5:
        # normalize so residual_quantile always refers to the upper tail
        residual_quantile = 1 - residual_quantile
    residual_low_cutoff, residual_high_cutoff = np.nanquantile(
        ds["dms_residual"], [1 - residual_quantile, residual_quantile]
    )
    lower_residual = ds["dms_residual"] < residual_low_cutoff
    higher_residual = ds["dms_residual"] > residual_high_cutoff
    # for each sample in each DMS
    # -1 means hypo methylation, 1 means hyper methylation, 0 mean no sig change
    dms_states = lower_residual.astype(int) * -1 + higher_residual.astype(int)
    # dmr state judge
    # use mean to calculate the overall DMR state from DMS
    dmr_states = dms_states.groupby("dmr").mean()
    # then mask inconsistent DMR states with the dms_ratio cutoff
    dmr_states = xr.where(np.abs(dmr_states) < dms_ratio, 0, dmr_states)
    # then "round" the dmr_states, so it only contains -1, 0, 1
    dmr_states = xr.where(dmr_states > 0, 1, dmr_states)
    dmr_states = xr.where(dmr_states < 0, -1, dmr_states)

    # step 5: prepare dmr counts and fractions
    dmr_da = ds["dms_da"].groupby("dmr").sum()
    dmr_frac = dmr_da.sel(count_type="mc") / dmr_da.sel(count_type="cov")
    dmr_ds = xr.Dataset(
        {
            "dmr_da": dmr_da.astype(np.uint32),
            "dmr_state": dmr_states.astype(np.int8),
            "dmr_da_frac": dmr_frac.astype(np.float32),
        }
    )
    # add n dms counts
    n_dms = dmr_ids.value_counts().sort_index()
    n_dms.index.name = "dmr"
    dmr_ds.coords["ndms"] = n_dms
    # add genome coords; start/end padded by 1 bp around the outermost DMS
    dmr_ds.coords["chrom"] = pd.Series(
        [chrom for _ in range(dmr_ds.get_index("dmr").size)],
        index=dmr_ds.get_index("dmr"),
    )
    dmr_ds.coords["start"] = ds["dms_pos"].groupby(ds["dmr"]).min().to_pandas() - 1
    dmr_ds.coords["end"] = ds["dms_pos"].groupby(ds["dmr"]).max().to_pandas() + 1
    dmr_ds.coords["length"] = dmr_ds.coords["end"] - dmr_ds.coords["start"]
    # give DMRs globally unique string ids "{chrom}-{i}"
    dmr_ds.coords["dmr"] = (
        dmr_ds["chrom"].to_pandas().astype(str)
        + "-"
        + dmr_ds["dmr"].to_pandas().astype(str)
    )
    # rename none dimensional coords to prevent collision when merge with other ds
    dmr_ds = dmr_ds.rename(
        {k: f"dmr_{k}" for k in dmr_ds.coords.keys() if k not in dmr_ds.dims}
    )
    output_path = f"{output_dir}/{chrom}.zarr"
    dmr_ds.to_zarr(output_path)
    return output_path
def call_dmr(
    output_dir,
    replicate_label=None,
    p_value_cutoff=0.001,
    frac_delta_cutoff=0.2,
    max_dist=250,
    residual_quantile=0.6,
    corr_cutoff=0.3,
    dms_ratio=0.8,
    cpu=1,
    chrom=None,
):
    """
    Call DMR from DMS results.

    Parameters
    ----------
    output_dir
        Directory containing the DMS results in "{output_dir}/dms"; DMR
        output is written to "{output_dir}/dmr". When ``chrom`` is None, a
        "{output_dir}/chrom_sizes.txt" file must exist to list chromosomes.
    replicate_label
        If not None, passed to :func:`collapse_replicates` after DMR calling
        to collapse per-sample DMR states into replicate-group states.
    p_value_cutoff
        Select DMS whose "dms_p-values" coordinate is below this cutoff.
    frac_delta_cutoff
        Select DMS whose max-min fraction delta across samples exceeds this.
    max_dist
        Maximum genome distance (bp) for merging adjacent DMS into one DMR.
    residual_quantile
        Quantile of "dms_residual" used to call per-sample hypo/hyper states.
    corr_cutoff
        Minimum correlation between adjacent DMS to keep them in one DMR.
    dms_ratio
        Minimum |mean DMS state| for a DMR state to be kept (else set to 0).
    cpu
        Number of worker processes; one chromosome is processed per task.
    chrom
        Chromosome name or list of names to process; None means all
        chromosomes in chrom_sizes.txt.

    Returns
    -------
    None
    """
    if chrom is None:
        chrom_size_path = f"{output_dir}/chrom_sizes.txt"
        from ..utilities import parse_chrom_size

        chroms = parse_chrom_size(chrom_size_path).keys()
    else:
        chroms = chrom if isinstance(chrom, list) else [chrom]

    chunk_dir = f"{output_dir}/.dmr_chunk"
    dmr_dir = f"{output_dir}/dmr"

    # fan out one task per chromosome; chunk_i preserves chromosome order
    # so the final dataset is written in the chrom_sizes order
    with ProcessPoolExecutor(cpu) as exe:
        futures = {}
        for chunk_i, chrom in enumerate(chroms):
            future = exe.submit(
                _call_dmr_single_chrom,
                dms_dir=f"{output_dir}/dms",
                output_dir=chunk_dir,
                chrom=chrom,
                p_value_cutoff=p_value_cutoff,
                frac_delta_cutoff=frac_delta_cutoff,
                max_dist=max_dist,
                residual_quantile=residual_quantile,
                corr_cutoff=corr_cutoff,
                dms_ratio=dms_ratio,
            )
            futures[future] = chunk_i

        chunks_to_write = {}
        for future in as_completed(futures):
            chunk_i = futures[future]
            # NOTE(review): output_path is None when a chromosome had no
            # significant DMS — assumed write_ordered_chunks tolerates that;
            # confirm against its implementation
            output_path = future.result()
            chunks_to_write[chunk_i] = output_path

    write_ordered_chunks(
        chunks_to_write,
        final_path=dmr_dir,
        append_dim="dmr",
        engine="zarr",
        coord_dtypes=None,
    )
    # list form with shell=False: output_dir may contain shell metacharacters
    subprocess.run(["rm", "-rf", chunk_dir])

    update_dataset_config(
        output_dir=output_dir,
        add_ds_region_dim={"dmr": "dmr"},
        change_region_dim="dmr",
    )

    if replicate_label is not None:
        # collapse_replicates needs an opened RegionDS (it reads .coords,
        # .region_dim and .location), not the output_dir path string
        region_ds = RegionDS.open(output_dir)
        collapse_replicates(region_ds, replicate_label)
    return
def collapse_replicates(region_ds, replicate_label, state_da="dmr_state"):
    """Collapse per-sample DMR states into one state per replicate group.

    For each replicate group, a region keeps the shared state only when all
    replicates agree (std == 0 across the sample dimension); otherwise the
    collapsed state is 0. The result is stored in ``region_ds`` as
    ``f"{state_da}_collapsed"`` and, when the dataset has an on-disk
    location, appended to its zarr store.

    Parameters
    ----------
    region_ds
        Opened RegionDS holding the per-sample state data array.
    replicate_label
        Either the name of an existing coordinate on ``region_ds`` mapping
        samples to replicate groups (str), or a series/dataarray with that
        mapping, which is added as the "replicate_label" coordinate.
    state_da
        Name of the state data array to collapse (values -1/0/1 per sample).

    Returns
    -------
    None
    """
    if not isinstance(replicate_label, str):
        # assume the replicate label is provided as a series or dataarray
        # that can be added into coords
        region_ds.coords["replicate_label"] = replicate_label
        replicate_label_name = "replicate_label"
    else:
        replicate_label_name = replicate_label

    # check if label exist and complete
    assert replicate_label_name in region_ds.coords
    assert region_ds.coords[replicate_label_name].to_pandas().isna().sum() == 0
    sample_dim, *_ = region_ds.coords[replicate_label_name].dims

    collapsed = {}
    for rep, sub_da in region_ds[state_da].groupby(replicate_label_name):
        std = sub_da.std(dim=sample_dim)
        # std == 0 means all replicates having same state;
        # any replicate's value then represents the group, so take the first
        reduced_da = xr.where(
            std == 0, sub_da.sel({sample_dim: sub_da[sample_dim][0]}), 0
        ).to_pandas()
        collapsed[rep] = reduced_da
    collapsed = pd.DataFrame(collapsed)
    collapsed.columns.name = f"{sample_dim}_collapsed"
    collapsed.index.name = region_ds.region_dim
    region_ds[f"{state_da}_collapsed"] = collapsed

    if region_ds.location is not None:
        # persist the collapsed states next to the existing region data
        region_ds[[f"{state_da}_collapsed"]].to_zarr(
            f"{region_ds.location}/{region_ds.region_dim}/", mode="a"
        )
        print(f"Collapsed sample state added in existing RegionDS at {region_ds.location}")
    return