# Source code for ALLCools._doc

from textwrap import dedent

# Reusable description strings for command/function parameters.
# The stray "[docs]" link text left by the HTML source viewer has been removed
# from each assignment; it is not Python and prevented the module from parsing.

# Description of the flag that saves a methylpy chromosome index.
idx_doc = (
    "If true, save an methylpy chromosome index for back compatibility. "
    "If you only use methylpy to call DMR, this don't need to be True."
)

# Description of a parameter that takes exactly one ALLC file path.
allc_path_doc = "Path to 1 ALLC file"

# Description of a parameter that accepts several ALLC paths
# (wildcard, space separated list, or a file listing one path per row).
allc_paths_doc = (
    "Single ALLC path contain wildcard OR multiple space separated ALLC paths "
    "OR a file contains 1 ALLC path in each row."
)

# Description of the two-column (file_uid, file_path) ALLC table.
allc_table_doc = (
    "Contain all the ALLC file information in two tab-separated columns: "
    "1. file_uid, 2. file_path. No header"
)

# The stray "[docs]" link text left by the HTML source viewer has been removed
# from each assignment below; it is not Python and prevented parsing.

# Description of the per-site binarization flag (single cell ALLC only).
binarize_doc = (
    "If set, binarize each single site in each individual ALLC file. "
    "This means each cytosine will only contribute at most 1 cov and 0/1 mc, "
    "this is suitable to account for single cell ALLC R1 R2 overlap issue, "
    "Only use this on single cell ALLC, not bulk ALLC."
)

# Description of the fixed-size genomic bin sizes parameter.
bin_sizes_doc = (
    "Fix-size genomic bins can be defined by bin_sizes and chrom_size_path. "
    "Space separated sizes of genome bins, each size will be count separately."
)

# Description of the BigWig bin size parameter.
bw_bin_sizes_doc = "Bin size of the BigWig files."

# Description of the UCSC chrom size file parameter.
chrom_size_path_doc = (
    "Path to UCSC chrom size file. "
    "This can be generated from the genome fasta or downloaded via UCSC fetchChromSizes tools. "
    "All ALLCools functions will refer to this file whenever possible to check for "
    "chromosome names and lengths, so it is crucial to use a chrom size file consistent "
    "to the reference fasta file ever since mapping. "
    "ALLCools functions will not change or infer chromosome names."
)

# Description of the output compression level parameter.
compress_level_doc = "Compression level for the output file"

# Description of the per-site maximum coverage filter.
cov_cutoff_doc = "Max cov filter for a single site in ALLC. Sites with cov > cov_cutoff will be skipped."

# Description of the parallel process count parameter.
cpu_basic_doc = "Number of processes to use in parallel."

# The stray "[docs]" link text left by the HTML source viewer has been removed
# from each assignment below; it is not Python and prevented parsing.

# Description of the parameter taking several mC context patterns.
mc_contexts_doc = (
    "Space separated mC context patterns to extract from ALLC. "
    "The context length should be the same as ALLC file context. "
    "Context pattern follows IUPAC nucleotide code, e.g. N for ATCG, H for ATC, Y for CT."
)

# Description of the single mC context parameter used by generate_mcad.
# The second fragment now ends with a space so the concatenated text reads
# "... Y for CT. Note that ..." instead of "... Y for CT.Note that ...".
mc_context_mcad_doc = (
    "mC context pattern to extract from ALLC. "
    "Context pattern follows IUPAC nucleotide code, e.g. N for ATCG, H for ATC, Y for CT. "
    "Note that generate_mcad only take one mC context"
)

# Description of the reference genome FASTA parameter.
# The samtools subcommand that builds the .fai index is "faidx"
# (the text previously said "fadix", which is not a samtools command).
reference_fasta_doc = (
    "Path to 1 genome reference FASTA file (the one used for mapping), "
    "use samtools faidx to build .fai index first. Do not compress that file."
)

# The stray "[docs]" link text left by the HTML source viewer has been removed
# from each assignment below; it is not Python and prevented parsing.

# Description of the names given to the BED files in region_bed_paths.
region_bed_names_doc = (
    "Space separated names for each BED file provided in region_bed_paths."
)

# Description of the parameter taking several region BED files.
region_bed_paths_doc = (
    "Arbitrary genomic regions can be defined in several BED files to count on. "
    "Space separated paths to each BED files, "
    "The fourth column of the BED file should be unique id of the regions."
)

# Description of the parameter taking a single region BED file.
region_bed_path_mcad_doc = (
    "Arbitrary genomic regions can be defined in one BED file to count on. "
    "The fourth column of the BED file should be unique id of the regions."
)

# Description of the tabix-style region restriction parameter.
region_doc = (
    "Only extract records from certain genome region(s) via tabix, "
    "multiple region can be provided in tabix form. If region is not None, will not run in parallel"
)

# Description of the flag controlling rows with unknown chromosomes.
remove_additional_chrom_doc = (
    "Whether to remove rows with unknown chromosome instead of raising KeyError"
)

# Description of the two-column (file_uid, file_path) RNA BAM table.
rna_table_doc = (
    "This is only for mCT data when we have RNA BAM file for each single cell. "
    "Contain all the RNA BAM file information in 2 columns: 1. file_uid, 2. file_path. No header."
)

# Description of the flag marking ALLC input that carries SNP information.
snp_doc = "If true, means the input allc contain snp information, and the allc processing will take care that."

# Description of the flag that counts the two strands separately.
split_strand_doc = "If true, Watson (+) and Crick (-) strands will be count separately"

# Description of the strand handling modes: both, split, merge.
strandness_doc = (
    "What to do with strand information, possible values are: "
    "1. both: save +/- strand together in one file; "
    "2. split: save +/- strand into two separate files, with suffix contain Watson (+) and Crick (-); "
    "3. merge: This will only merge the count on adjacent CpG in +/- strands, "
    "only work for CpG like context. For non-CG context, its the same as both."
)

# The stray "[docs]" link text left by the HTML source viewer has been removed
# from each assignment below; it is not Python and prevented parsing.

# Summary text for the MCDS dataset generation command.
generate_dataset_doc = (
    "Generate MCDS dataset from a list of ALLC files (recorded in the allc_table). "
    "Multiple region sets, methylation contexts and quantification types can be included in one command."
)

# Description of the observation dimension name parameter.
generate_dataset_obs_dim_doc = 'Name of the observation dimension.'

# Description of the chunk size used to split the allc_table.
generate_dataset_chunk_size_doc = 'Chunk allc_table with chunk_size when generate dataset in parallel'

# Description of the repeatable "--regions" parameter.
# "forth column" was corrected to "fourth column" (two occurrences), matching
# the wording already used by the region BED descriptions in this module.
generate_dataset_regions_doc = (
    'Definition of genomic regions in the form of "--regions {region_name} {region_definition}". '
    'This parameter can be specified multiple times, to allow quantification of multiple region sets '
    'in the same MCDS dataset. Several cases are allowed: '
    '1) a integer number means fix-sized genomic bins, region bed and region id will be generated '
    'automatically based on the chrom_size_path parameter (e.g., "--regions chrom100k 100000"); '
    '2) a path to a three-column bed file, in this case, '
    'a fourth column containing region id in the form of {region_name}_{i} will be added automatically '
    '(e.g., "--regions gene /path/to/gene_bed_no_id.bed", '
    'where the bed file only has chrom, start, end columns); '
    '3) a path to a four-column bed file, in this case, the fourth column will be treated as region id '
    'and the region ids must be UNIQUE. (e.g., "--regions gene /path/to/gene_bed_with_id.bed", '
    'where the bed file has chrom, start, end, id columns).'
)

# Description of the repeatable "--quantifiers" parameter, with examples.
generate_dataset_quantifiers_doc = (
    'Definition of genome region quantifiers in the form of '
    '"--quantifiers {region_name} {quant_type} {mc_contexts} {optional_parameter}". '
    'The region_name determines which region set this quantifier applies to, '
    'region_name must be defined by "--regions" parameter. '
    'The quant_type specify which quantifiers, it must be in ["count", "hypo-score", "hyper-score"]. '
    'The mc_contexts specify a comma separated mC context list, '
    'it must be the same size as the ALLC table, and uses IUPAC base abbreviation. '
    '--quantifiers parameter can be specified multiple times, '
    'to allow different quantification for different region sets, '
    'or multiple quantification for the same region set. '
    'Some examples: '
    '1) To quantify raw counts of a region set in mCG and mCH context: '
    '"--quantifiers gene count CGN,CHN" '
    '2) To quantify the mCG hypo-methylation score of chrom 5Kb bins: '
    '"--quantifiers chrom5k hypo-score CGN cutoff=0.9", '
    'by default, cutoff=0.9, so the last part is optional. '
    '3) To ALSO quantify the mCG raw counts of chrom 5Kb bins in the same MCDS, '
    'just specify another quantifiers in the same command: '
    '"--quantifiers chrom5k count CGN", note the count matrix of chrom5k will be large. '
    'Its not usually needed, but you have the option if needed.'
)

# Description strings for the table-to-ALLC conversion and its parameters.
# The stray "[docs]" link text left by the HTML source viewer has been removed
# from each assignment below; it is not Python and prevented parsing.
# Multi-part strings use parentheses instead of backslash continuations.
table_to_allc_doc = (
    'Convert different kinds of methylation table into ALLC format. '
    'Currently, only plain text table is accepted.'
)
table_to_allc_input_path = 'input path of the table'
table_to_allc_output_prefix = 'output prefix of the ALLC table'
table_to_allc_sep = 'character to separate columns in the table'
table_to_allc_header = 'Whether the table contains header line or not'
table_to_allc_chunk_size = 'chunk_size to perform conversion'
table_to_allc_chrom = 'the chromosome column number, 0-based index'
table_to_allc_pos = 'the position column number, 0-based index'
table_to_allc_strand = (
    'the strand column number, 0-based index. '
    'If not provided, will infer automatically based on the fasta_path'
)
# "will inter automatically" was corrected to "will infer automatically",
# matching the wording of table_to_allc_strand.
table_to_allc_context = (
    'the cytosine context column number, 0-based index. '
    'If not provided, will infer automatically based on the fasta_path'
)
table_to_allc_mc = 'the methylated cytosine count column number, 0-based index.'
table_to_allc_uc = 'the unmethylated cytosine count column number, 0-based index.'
table_to_allc_cov = 'the total cytosine coverage count column number, 0-based index.'
table_to_allc_mc_frac = 'the methylation fraction column number, 0-based index.'
table_to_allc_pseudo_count = (
    'Use this pseudo_count number as the total cytosine coverage count, '
    'if the "cov" column is missing and "mc_frac" column is provided.'
)
table_to_allc_fasta_path = (
    'the genome FASTA file path, '
    'required if either "strand" or "context" column is missing.'
)
table_to_allc_num_upstream_bases = 'number of up stream bases to include when get cytosine context.'
table_to_allc_num_downstream_bases = 'number of down stream bases to include when get cytosine context.'
table_to_allc_add_chr = 'whether add "chr" before the chromosome name.'
table_to_allc_sort = 'whether sort the ALLC table after conversion.'


def doc_params(**kwds):
    """Build a decorator that fills ``{name}`` placeholders in a docstring.

    The decorated object's ``__doc__`` is passed through ``textwrap.dedent``
    and then ``str.format(**kwds)``, and the result is stored back on the
    object.

    Docstrings of decorated objects should start with a backslash directly
    after the opening quotes, so that the first text line carries the same
    indentation as the rest and ``dedent`` can strip it. Because
    ``str.format`` is used, literal braces in such a docstring must be
    doubled.

    Parameters
    ----------
    **kwds
        Placeholder names mapped to the text to substitute for them.

    Returns
    -------
    callable
        A decorator that rewrites ``obj.__doc__`` in place and returns the
        same ``obj``.

    Raises
    ------
    KeyError
        Raised by the returned decorator when the docstring references a
        placeholder that is not present in ``kwds``.
    """

    def dec(obj):
        doc = obj.__doc__
        # __doc__ is None when the object has no docstring or when Python runs
        # with -OO; dedent(None) would raise TypeError, so leave it untouched.
        if doc is not None:
            obj.__doc__ = dedent(doc).format(**kwds)
        return obj

    return dec