from textwrap import dedent
[docs]idx_doc = (
"If true, save an methylpy chromosome index for back compatibility. "
"If you only use methylpy to call DMR, this don't need to be True."
)
[docs]allc_path_doc = "Path to 1 ALLC file"
[docs]allc_paths_doc = (
"Single ALLC path contain wildcard OR multiple space separated ALLC paths "
"OR a file contains 1 ALLC path in each row."
)
[docs]allc_table_doc = (
"Contain all the ALLC file information in two tab-separated columns: "
"1. file_uid, 2. file_path. No header"
)
[docs]binarize_doc = (
"If set, binarize each single site in each individual ALLC file. "
"This means each cytosine will only contribute at most 1 cov and 0/1 mc, "
"this is suitable to account for single cell ALLC R1 R2 overlap issue, "
"Only use this on single cell ALLC, not bulk ALLC."
)
[docs]bin_sizes_doc = (
"Fix-size genomic bins can be defined by bin_sizes and chrom_size_path. "
"Space separated sizes of genome bins, each size will be count separately."
)
[docs]bw_bin_sizes_doc = "Bin size of the BigWig files."
[docs]chrom_size_path_doc = (
"Path to UCSC chrom size file. "
"This can be generated from the genome fasta or downloaded via UCSC fetchChromSizes tools. "
"All ALLCools functions will refer to this file whenever possible to check for "
"chromosome names and lengths, so it is crucial to use a chrom size file consistent "
"to the reference fasta file ever since mapping. "
"ALLCools functions will not change or infer chromosome names."
)
[docs]compress_level_doc = "Compression level for the output file"
[docs]cov_cutoff_doc = "Max cov filter for a single site in ALLC. Sites with cov > cov_cutoff will be skipped."
[docs]cpu_basic_doc = "Number of processes to use in parallel."
[docs]mc_contexts_doc = (
"Space separated mC context patterns to extract from ALLC. "
"The context length should be the same as ALLC file context. "
"Context pattern follows IUPAC nucleotide code, e.g. N for ATCG, H for ATC, Y for CT."
)
[docs]mc_context_mcad_doc = (
"mC context pattern to extract from ALLC. "
"Context pattern follows IUPAC nucleotide code, e.g. N for ATCG, H for ATC, Y for CT."
"Note that generate_mcad only take one mC context"
)
[docs]reference_fasta_doc = (
"Path to 1 genome reference FASTA file (the one used for mapping), "
"use samtools fadix to build .fai index first. Do not compress that file."
)
[docs]region_bed_names_doc = (
"Space separated names for each BED file provided in region_bed_paths."
)
[docs]region_bed_paths_doc = (
"Arbitrary genomic regions can be defined in several BED files to count on. "
"Space separated paths to each BED files, "
"The fourth column of the BED file should be unique id of the regions."
)
[docs]region_bed_path_mcad_doc = (
"Arbitrary genomic regions can be defined in one BED file to count on. "
"The fourth column of the BED file should be unique id of the regions."
)
[docs]region_doc = (
"Only extract records from certain genome region(s) via tabix, "
"multiple region can be provided in tabix form. If region is not None, will not run in parallel"
)
[docs]remove_additional_chrom_doc = (
"Whether to remove rows with unknown chromosome instead of raising KeyError"
)
[docs]rna_table_doc = (
"This is only for mCT data when we have RNA BAM file for each single cell. "
"Contain all the RNA BAM file information in 2 columns: 1. file_uid, 2. file_path. No header."
)
[docs]snp_doc = "If true, means the input allc contain snp information, and the allc processing will take care that."
[docs]split_strand_doc = "If true, Watson (+) and Crick (-) strands will be count separately"
[docs]strandness_doc = (
"What to do with strand information, possible values are: "
"1. both: save +/- strand together in one file; "
"2. split: save +/- strand into two separate files, with suffix contain Watson (+) and Crick (-); "
"3. merge: This will only merge the count on adjacent CpG in +/- strands, "
"only work for CpG like context. For non-CG context, its the same as both."
)
[docs]generate_dataset_doc = (
"Generate MCDS dataset from a list of ALLC files (recorded in the allc_table). "
"Multiple region sets, methylation contexts and quantification types can be included in one command."
)
[docs]generate_dataset_obs_dim_doc = 'Name of the observation dimension.'
[docs]generate_dataset_chunk_size_doc = 'Chunk allc_table with chunk_size when generate dataset in parallel'
[docs]generate_dataset_regions_doc = (
'Definition of genomic regions in the form of "--regions {region_name} {region_definition}". '
'This parameter can be specified multiple times, to allow quantification of multiple region sets '
'in the same MCDS dataset. Several cases are allowed: '
'1) a integer number means fix-sized genomic bins, region bed and region id will be generated '
'automatically based on the chrom_size_path parameter (e.g., "--regions chrom100k 100000"); '
'2) a path to a three-column bed file, in this case, '
'a forth column containing region id in the form of {region_name}_{i} will be added automatically '
'(e.g., "--regions gene /path/to/gene_bed_no_id.bed", '
'where the bed file only has chrom, start, end columns); '
'3) a path to a four-column bed file, in this case, the forth column will be treated as region id '
'and the region ids must be UNIQUE. (e.g., "--regions gene /path/to/gene_bed_with_id.bed", '
'where the bed file has chrom, start, end, id columns).'
)
[docs]generate_dataset_quantifiers_doc = (
'Definition of genome region quantifiers in the form of '
'"--quantifiers {region_name} {quant_type} {mc_contexts} {optional_parameter}". '
'The region_name determines which region set this quantifier applies to, '
'region_name must be defined by "--regions" parameter. '
'The quant_type specify which quantifiers, it must be in ["count", "hypo-score", "hyper-score"]. '
'The mc_contexts specify a comma separated mC context list, '
'it must be the same size as the ALLC table, and uses IUPAC base abbreviation. '
'--quantifiers parameter can be specified multiple times, '
'to allow different quantification for different region sets, '
'or multiple quantification for the same region set. '
'Some examples: '
'1) To quantify raw counts of a region set in mCG and mCH context: '
'"--quantifiers gene count CGN,CHN" '
'2) To quantify the mCG hypo-methylation score of chrom 5Kb bins: '
'"--quantifiers chrom5k hypo-score CGN cutoff=0.9", '
'by default, cutoff=0.9, so the last part is optional. '
'3) To ALSO quantify the mCG raw counts of chrom 5Kb bins in the same MCDS, '
'just specify another quantifiers in the same command: '
'"--quantifiers chrom5k count CGN", note the count matrix of chrom5k will be large. '
'Its not usually needed, but you have the option if needed.'
)
[docs]table_to_allc_doc = 'Convert different kinds of methylation table into ALLC format. ' \
'Currently, only plain text table is accepted.'
[docs]table_to_allc_output_prefix = 'output prefix of the ALLC table'
[docs]table_to_allc_sep = 'character to separate columns in the table'
[docs]table_to_allc_chunk_size = 'chunk_size to perform conversion'
[docs]table_to_allc_chrom = 'the chromosome column number, 0-based index'
[docs]table_to_allc_pos = 'the position column number, 0-based index'
[docs]table_to_allc_strand = 'the strand column number, 0-based index. ' \
'If not provided, will infer automatically based on the fasta_path'
[docs]table_to_allc_context = 'the cytosine context column number, 0-based index. ' \
'If not provided, will inter automatically based on the fasta_path'
[docs]table_to_allc_mc = 'the methylated cytosine count column number, 0-based index.'
[docs]table_to_allc_uc = 'the unmethylated cytosine count column number, 0-based index.'
[docs]table_to_allc_cov = 'the total cytosine coverage count column number, 0-based index.'
[docs]table_to_allc_mc_frac = 'the methylation fraction column number, 0-based index.'
[docs]table_to_allc_pseudo_count = 'Use this pseudo_count number as the total cytosine coverage count, ' \
'if the "cov" column is missing and "mc_frac" column is provided.'
[docs]table_to_allc_fasta_path = 'the genome FASTA file path, ' \
'required if either "strand" or "context" column is missing.'
[docs]table_to_allc_num_upstream_bases = 'number of up stream bases to include when get cytosine context.'
[docs]table_to_allc_num_downstream_bases = 'number of down stream bases to include when get cytosine context.'
[docs]table_to_allc_add_chr = 'whether add "chr" before the chromosome name.'
[docs]table_to_allc_sort = 'whether sort the ALLC table after conversion.'
[docs]def doc_params(**kwds):
"""\
Docstrings should start with "\" in the first line for proper formatting.
"""
def dec(obj):
obj.__doc__ = dedent(obj.__doc__).format(**kwds)
return obj
return dec