import pathlib
import pandas as pd
[docs]def read_gtf(gtf_path):
gtf = pd.read_csv(gtf_path,
comment='#',
sep='\t',
header=None,
names=[
'chrom', 'source', 'feature', 'start', 'end',
'score', 'strand', 'phase', 'annotation'
])
return gtf
[docs]def subset_gtf(gtf, regions, output_path=None, select_feature=None):
if isinstance(gtf, (str, pathlib.Path)):
gtf = read_gtf(gtf)
if (len(regions) == 3) and isinstance(regions[1], int):
# assume this is a single region
regions = [regions]
if select_feature is not None:
gtf = gtf[gtf['feature'].isin(select_feature)].copy()
use_rows = None
for region in regions:
chrom, start, end = region
judge = (gtf['chrom'] == chrom) & (gtf['start'] < end) & (gtf['end'] > start)
if use_rows is None:
use_rows = judge
else:
use_rows = use_rows | judge
gtf_sub = gtf[use_rows]
if output_path is not None:
gtf_sub.to_csv(output_path, sep='\t', index=None, header=None)
return gtf_sub