import os
import pickle
import logging
import re
from intervaltree import IntervalTree # , Interval
import pandas as pd
from ._transcriptome_io import import_ref_transcripts
from .gene import Gene
from ._transcriptome_filter import DEFAULT_GENE_FILTER, DEFAULT_TRANSCRIPT_FILTER, DEFAULT_REF_TRANSCRIPT_FILTER, ANNOTATION_VOCABULARY, SPLICE_CATEGORY
from . import __version__
logger = logging.getLogger('isotools')
# as this class has diverse functionality, its split among:
# transcriptome.py (this file- initialization and user level basic functions)
# _transcriptome_io.py (input/output primary data files/tables)
# _transcriptome_stats.py (statistical methods)
# _trnascriptome_plots.py (plots)
# _transcriptome_filter.py (gene/transcript iteration and filtering)
[docs]class Transcriptome:
'''Contains sequencing data and annotation for Long Read Transcriptome Sequencing (LRTS) Experiments.
'''
# initialization and save/restore data
def __init__(self, **kwargs):
'''Constructor method'''
if 'data' in kwargs:
self.data = kwargs['data']
self.infos = kwargs.get('infos', {})
self.chimeric = kwargs.get('chimeric', {})
self.filter = kwargs.get('filter', {})
assert 'reference_file' in self.infos
self.make_index()
[docs] @classmethod
def from_reference(cls, reference_file, file_format='auto', **kwargs):
'''Creates a Transcriptome object by importing reference annotation.
:param reference_file: Reference file in gff3 format or pickle file to restore previously imported annotation
:type reference_file: str
:param file_format: Specify the file format of the provided reference_file.
If set to "auto" the file type is infrered from the extension.
:param chromosome: If reference file is gtf/gff, restrict import on specified chromosomes
:param infer_transcrpts: If reference file is gtf, genes and transcripts are infered from "exon" entries, no specific transcript '''
if file_format == 'auto':
file_format = os.path.splitext(reference_file)[1].lstrip('.')
if file_format == 'gz':
file_format = os.path.splitext(reference_file[:-3])[1].lstrip('.')
if file_format in ('gff', 'gff3', 'gtf'):
logger.info('importing reference from %s file %s', file_format, reference_file)
tr = cls()
tr.chimeric = {}
tr.data = import_ref_transcripts(reference_file, tr, file_format, **kwargs)
tr.infos = {'reference_file': reference_file, 'isotools_version': __version__}
tr.filter = {'gene': DEFAULT_GENE_FILTER.copy(),
'transcript': DEFAULT_TRANSCRIPT_FILTER.copy(),
'reference': DEFAULT_REF_TRANSCRIPT_FILTER.copy()}
for subcat in ANNOTATION_VOCABULARY:
tag = '_'.join(re.findall(r'\b\w+\b', subcat)).upper()
if tag[0].isdigit():
tag = '_'+tag
tr.filter['transcript'][tag] = f'"{subcat}" in annotation[1]'
for i, cat in enumerate(SPLICE_CATEGORY):
tr.filter['transcript'][cat] = f'annotation[0]=={i}'
elif file_format == 'pkl':
# warn if kwargs are specified: kwargs are ignored
if kwargs:
logger.warning("The following parameters are ignored when loading reference from pkl: %s", ", ".join(kwargs))
tr = cls.load(reference_file)
if 'sample_table' in tr.infos:
logger.warning('the pickle file seems to contain sample information... extracting refrence')
tr = tr._extract_reference()
else:
raise ValueError('invalid file format %s of file %s' % (file_format, reference_file))
tr.make_index()
return tr
[docs] def save(self, pickle_file=None):
'''Saves transcriptome information (including reference) in a pickle file.
:param pickle_file: Filename to save data'''
if pickle_file is None:
pickle_file = self.infos['out_file_name']+'.isotools.pkl' # key error if not set
logger.info('saving transcriptome to %s', pickle_file)
pickle.dump(self, open(pickle_file, 'wb'))
[docs] @classmethod
def load(cls, pickle_file):
'''Restores transcriptome information from a pickle file.
:param pickle_file: Filename to restore data'''
logger.info('loading transcriptome from %s', pickle_file)
tr = pickle.load(open(pickle_file, 'rb'))
pickled_version = tr.infos.get('isotools_version', '<0.2.6')
if pickled_version != __version__:
logger.warning('This is isotools version %s, but data has been pickled with version %s, which may be incompatible', __version__, pickled_version)
tr.make_index()
return tr
[docs] def save_reference(self, pickle_file=None):
'''Saves the reference information of a transcriptome in a pickle file.
:param pickle_file: Filename to save data'''
if pickle_file is None:
pickle_file = self.infos['reference_file']+'.isotools.pkl'
logger.info('saving reference to %s', pickle_file)
ref_tr = self._extract_reference()
pickle.dump(ref_tr, open(pickle_file, 'wb'))
def _extract_reference(self):
if not 'sample_table' not in self.infos:
return self # only reference info - assume that self.data only contains reference data
# make a new transcriptome
ref_info = {k: v for k, v in self.infos.items() if k in ['reference_file', 'isotools_version']}
ref_tr = type(self)(data={}, infos=ref_info, filter=self.filter)
# extract the reference genes and link them to the new ref_tr
keep = {'ID', 'chr', 'strand', 'name', 'reference'} # no coverage, segment_graph, transcripts
for chrom, tree in self.data.items():
ref_tr.data[chrom] = IntervalTree(Gene(g.start, g.end, {k: v
for k, v in g.data.items() if k in keep}, ref_tr) for g in tree if g.is_annotated)
ref_tr.make_index()
return ref_tr
[docs] def make_index(self):
'''Updates the index of gene names and ids (e.g. used by the the [] operator).'''
idx = dict()
for g in self:
if g.id in idx: # at least id should be unique - maybe raise exception?
logger.warning('%s seems to be ambigous: %s vs %s', g.id, str(idx[g.id]), str(g))
idx[g.name] = g
idx[g.id] = g
self._idx = idx
# basic user level functionality
[docs] def __getitem__(self, key):
'''
Syntax: self[key]
:param key: May either be the gene name or the gene id
:return: The gene specified by key.'''
return self._idx[key]
[docs] def __len__(self):
'''Syntax: len(self)
:return: The number of genes.'''
return self.n_genes
[docs] def __contains__(self, key):
''' Syntax: key in self
Checks whether key is in self.
:param key: May either be the gene name or the gene id'''
return key in self._idx
[docs] def remove_chromosome(self, chromosome):
'''Deletes the chromosome from the transcriptome
:param chromosome: Name of the chromosome to remove'''
del self.data[chromosome]
self.make_index()
def _get_sample_idx(self, name_column='name'):
'a dict with group names as keys and index lists as values'
return {sa: i for i, sa in enumerate(self.sample_table[name_column])}
@property
def sample_table(self):
'''The sample table contains sample names, group information, long read coverage, as well as all other potentially
relevant information on the samples.'''
try:
return self.infos['sample_table']
except KeyError:
return pd.DataFrame(columns=['name', 'file', 'group', 'nonchimeric_reads', 'chimeric_reads'], dtype='object')
@property
def samples(self) -> list:
'''An ordered list of sample names.'''
return list(self.sample_table.name)
[docs] def groups(self, by='group') -> dict:
'''Get sample groups as defined in columns of the sample table.
:param by: A column name of the sample table that defines the grouping.
:return: Dict with groupnames as keys and list of sample names as values.
'''
return dict(self.sample_table.groupby(by)['name'].apply(list))
@property
def n_transcripts(self) -> int:
'''The total number of transcripts isoforms.'''
if self.data is None:
return 0
return sum(g.n_transcripts for g in self)
@property
def n_genes(self) -> int:
'''The total number of genes.'''
if self.data is None:
return 0
return sum((len(t) for t in self.data.values()))
@property
def novel_genes(self) -> int: # this is used for id assignment
'''The total number of novel (not reference) genes.'''
try:
return self.infos['novel_counter']
except KeyError:
self.infos['novel_counter'] = 0
return 0
@property
def chromosomes(self) -> list:
'''The list of chromosome names.'''
return list(self.data)
def _add_novel_gene(self, chrom, start, end, strand, info, novel_prefix='PB_novel_'):
n_novel = self.novel_genes
info.update({'chr': chrom, 'ID': f'{novel_prefix}{n_novel+1:05d}', 'strand': strand})
g = Gene(start, end, info, self)
self.data.setdefault(chrom, IntervalTree()).add(g)
self.infos['novel_counter'] += 1
return g
def __str__(self):
return '{} object with {} genes and {} transcripts'.format(type(self).__name__, self.n_genes, self.n_transcripts)
def __repr__(self):
return object.__repr__(self)
def __iter__(self):
return (gene for tree in self.data.values() for gene in tree)
# IO: load new data from primary data files
from ._transcriptome_io import add_sample_from_bam, add_sample_from_csv, remove_samples, add_short_read_coverage, \
remove_short_read_coverage, collapse_immune_genes
# IO: output data as tables or other human readable format
from ._transcriptome_io import gene_table, transcript_table, chimeric_table, write_gtf, export_alternative_splicing
# filtering functionality and iterators
from ._transcriptome_filter import add_qc_metrics, add_filter, remove_filter, iter_genes, iter_transcripts, iter_ref_transcripts
# statistic: differential splicing, alternative_splicing_events
from ._transcriptome_stats import die_test, altsplice_test, coordination_test, alternative_splicing_events, rarefaction
# statistic: summary tables (can be used as input to plot_bar / plot_dist)
from ._transcriptome_stats import altsplice_stats, filter_stats, transcript_length_hist, transcript_coverage_hist,\
transcripts_per_gene_hist, exons_per_transcript_hist, downstream_a_hist, direct_repeat_hist
# protein domain annotation
from .domains import add_hmmer_domains, add_annotation_domains