Source code for isotools.transcriptome

import os
import pickle
import logging
import re
from intervaltree import IntervalTree  # , Interval
import pandas as pd
from ._transcriptome_io import import_ref_transcripts
from .gene import Gene
from ._transcriptome_filter import DEFAULT_GENE_FILTER, DEFAULT_TRANSCRIPT_FILTER, DEFAULT_REF_TRANSCRIPT_FILTER, ANNOTATION_VOCABULARY, SPLICE_CATEGORY
from . import __version__
logger = logging.getLogger('isotools')

# as this class has diverse functionality, its split among:
# transcriptome.py (this file- initialization and user level basic functions)
# _transcriptome_io.py (input/output primary data files/tables)
# _transcriptome_stats.py (statistical methods)
# _trnascriptome_plots.py (plots)
# _transcriptome_filter.py (gene/transcript iteration and filtering)


[docs]class Transcriptome:
    '''Contains sequencing data and annotation for Long Read Transcriptome Sequencing (LRTS) Experiments.
    '''
    # initialization and save/restore data

    def __init__(self, **kwargs):
        '''Constructor method'''
        if 'data' in kwargs:
            self.data = kwargs['data']
            self.infos = kwargs.get('infos', {})
            self.chimeric = kwargs.get('chimeric', {})
            self.filter = kwargs.get('filter', {})
            assert 'reference_file' in self.infos
            self.make_index()

[docs]    @classmethod
    def from_reference(cls, reference_file, file_format='auto', **kwargs):
        '''Creates a Transcriptome object by importing reference annotation.

        :param reference_file: Reference file in gff3 format or pickle file to restore previously imported annotation
        :type reference_file: str
        :param file_format: Specify the file format of the provided reference_file.
            If set to "auto" the file type is infrered from the extension.
        :param chromosome: If reference file is gtf/gff, restrict import on specified chromosomes
        :param infer_transcrpts: If reference file is gtf, genes and transcripts are infered from "exon" entries, no specific transcript '''

        if file_format == 'auto':
            file_format = os.path.splitext(reference_file)[1].lstrip('.')
            if file_format == 'gz':
                file_format = os.path.splitext(reference_file[:-3])[1].lstrip('.')
        if file_format in ('gff', 'gff3', 'gtf'):
            logger.info('importing reference from %s file %s', file_format, reference_file)
            tr = cls()
            tr.chimeric = {}
            tr.data = import_ref_transcripts(reference_file, tr, file_format,  **kwargs)
            tr.infos = {'reference_file': reference_file, 'isotools_version': __version__}
            tr.filter = {'gene': DEFAULT_GENE_FILTER.copy(),
                         'transcript': DEFAULT_TRANSCRIPT_FILTER.copy(),
                         'reference': DEFAULT_REF_TRANSCRIPT_FILTER.copy()}
            for subcat in ANNOTATION_VOCABULARY:
                tag = '_'.join(re.findall(r'\b\w+\b', subcat)).upper()
                if tag[0].isdigit():
                    tag = '_'+tag
                tr.filter['transcript'][tag] = f'"{subcat}" in annotation[1]'
            for i, cat in enumerate(SPLICE_CATEGORY):
                tr.filter['transcript'][cat] = f'annotation[0]=={i}'

        elif file_format == 'pkl':
            # warn if kwargs are specified: kwargs are ignored
            if kwargs:
                logger.warning("The following parameters are ignored when loading reference from pkl: %s", ", ".join(kwargs))
            tr = cls.load(reference_file)
            if 'sample_table' in tr.infos:
                logger.warning('the pickle file seems to contain sample information... extracting refrence')
                tr = tr._extract_reference()
        else:
            raise ValueError('invalid file format %s of file %s' % (file_format, reference_file))
        tr.make_index()
        return tr

[docs]    def save(self, pickle_file=None):
        '''Saves transcriptome information (including reference) in a pickle file.

        :param pickle_file: Filename to save data'''
        if pickle_file is None:
            pickle_file = self.infos['out_file_name']+'.isotools.pkl'  # key error if not set
        logger.info('saving transcriptome to %s', pickle_file)
        pickle.dump(self, open(pickle_file, 'wb'))

[docs]    @classmethod
    def load(cls, pickle_file):
        '''Restores transcriptome information from a pickle file.

        :param pickle_file: Filename to restore data'''

        logger.info('loading transcriptome from %s', pickle_file)
        tr = pickle.load(open(pickle_file, 'rb'))
        pickled_version = tr.infos.get('isotools_version', '<0.2.6')
        if pickled_version != __version__:
            logger.warning('This is isotools version %s, but data has been pickled with version %s, which may be incompatible', __version__, pickled_version)
        tr.make_index()
        return tr

[docs]    def save_reference(self, pickle_file=None):
        '''Saves the reference information of a transcriptome in a pickle file.

        :param pickle_file: Filename to save data'''
        if pickle_file is None:
            pickle_file = self.infos['reference_file']+'.isotools.pkl'
        logger.info('saving reference to %s', pickle_file)
        ref_tr = self._extract_reference()
        pickle.dump(ref_tr, open(pickle_file, 'wb'))

    def _extract_reference(self):
        if not 'sample_table' not in self.infos:
            return self  # only reference info - assume that self.data only contains reference data
        # make a new transcriptome
        ref_info = {k: v for k, v in self.infos.items() if k in ['reference_file', 'isotools_version']}
        ref_tr = type(self)(data={}, infos=ref_info, filter=self.filter)
        # extract the reference genes and link them to the new ref_tr
        keep = {'ID', 'chr', 'strand', 'name', 'reference'}  # no coverage, segment_graph, transcripts
        for chrom, tree in self.data.items():
            ref_tr.data[chrom] = IntervalTree(Gene(g.start, g.end, {k: v
                                              for k, v in g.data.items() if k in keep}, ref_tr) for g in tree if g.is_annotated)
        ref_tr.make_index()
        return ref_tr

[docs]    def make_index(self):
        '''Updates the index of gene names and ids (e.g. used by the the [] operator).'''
        idx = dict()
        for g in self:
            if g.id in idx:  # at least id should be unique - maybe raise exception?
                logger.warning('%s seems to be ambigous: %s vs %s', g.id, str(idx[g.id]), str(g))
            idx[g.name] = g
            idx[g.id] = g
        self._idx = idx

    # basic user level functionality
[docs]    def __getitem__(self, key):
        '''
        Syntax: self[key]

        :param key: May either be the gene name or the gene id
        :return: The gene specified by key.'''
        return self._idx[key]

[docs]    def __len__(self):
        '''Syntax: len(self)

        :return: The number of genes.'''
        return self.n_genes

[docs]    def __contains__(self, key):
        ''' Syntax: key in self

        Checks whether key is in self.

        :param key: May either be the gene name or the gene id'''
        return key in self._idx

[docs]    def remove_chromosome(self, chromosome):
        '''Deletes the chromosome from the transcriptome

        :param chromosome: Name of the chromosome to remove'''
        del self.data[chromosome]
        self.make_index()

    def _get_sample_idx(self, name_column='name'):
        'a dict with group names as keys and index lists as values'
        return {sa: i for i, sa in enumerate(self.sample_table[name_column])}

    @property
    def sample_table(self):
        '''The sample table contains sample names, group information, long read coverage, as well as all other potentially
        relevant information on the samples.'''
        try:
            return self.infos['sample_table']
        except KeyError:
            return pd.DataFrame(columns=['name', 'file', 'group', 'nonchimeric_reads', 'chimeric_reads'], dtype='object')

    @property
    def samples(self) -> list:
        '''An ordered list of sample names.'''
        return list(self.sample_table.name)

[docs]    def groups(self, by='group') -> dict:
        '''Get sample groups as defined in columns of the sample table.

        :param by: A column name of the sample table that defines the grouping.
        :return: Dict with groupnames as keys and list of sample names as values.
        '''
        return dict(self.sample_table.groupby(by)['name'].apply(list))

    @property
    def n_transcripts(self) -> int:
        '''The total number of transcripts isoforms.'''
        if self.data is None:
            return 0
        return sum(g.n_transcripts for g in self)

    @property
    def n_genes(self) -> int:
        '''The total number of genes.'''
        if self.data is None:
            return 0
        return sum((len(t) for t in self.data.values()))

    @property
    def novel_genes(self) -> int:  # this is used for id assignment
        '''The total number of novel (not reference) genes.'''
        try:
            return self.infos['novel_counter']
        except KeyError:
            self.infos['novel_counter'] = 0
            return 0

    @property
    def chromosomes(self) -> list:
        '''The list of chromosome names.'''
        return list(self.data)

    def _add_novel_gene(self, chrom, start, end, strand, info, novel_prefix='PB_novel_'):
        n_novel = self.novel_genes
        info.update({'chr': chrom, 'ID': f'{novel_prefix}{n_novel+1:05d}', 'strand': strand})
        g = Gene(start, end, info, self)
        self.data.setdefault(chrom, IntervalTree()).add(g)
        self.infos['novel_counter'] += 1
        return g

    def __str__(self):
        return '{} object with {} genes and {} transcripts'.format(type(self).__name__, self.n_genes, self.n_transcripts)

    def __repr__(self):
        return object.__repr__(self)

    def __iter__(self):
        return (gene for tree in self.data.values() for gene in tree)

    # IO: load new data from primary data files
    from ._transcriptome_io import add_sample_from_bam, add_sample_from_csv, remove_samples, add_short_read_coverage, \
        remove_short_read_coverage, collapse_immune_genes

    # IO: output data as tables or other human readable format
    from ._transcriptome_io import gene_table, transcript_table, chimeric_table, write_gtf, export_alternative_splicing

    # filtering functionality and iterators
    from ._transcriptome_filter import add_qc_metrics, add_filter, remove_filter, iter_genes, iter_transcripts, iter_ref_transcripts

    # statistic: differential splicing, alternative_splicing_events
    from ._transcriptome_stats import die_test, altsplice_test, coordination_test, alternative_splicing_events, rarefaction

    # statistic: summary tables (can be used as input to plot_bar / plot_dist)
    from ._transcriptome_stats import altsplice_stats, filter_stats, transcript_length_hist, transcript_coverage_hist,\
        transcripts_per_gene_hist, exons_per_transcript_hist, downstream_a_hist, direct_repeat_hist

    # protein domain annotation
    from .domains import add_hmmer_domains, add_annotation_domains