Source code for topicpy.geneontology.geneontology

#  Copyright (c) 2020 fvalle
#
#  Permission is hereby granted, free of charge, to any person
#  obtaining a copy of this software and associated documentation
#  files (the "Software"), to deal in the Software without
#  restriction, including without limitation the rights to use,
#  copy, modify, merge, publish, distribute, sublicense, and/or sell
#  copies of the Software, and to permit persons to whom the
#  Software is furnished to do so, subject to the following
#  conditions:
#
#  The above copyright notice and this permission notice shall be
#  included in all copies or substantial portions of the Software.
#
#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
#  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
#  OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
#  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
#  HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
#  WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
#  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
#  OTHER DEALINGS IN THE SOFTWARE.

import sys

import gseapy as gs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from topicpy.tableanalyser import get_symbol


[docs]def get_ontology_df(topic, cutoff=0.05, threshhold=5e-1, gene_sets = ['GO_Molecular_Function_2018', 'GO_Biological_Process_2018', 'GO_Cellular_Component_2018', 'Human_Phenotype_Ontology', 'GTEx_Tissue_Sample_Gene_Expression_Profiles_up', 'GTEx_Tissue_Sample_Gene_Expression_Profiles_down', 'Tissue_Protein_Expression_from_Human_Proteome_Map', 'KEGG_2019_Human', 'NCI-60_Cancer_Cell_Lines' ], background=None) -> pd.DataFrame: """ :param topic: list of genes :param background: enrichment test background :param cutoff: Enrichments cutoff :param threshhold: threshold on Adjusted P-value :return: DataFrame with terms and P-vals """ sets = ','.join(gene_sets) if background is None: background='hsapiens_gene_ensembl' topic = [g for g in topic if str(g)!='nan'] gene_ontology = gs.enrichr(list(topic), gene_sets=sets, cutoff=cutoff, background=background).results return gene_ontology[gene_ontology['Adjusted P-value'] < threshhold][['Term', 'Adjusted P-value', 'Gene_set']]
def ensg_to_symbol(series): """ convert gene ENSG to Symbol :param series: series of ENSGs :return: symbols """ symbols = [] for g in series: try: symbols.append(get_symbol(g)) except: print(*sys.exc_info()) symbols = np.array(symbols) return symbols[[s!='nan' for s in symbols]]
[docs]def save_plot_Pvalues(df_topics, l, directory, algorithm): """ :param df_topics: Topics DataFrame :param l: level of the analysis :param directory: """ topic_pvalues = [] topic_gos = [] for itopic, topic in enumerate(df_topics.columns): try: enriched_topic = pd.read_csv("%s/%s/gsea_level_%d_topic_%d.csv" % (directory, algorithm, l, itopic+1)) if len(enriched_topic.index) > 0: p_val = np.sort(enriched_topic['Adjusted P-value'])[0] topic_pvalues.append(-np.log10(p_val)) for goc in enriched_topic['Gene_set'][:10].unique(): topic_gos.append(goc) except: print(*sys.exc_info()) fig = plt.figure() #x = np.arange(1, 1 + len(topic_pvalues)) c, _, _ = plt.hist(topic_pvalues, histtype='step', lw=2) plt.plot([-np.log10(0.05) for _ in np.linspace(1, 10, num=10)], np.arange(0, np.max(c) + 5, (np.max(c) + 5) / 10), ls='--', lw=5, label="$\\alpha=0.05$") plt.xlabel('-log(P-value)', fontsize=20) plt.ylabel("number of topics", fontsize=20) # plt.ylim(0,0.055) # plt.yscale('log') plt.legend(fontsize=18) fig.savefig("%s/pvaluescrosstopic(%d).pdf" % (directory, l)) fig = plt.figure(figsize=(20, 10)) gos, goscounts = np.unique(topic_gos, return_counts=True) from textwrap import wrap plt.barh(["\n".join(wrap(str(l).replace('_', ' '), 20)) for l in gos], goscounts) plt.yticks(fontsize=15) plt.show() fig.savefig("%s/pvaluecategories(%d).pdf" % (directory, l))
[docs]def topic_analysis(directory, l, algorithm="topsbm", background=None, verbose=True, save_Pvalues=True): """ :param directory: where to find files :param l: level of the analisys :param algorithm: name of folder and files containing topics table (e.g. path/to/topsbm/topsbm_level_3_topics.csv) :param background: List of background genes :param verbose: verbosity :param save_Pvalues: save data for P-values plot """ df_topics = pd.read_csv("%s/%s/%s_level_%d_topics.csv" % (directory, algorithm, algorithm, l)) enriched_topic = None for itopic, topic in enumerate(df_topics.columns): try: enriched_topic = pd.read_csv("%s/%s/gsea_level_%d_topic_%d.csv" % (directory, algorithm, l, itopic+1), index_col=[0]) print(topic) except: try: symbols = ensg_to_symbol(df_topics.loc[:, topic].dropna().values) print(topic) enriched_topic = get_ontology_df(symbols, background=background).sort_values(by=['Adjusted P-value'], ascending=True) enriched_topic = enriched_topic.loc[enriched_topic.index.values[:20], :] enriched_topic.to_csv("%s/%s/gsea_level_%d_topic_%d.csv" % (directory, algorithm, l, itopic+1)) except: print(*sys.exc_info()) if verbose: print(enriched_topic) if save_Pvalues: save_plot_Pvalues(df_topics, l, directory, algorithm)