Source code for topicpy.hypergeom.hypergeom

#  Copyright (c) 2020 fvalle
#
#  Permission is hereby granted, free of charge, to any person
#  obtaining a copy of this software and associated documentation
#  files (the "Software"), to deal in the Software without
#  restriction, including without limitation the rights to use,
#  copy, modify, merge, publish, distribute, sublicense, and/or sell
#  copies of the Software, and to permit persons to whom the
#  Software is furnished to do so, subject to the following
#  conditions:
#
#  The above copyright notice and this permission notice shall be
#  included in all copies or substantial portions of the Software.
#
#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
#  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
#  OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
#  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
#  HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
#  WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
#  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
#  OTHER DEALINGS IN THE SOFTWARE.

import pandas as pd
import numpy as np
from scipy.stats import hypergeom


def parameters_for_hypergeometric(
    list_1: pd.Series, list_2: pd.Series
) -> "tuple[pd.DataFrame, int, dict, dict, tuple[pd.Series, pd.Series]]":
    """
    Compute the inputs of a hypergeometric test between two labelled lists.

    Each series maps an element (its index label) to the group it was
    assigned to (its value). Index labels are expected to be unique within
    each series.

    :param list_1: series whose values are the group of each element
    :param list_2: series whose values are the group of each element
    :return:
        - x num of successes: DataFrame with one row per unique value of
          ``list_1`` and one column per unique value of ``list_2``; each cell
          counts the elements present in both indexes with that pair of values
        - M population size: number of elements of ``list_1`` whose index
          label also appears in ``list_2``
        - k successes in population: dict ``{value of list_2: occurrences}``
        - N sample size: dict ``{value of list_1: occurrences}``
        - (list_1, list_2) tuple of original lists

    Example::

        >>> l1 = pd.Series(index=["ENSG00000000123", "ENSG00000000456",
        ...                       "ENSG00000000789", "ENSG00000000XXX"],
        ...                data=["c1", "c1", "c1", "c2"], dtype=str)
        >>> l2 = pd.Series(index=["ENSG00000000123", "ENSG00000000456",
        ...                       "ENSG00000000789"],
        ...                data=["c1", "c1", "c1"], dtype=str)
        >>> x, M, k, N, _ = parameters_for_hypergeometric(l1, l2)
        >>> x
            c1
        c1   3
        c2   0
        >>> M
        3
        >>> k
        {'c1': 3}
        >>> N
        {'c1': 3, 'c2': 1}
    """
    population_size = int(list_1.index.isin(list_2.index).sum())
    pop_successes = {
        module: int((list_2 == module).sum()) for module in list_2.unique()
    }
    sample_sizes = {
        topic: int((list_1 == topic).sum()) for topic in list_1.unique()
    }

    # Build the table directly with an integer dtype: an empty object-dtype
    # frame followed by fillna(0) depends on pandas silently downcasting,
    # and leaves plain Python ints in the cells when it does not.
    num_successes = pd.DataFrame(
        0, index=list_1.unique(), columns=list_2.unique(), dtype=int
    )

    # Only elements present in both series contribute to the counts.
    shared_elements = list_2.index[list_2.index.isin(list_1.index)]
    for element in shared_elements:
        num_successes.at[list_1.loc[element], list_2.loc[element]] += 1

    return (
        num_successes,
        population_size,
        pop_successes,
        sample_sizes,
        (list_1, list_2),
    )


def build_map(num_successes, population_size, pop_successes, sample_sizes, lists, last_name=None):
    """
    Build the matrix of -log10(P-value) of the hypergeometric tests.

    The arguments are the values returned by
    ``parameters_for_hypergeometric``, in the same order.

    :param num_successes: DataFrame of overlap counts (rows: groups of the
        first list, columns: groups of the second list)
    :param population_size: population size (``M``)
    :param pop_successes: dict ``{column label of num_successes: successes
        in population}`` (``k``)
    :param sample_sizes: dict ``{row label of num_successes: sample size}``
        (``N``)
    :param lists: tuple ``(list_1, list_2)`` of the original series; the
        number of unique values of each fixes the shape of the result
    :param last_name: if not ``None``, the columns of the result keep the
        labels of ``num_successes`` instead of ``"Topic 1", "Topic 2", ...``
    :return: DataFrame of floats whose rows are labelled ``"Topic i"`` and
        whose cells hold ``-log10(P(X >= x))``; cells that are never visited
        keep the initial value 0.5, and a P-value that underflows to 0 gives
        ``inf``
    """
    list_1, list_2 = lists
    row_labels = [f"Topic {d + 1}" for d in range(len(list_1.unique()))]
    column_labels = [f"Topic {d + 1}" for d in range(len(list_2.unique()))]

    # Create the frame as float from the start instead of filling an empty
    # object-dtype frame, so the result is numeric on every pandas version.
    df_cmap = pd.DataFrame(0.5, index=row_labels, columns=column_labels, dtype=float)
    if last_name is not None:
        df_cmap.columns = num_successes.columns

    M = population_size  # pop size
    for module, module_successes in zip(df_cmap.columns, num_successes.columns):
        k = pop_successes[module_successes]  # successes in pop
        for topic, topic_successes in zip(df_cmap.index, num_successes.index):
            # int() accepts both NumPy integers and plain Python ints, whereas
            # .astype() only exists on NumPy scalars.
            x = int(num_successes.at[topic_successes, module_successes])  # number of successes
            N = sample_sizes[topic_successes]  # sample size
            # sf(x - 1) is P(X > x - 1), i.e. P(X >= x)
            pval = hypergeom.sf(x - 1, M, k, N)
            df_cmap.at[topic, module] = -np.log10(float(pval))
    return df_cmap


def plot_map(df_cmap, first_name="topsbm", last_name="lda", *args, **kwargs):
    """
    Draw the -log10(P-value) matrix as a heatmap, save it and show it.

    The figure is written to ``topics_logp_{first_name}_{last_name}.pdf``
    (a relative path) and then displayed with ``plt.show()``.

    :param df_cmap: DataFrame to plot, e.g. the output of ``build_map``
    :param first_name: label of the y axis, also used in the file name
    :param last_name: label of the x axis, also used in the file name
    :param args: extra positional arguments forwarded to ``sns.clustermap``
    :param kwargs: extra keyword arguments forwarded to ``sns.clustermap``
    :return: the object returned by ``sns.clustermap``
    """
    import seaborn as sns
    import matplotlib.pyplot as plt

    # One colour per column, drawn as a strip above the heatmap.
    network_pal = sns.color_palette('husl', n_colors=len(df_cmap.columns))
    network_lut = dict(zip(df_cmap.columns, network_pal))
    network_col = df_cmap.columns.map(network_lut)

    # Clustering is disabled on both axes, so rows and columns keep the order
    # they have in df_cmap.
    cm = sns.clustermap(
        df_cmap,
        *args,
        row_cluster=False,
        col_cluster=False,
        metric='euclidean',
        vmin=0,
        vmax=30,
        cmap='Blues_r',
        col_colors=network_col,
        mask=False,
        cbar_pos=(1.05, 0.05, 0.05, 0.7),
        **kwargs,
    )

    ax = cm.ax_heatmap
    ax.tick_params(labelsize=15)
    ax.set_ylabel(first_name, fontsize=35)
    ax.set_xlabel(last_name, fontsize=35)
    # Ticks are relabelled by position, whatever the labels of df_cmap are.
    ax.set_xticklabels([f"Topic {t + 1}" for t, _ in enumerate(df_cmap.columns)], rotation=75)
    ax.yaxis.tick_left()
    ax.yaxis.set_label_position("left")
    ax.set_yticklabels([f"Topic {t + 1}" for t, _ in enumerate(df_cmap.index)], rotation=0)

    cax = cm.ax_cbar
    cax.tick_params(labelsize=35)
    cax.set_title("-Log(P-value)", fontsize=30)

    cm.fig.suptitle('Algorithm comparison', fontsize=40)
    cm.savefig(f"topics_logp_{first_name}_{last_name}.pdf")
    plt.show()
    return cm