# Source code for topicpy.hypergeom.hypergeom

#  Copyright (c) 2020 fvalle
#
#  Permission is hereby granted, free of charge, to any person
#  obtaining a copy of this software and associated documentation
#  files (the "Software"), to deal in the Software without
#  restriction, including without limitation the rights to use,
#  copy, modify, merge, publish, distribute, sublicense, and/or sell
#  copies of the Software, and to permit persons to whom the
#  Software is furnished to do so, subject to the following
#  conditions:
#
#  The above copyright notice and this permission notice shall be
#  included in all copies or substantial portions of the Software.
#
#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
#  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
#  OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
#  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
#  HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
#  WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
#  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
#  OTHER DEALINGS IN THE SOFTWARE.

import pandas as pd
import numpy as np
from scipy.stats import hypergeom


def parameters_for_hypergeometric(list_1: pd.Series, list_2: pd.Series) -> (pd.DataFrame, int, dict, dict, (pd.Series, pd.Series)):
    """
    Compute the inputs of a hypergeometric test between two labellings.

    Each series maps an element (its index label) to the group it belongs to
    (its value). Index labels are expected to be unique within each series.

    :param list_1: series whose values are the row groups of the count table
    :param list_2: series whose values are the column groups of the count table

    :return:
    - x num of successes: DataFrame (rows: unique values of list_1, columns:
      unique values of list_2) counting the index labels present in both
      series for every (row group, column group) pair
    - M population size: number of entries of list_1 whose label is in list_2
    - k successes in population: dict {group of list_2: number of its entries}
    - N sample size: dict {group of list_1: number of its entries}
    - (list_1, list_2) tuple of original lists

    Note that k and N are counted over the whole series, not only over the
    labels shared by both of them.

    Example:

    l1 = pd.Series(index=["ENSG00000000123", "ENSG00000000456", "ENSG00000000789", "ENSG00000000XXX"], data=["c1", "c1", "c1", "c2"], dtype=str)
    l2 = pd.Series(index=["ENSG00000000123", "ENSG00000000456", "ENSG00000000789"], data=["c1", "c1", "c1"], dtype=str)
    x, M, k, N, _ = parameters_for_hypergeometric(l1, l2)

    >>> x
        c1
    c1   3
    c2   0
    >>> M
    3
    >>> k
    {'c1': 3}
    >>> N
    {'c1': 3, 'c2': 1}
    """
    topics = list_1.unique()
    modules = list_2.unique()

    population_size = int(list_1.index.isin(list_2.index).sum())
    pop_successes = {module: int((list_2 == module).sum()) for module in modules}
    sample_sizes = {topic: int((list_1 == topic).sum()) for topic in topics}

    # Build the table directly as integers: going through an all-NaN object
    # frame and fillna(0) relies on pandas silently downcasting the result.
    num_successes = pd.DataFrame(0, index=topics, columns=modules, dtype="int64")

    # Only labels present in both series contribute to the counts.
    shared_labels = list_2.index[list_2.index.isin(list_1.index)]
    for label in shared_labels:
        num_successes.at[list_1.at[label], list_2.at[label]] += 1

    return num_successes, population_size, pop_successes, sample_sizes, (list_1, list_2)


def build_map(num_successes, population_size, pop_successes, sample_sizes, lists, last_name=None):
    """
    Build the matrix of -log10(P-value) of the hypergeometric test.

    The first five arguments are the values returned, in order, by
    parameters_for_hypergeometric.

    :param num_successes: DataFrame of the number of successes per (row group, column group)
    :param population_size: population size, shared by every test
    :param pop_successes: dict {column group: successes in population}
    :param sample_sizes: dict {row group: sample size}
    :param lists: tuple (list_1, list_2) of the original series; only the
        number of unique values of each one is used, to size the result
    :param last_name: if not None, the columns of the result carry the column
        labels of num_successes instead of "Topic 1", "Topic 2", ...

    :return: DataFrame of floats whose rows are labelled "Topic 1", "Topic 2", ...
        and whose cells hold -log10 of P(X >= x) under the hypergeometric
        distribution. A P-value that underflows to 0 is reported as inf.
    """
    list_1, list_2 = lists
    row_labels = [f"Topic {d}" for d in range(1, len(list_1.unique()) + 1)]
    col_labels = [f"Topic {d}" for d in range(1, len(list_2.unique()) + 1)]
    # Create the frame as float from the start so that it never ends up with
    # object dtype, whatever the pandas version.
    df_cmap = pd.DataFrame(0.5, index=row_labels, columns=col_labels, dtype=float)
    if last_name is not None:
        df_cmap.columns = num_successes.columns

    # Rows and columns of df_cmap are matched to those of num_successes by position.
    for module, module_successes in zip(df_cmap.columns, num_successes.columns):
        k = pop_successes[module_successes]  # successes in pop
        for topic, topic_successes in zip(df_cmap.index, num_successes.index):
            # int() accepts numpy and plain Python integers alike
            x = int(num_successes.at[topic_successes, module_successes])  # number of successes
            N = sample_sizes[topic_successes]  # sample size
            # sf(x - 1) = P(X > x - 1) = P(X >= x)
            pval = hypergeom.sf(x - 1, population_size, k, N)
            df_cmap.at[topic, module] = -np.log10(float(pval))
    return df_cmap


def plot_map(df_cmap, first_name="topsbm", last_name="lda", *args, **kwargs):
    """
    Draw the -log10(P-value) matrix as a heatmap, save it and show it.

    The figure is written to "topics_logp_{first_name}_{last_name}.pdf" in the
    current working directory. Tick labels on both axes are replaced by
    "Topic 1", "Topic 2", ... whatever the labels of df_cmap are.

    :param df_cmap: DataFrame to plot, e.g. the output of build_map
    :param first_name: label of the y axis, also used in the file name
    :param last_name: label of the x axis, also used in the file name
    :param args: extra positional arguments forwarded to seaborn.clustermap
    :param kwargs: extra keyword arguments forwarded to seaborn.clustermap;
        they take precedence over the defaults set by this function

    :return: the object returned by seaborn.clustermap
    """
    import seaborn as sns
    import matplotlib.pyplot as plt

    # One colour per column, used for the colour strip above the heatmap.
    network_pal = sns.color_palette('husl', n_colors=len(df_cmap.columns))
    network_lut = dict(zip(df_cmap.columns, network_pal))
    network_col = df_cmap.columns.map(network_lut)

    # Defaults are merged with the caller's keywords so that overriding one of
    # them (e.g. vmax) does not raise "multiple values for keyword argument".
    options = dict(row_cluster=False,
                   col_cluster=False,
                   metric='euclidean',
                   vmin=0,
                   vmax=30,
                   cmap='Blues_r',
                   col_colors=network_col,
                   mask=False,
                   cbar_pos=(1.05, 0.05, 0.05, 0.7))
    options.update(kwargs)

    cm = sns.clustermap(df_cmap, *args, **options)

    ax = cm.ax_heatmap
    ax.tick_params(labelsize=15)

    ax.set_ylabel(first_name, fontsize=35)
    ax.set_xlabel(last_name, fontsize=35)
    ax.set_xticklabels(["Topic %d" % (t + 1) for t, _ in enumerate(df_cmap.columns)], rotation=75)
    # Move the y ticks and label to the left side of the heatmap.
    ax.yaxis.tick_left()
    ax.yaxis.set_label_position("left")
    ax.set_yticklabels(["Topic %d" % (t + 1) for t, _ in enumerate(df_cmap.index)], rotation=0)

    cax = cm.ax_cbar
    cax.tick_params(labelsize=35)
    cax.set_title("-Log(P-value)", fontsize=30)

    cm.fig.suptitle('Algorithm comparison', fontsize=40)
    cm.savefig(f"topics_logp_{first_name}_{last_name}.pdf")
    plt.show()
    return cm