# Source code for topicpy.hsbmpy.hsbmpy

#  Copyright (c) 2020 fvalle
#
#  Permission is hereby granted, free of charge, to any person
#  obtaining a copy of this software and associated documentation
#  files (the "Software"), to deal in the Software without
#  restriction, including without limitation the rights to use,
#  copy, modify, merge, publish, distribute, sublicense, and/or sell
#  copies of the Software, and to permit persons to whom the
#  Software is furnished to do so, subject to the following
#  conditions:
#
#  The above copyright notice and this permission notice shall be
#  included in all copies or substantial portions of the Software.
#
#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
#  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
#  OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
#  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
#  HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
#  WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
#  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
#  OTHER DEALINGS IN THE SOFTWARE.
import os

import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from matplotlib import pyplot as plt
import sys
import seaborn as sns

# Apply seaborn's default theme and the "paper" plotting context when the module is imported.
sns.set()
sns.set_context("paper")
from sklearn import metrics

class painter():
    """
    Endless iterator over a fixed list of colors.

    Every call to ``next`` returns the following hex color string of
    ``colors_cycle``; after the last color the sequence restarts from the
    first one, so ``StopIteration`` is never raised.

    Usage
    =====
    p = painter()
    next(p)
    """

    def __init__(self):
        """
        Painter iterator over list of colors

        :return : painter
        """
        # get colors from https://medialab.github.io/iwanthue/ or alternatively from http://phrogz.net/css/distinct-colors.html
        self.colors_cycle = ["#a257d4",
                            "#e090bf",
                            "#64c9a3",
                            "#4b68ae",
                            "#dc8c2f",
                            "#cd41a7",
                            "#d9344f",
                            "#bc599a",
                            "#afa1e8",
                            "#48c1d8",
                            "#b54545",
                            "#919233",
                            "#9a78be",
                            "#59602a",
                            "#4e8e2c",
                            "#9db935",
                            "#9b563c",
                            "#e482df",
                            "#5995d3",
                            "#6a5198",
                            "#b05f84",
                            "#b563c3",
                            "#5f6b18",
                            "#a55c21",
                            "#5754c2",
                            "#277257",
                            "#4f9b5e",
                            "#8b6b29",
                            "#b8381c",
                            "#ad2f62",
                            "#97ba6d",
                            "#45c37c",
                            "#5fc250",
                            "#8c4c7b",
                            "#e06e87",
                            "#e2672a",
                            "#db7756",
                            "#974858",
                            "#35743b",
                            "#bbaf6c",
                            "#8c4099",
                            "#e44586",
                            "#ed5c4c",
                            "#389c84",
                            "#cfae3d",
                            "#eda377",
                            "#778749",
                            "#c5935a",
                            "#de8784",
                            "#757eec"]
        self.available_colors = len(self.colors_cycle)
        # -1 so that the first call to next() yields colors_cycle[0]
        self._index = -1

    def __iter__(self):
        """Restart from the first color and return the painter itself."""
        self._index = -1
        return self

    def __next__(self):
        """Return the next color, wrapping around after the last one."""
        self._index = (self._index + 1) % self.available_colors
        return self.colors_cycle[self._index]


# Module-level color iterator; add_score_lines draws from it for labels that have no predefined color.
color_iterator = painter()



def plot_cluster_composition(fraction_sites, directory, level, normalise=False, label='primary_site', shuffled=False,
                             algorithm='topsbm'):
    """
    Draw, show and save the stacked bar chart of the label composition of the clusters.

    The cluster table ``directory/algorithm/algorithm_level_<level>_clusters.csv``
    is read only to count its columns, which gives the number of bars.

    :param fraction_sites: mapping label value -> list with one entry per cluster
    :param directory: base folder of the analysis
    :param level: hierarchy level, used for the input and the output file names
    :param normalise: selects the y axis title and the "fraction_" tag of the output file
    :param label: name of the annotation, used in the title and in the output file name
    :param shuffled: selects the "Shuffled " title prefix and the "shuffled" file prefix
    :param algorithm: name of the sub-folder (and file prefix) holding the data
    """
    sns.set(font_scale=0.8)
    clusters_csv = "%s/%s/%s_level_%d_clusters.csv" % (directory, algorithm, algorithm, level)
    n_clusters = len(pd.read_csv(clusters_csv, header=[0]).columns)
    positions = np.arange(1, 1 + n_clusters)

    fig = plt.figure(figsize=(25, 15))
    ax = fig.subplots()
    fraction_bar_plot(positions, fraction_sites, ax)

    ax.set_xlabel("cluster", fontsize=35)
    ax.set_ylabel("fraction of nodes" if normalise else "number of nodes", fontsize=35)
    title_prefix = "Shuffled " if shuffled else ''
    ax.set_title("%s%s distribution across clusters" % (title_prefix, label), fontsize=35)

    # Shrink the axes to 80% of their width to leave room for the legend
    frame = ax.get_position()
    ax.set_position([frame.x0, frame.y0, frame.width * 0.8, frame.height])

    # Put a legend to the right of the current axis, one column every ~20 labels
    n_labels = len(fraction_sites)
    if n_labels > 20:
        legend_columns = round(n_labels / 20)
    else:
        legend_columns = 1
    ax.legend(loc='best', bbox_to_anchor=(1, 0.99), fontsize=35, ncol=legend_columns)
    ax.tick_params(axis='both', labelsize=35)
    plt.show()

    file_prefix = "shuffled" if shuffled else ''
    fraction_tag = "fraction_" if normalise else ''
    fig.savefig("%s/%s/%s%sclustercomposition_l%d_%s.pdf" % (
        directory, algorithm, file_prefix, fraction_tag, int(level), label))


def fraction_bar_plot(x, fraction_sites, ax=None):
    """
    Draw one stacked bar series per entry of fraction_sites.

    Entries whose maximum is 0 are not drawn. Colors come from a fresh
    ``painter`` so every call starts from the first color.

    :param x: positions of the bars
    :param fraction_sites: mapping label value -> bar heights, one per position
    :param ax: axes to draw on; a new 15x8 figure is created when None
    """
    # module-level variable reset on every call (not read anywhere in this function)
    global current_color
    current_color = -1
    if ax is None:
        ax = plt.figure(figsize=(15, 8)).subplots()
    palette = painter()
    stack_base = np.zeros(len(x))
    for site_name, heights in fraction_sites.items():
        if np.max(heights) != 0:
            ax.bar(x, heights, label=site_name, bottom=stack_base, color=next(palette))
            # the next series is stacked on top of everything drawn so far
            stack_base = stack_base + heights


def get_Palette(site):
    """
    Return the name of the color palette associated with a tissue.

    The first key of the table (in insertion order) that is a substring of
    ``site`` wins, so e.g. 'Brain - Cortex' maps to the 'Brain' palette.

    :param site: tissue / primary site name
    :return: palette name, or None when no known tissue is contained in site
    """
    palette_map = {'Brain': 'Blues',
                   'Breast': 'Reds',
                   'Kidney': 'Greens',
                   'Lung': 'Oranges',
                   'Thyroid': 'Greys',
                   'Uterus': 'Purples',
                   'Prostate': 'BuGn',
                   'Ovary': 'BuPu',
                   'Lymph Nodes': 'OrRd',
                   'Soft Tissue': 'PuRd',
                   'Esophagus': 'YlGn',
                   # was 'YlRd', which is not an existing colormap name
                   'Stomach': 'YlOrRd',
                   'Bone Marrow': 'PuBuGn',
                   'Skin': 'YlOrRd',
                   'Adipose Tissue': 'YlOrBr',
                   'Blood': 'RdPu',
                   'Pancreas': 'OrRd',
                   'Testis': 'GnBu'}
    for tissue, palette in palette_map.items():
        if tissue in site:
            return palette
    return None


def get_cluster_given_l(l, directory, algorithm='topsbm'):
    """
    Load the clusters found at level l.

    Reads ``directory/algorithm/algorithm_level_<l>_clusters.csv``, where every
    column is one cluster.

    :param l: hierarchy level
    :param directory: base folder of the analysis
    :param algorithm: name of the sub-folder (and file prefix) holding the data
    :return: dict mapping the column position to the array of non-missing entries of that column
    """
    csv_path = "%s/%s/%s_level_%d_clusters.csv" % (directory, algorithm, algorithm, l)
    table = pd.read_csv(csv_path, header=[0], index_col=None)
    return {position: table[name].dropna().values for position, name in enumerate(table.columns)}


def get_topic_given_l(l, directory, algorithm='topsbm'):
    """
    Load the topics found at level l.

    Reads ``directory/algorithm/algorithm_level_<l>_topics.csv``, where every
    column is one topic.

    :param l: hierarchy level
    :param directory: base folder of the analysis
    :param algorithm: name of the sub-folder (and file prefix) holding the data
    :return: dict mapping the column position to the array of non-missing entries of that column
    """
    csv_path = "%s/%s/%s_level_%d_topics.csv" % (directory, algorithm, algorithm, l)
    table = pd.read_csv(csv_path, header=[0])
    return {position: table[name].dropna().values for position, name in enumerate(table.columns)}


def get_fraction_sites(cluster, df_files, label='primary_site', normalise=False):
    """
    Count, for every cluster, how many of its samples carry each value of label.

    Samples that get_file cannot find in df_files, and samples whose label is
    missing (NaN), are counted under "unknown".

    The rows of the resulting table are sorted by value before being returned,
    so the position inside each returned list does not necessarily match the
    cluster key.

    :param cluster: mapping 0..n-1 -> samples of the cluster (as returned by get_cluster_given_l)
    :param df_files: annotation table indexed by sample name, must contain the column label
    :param label: column of df_files to count
    :param normalise: divide the counts by the cluster size instead of returning raw counts
    :return: dict label value -> list with one entry per (non empty) cluster, keys in sorted order
    """
    sites = np.concatenate([df_files[label].dropna().unique(), ["unknown"]])
    fraction_sites = {site: [] for site in sites}

    for i, _ in enumerate(cluster):
        members = cluster[i]
        counts = dict.fromkeys(fraction_sites, 0)
        for sample in members:
            foundsample = get_file(sample, df_files)
            site = 'unknown' if foundsample is None else foundsample[label]
            # only a missing (NaN) label can be absent from the table: count it as unknown
            if site not in counts:
                site = 'unknown'
            counts[site] += 1

        norm = float(len(members)) if normalise else 1
        for site, values in fraction_sites.items():
            # an empty cluster cannot be normalised: mark it so that it is dropped below
            values.append(counts[site] / norm if norm > 0 else np.nan)

    df = pd.DataFrame(data=fraction_sites).dropna(how='all', axis=0)
    ##put first columns that have high values in average
    avgs = df.apply(lambda x: np.average(
        x.to_numpy()[x.to_numpy().nonzero()[0]]), axis=0)
    df = df.transpose()
    df.insert(0, 'avg', avgs)
    df = df.sort_values(by=['avg'], axis=0, ascending=False).drop(
        'avg', axis=1).transpose()
    df = df.sort_values(
        by=[tissue for tissue in df.columns], axis=0, ascending=False)
    # axis must be passed by keyword: positional arguments of sort_index are rejected by pandas >= 2
    return df.sort_index(axis=1).to_dict(orient='list')


def get_clustersinfo(cluster, fraction_sites):
    """
    Summarise the label composition of every cluster.

    For each key of cluster the entry at that position of every list in
    fraction_sites is inspected.

    :param cluster: iterable of cluster keys, used as positions into the lists of fraction_sites
    :param fraction_sites: mapping label value -> list of counts (or fractions) per cluster
    :return: dict with, per cluster,
        "maximum": [largest entry divided by the sum of the entries (0 if the sum is 0), its label],
        "homogeneity": 1 minus the sum of -v*log(v) over the entries with 0 < v <= 1,
        "sizes": sum of the entries,
        "nclasses": number of entries greater than 0
    """
    maxima, homogeneities, sizes, nclasses = [], [], [], []
    for icluster in cluster:
        top_value = 0
        top_site = ''
        entropy = 0
        total = 0
        n_present = 0
        for site, per_cluster in fraction_sites.items():
            value = per_cluster[icluster]
            total += value
            if value > top_value:
                top_value, top_site = value, site
            if value > 0:
                n_present += 1
                # using fraction_items normalised
                if value <= 1:
                    entropy -= value * np.log(value)
        share = float(top_value) / total if total > 0 else 0
        maxima.append([share, top_site])
        sizes.append(total)
        nclasses.append(n_present)
        homogeneities.append(1 - entropy)
    return {
        "maximum": maxima,
        "homogeneity": homogeneities,
        "sizes": sizes,
        "nclasses": nclasses
    }


def plot_maximum(clustersinfo, cluster, label, level, directory, clustersinfo_shuffle=None, algorithm='topsbm'):
    """
    Plot the sorted maximum label fraction of every cluster and its histogram.

    The left panel shows the sorted values together with a dashed line at 0.8,
    the right panel their normalised histogram. When clustersinfo_shuffle is
    given its values are overlaid on both panels and the output file name gets
    the "shuffled" prefix.

    :param clustersinfo: output of get_clustersinfo
    :param cluster: clusters; only their number is used (length of the 0.8 line)
    :param label: name of the annotation, used in axis titles and file name
    :param level: hierarchy level, used in the file name
    :param directory: base folder of the analysis
    :param clustersinfo_shuffle: optional output of get_clustersinfo for shuffled labels
    :param algorithm: name of the sub-folder in which the figure is saved
    """
    fig = plt.figure(figsize=(15, 6))
    left, right = fig.subplots(1, 2)
    hist_style = dict(histtype='step', bins=10, lw=4, density=True, range=(0.05, 1.05))

    has_shuffled = clustersinfo_shuffle is not None
    series = [clustersinfo, clustersinfo_shuffle] if has_shuffled else [clustersinfo]
    for info in series:
        fractions = np.sort(np.array(info['maximum'])[:, 0].astype(float))
        left.plot(fractions, marker='o', ms=25, ls='')
        right.hist(fractions, **hist_style)

    n_clusters = len(cluster)
    left.plot(np.arange(n_clusters), [0.8] * n_clusters, visible=True, ls='--')
    for panel in (left, right):
        panel.tick_params(axis='both', labelsize=20)

    fraction_title = "maximum fraction\nwith same %s" % label
    left.set_xlabel("cluster", fontsize=35)
    left.set_ylabel(fraction_title, fontsize=35)
    left.set_ylim((0, 1.1))
    right.set_xlabel(fraction_title, fontsize=35)
    right.set_ylabel("pdf", fontsize=35)
    for axis_name in ('xtick', 'ytick'):
        plt.rc(axis_name, labelsize=18)
    plt.show()

    file_prefix = "shuffled" if has_shuffled else ''
    fig.savefig(
        "%s/%s/%scluster_maximum_l%d_%s.pdf" % (directory, algorithm, file_prefix, level, label))


def plot_maximum_size(clustersinfo, label, level, directory, clustersinfo_shuffle=None, algorithm='topsbm'):
    """
    Scatter the maximum label fraction of every cluster against the cluster size.

    A 1/size curve labelled 'uniform' is drawn for reference. When
    clustersinfo_shuffle is given its points are added, the x range is reset
    from the shuffled sizes and the output file name gets the "shuffled" prefix.

    :param clustersinfo: output of get_clustersinfo
    :param label: name of the annotation, used in axis titles and file name
    :param level: hierarchy level, used in the file name
    :param directory: base folder of the analysis
    :param clustersinfo_shuffle: optional output of get_clustersinfo for shuffled labels
    :param algorithm: name of the sub-folder in which the figure is saved
    """
    fig = plt.figure(figsize=(15, 6))
    sizes = np.array(clustersinfo['sizes']).astype(int)
    fractions = np.array(clustersinfo['maximum'])[:, 0].astype(float)
    plt.scatter(sizes, fractions, lw=10, label='clusters')
    largest = np.max(sizes)
    plt.xlim(0, largest + largest / 10)
    plt.plot(np.linspace(0.5, sizes.max()), 1. / np.linspace(0.5, sizes.max()), label='uniform')

    has_shuffled = clustersinfo_shuffle is not None
    if has_shuffled:
        sizes_shuffle = np.array(clustersinfo_shuffle['sizes']).astype(int)
        fractions_shuffle = np.array(clustersinfo_shuffle['maximum'])[:, 0].astype(float)
        plt.scatter(sizes_shuffle, fractions_shuffle, lw=10, label='clusters shuffled')
        largest_shuffle = np.max(sizes_shuffle)
        plt.xlim(0, largest_shuffle + largest_shuffle / 10)

    plt.xlabel("cluster size", fontsize=35)
    plt.ylabel("maximum fraction\nwith same %s" % label, fontsize=35)
    plt.ylim((0, 1.1))
    plt.legend(loc='best', fontsize=35)
    for axis_name in ('xtick', 'ytick'):
        plt.rc(axis_name, labelsize=18)
    plt.show()

    file_prefix = "shuffled" if has_shuffled else ''
    fig.savefig(
        "%s/%s/%sclusterhomosize_l%d_%s.pdf" % (directory, algorithm, file_prefix, level, label))


def plot_maximum_label(clustersinfo, label, level, directory, clustersinfo_shuffle=None, algorithm='topsbm'):
    """
    Scatter the maximum label fraction of every cluster against its number of labels.

    A dashed 1/n curve labelled 'uniform' is drawn for reference. When
    clustersinfo_shuffle is given its points and a second 1/n curve are added,
    the x range is reset from the shuffled data and the output file name gets
    the "shuffled" prefix.

    :param clustersinfo: output of get_clustersinfo
    :param label: name of the annotation, used in axis titles and file name
    :param level: hierarchy level, used in the file name
    :param directory: base folder of the analysis
    :param clustersinfo_shuffle: optional output of get_clustersinfo for shuffled labels
    :param algorithm: name of the sub-folder in which the figure is saved
    """
    fig = plt.figure(figsize=(10, 6))
    n_labels = np.array(clustersinfo['nclasses']).astype(int)
    fractions = np.array(clustersinfo['maximum'])[:, 0].astype(float)
    plt.scatter(n_labels, fractions, lw=10, alpha=0.9, label='clusters')
    uniform_x = np.arange(1, np.max(n_labels) + 2)
    plt.plot(uniform_x, 1. / uniform_x, ls='--', c='cyan', label='uniform')
    plt.xlim(0.95, np.max(n_labels) + 0.5)

    has_shuffled = clustersinfo_shuffle is not None
    if has_shuffled:
        n_labels_shuffle = np.array(clustersinfo_shuffle['nclasses']).astype(int)
        fractions_shuffle = np.array(clustersinfo_shuffle['maximum'])[:, 0].astype(float)
        plt.scatter(n_labels_shuffle, fractions_shuffle, lw=10, alpha=0.9, label='clusters shuffled')
        uniform_x_shuffle = np.arange(1, np.max(n_labels_shuffle) + 2)
        plt.plot(uniform_x_shuffle, 1. / uniform_x_shuffle, ls='--', c='cyan',
                 label='')
        plt.xlim(0.95, np.max(n_labels_shuffle) + 0.5)

    plt.xlabel("number of labels", fontsize=35)
    plt.ylabel("maximum fraction\nwith same %s" % label, fontsize=35)
    plt.ylim((0, 1.1))
    for axis_name in ('xtick', 'ytick'):
        plt.rc(axis_name, labelsize=16)
    plt.legend(loc='lower right', fontsize=35)
    plt.show()

    file_prefix = "shuffled" if has_shuffled else ''
    fig.savefig(
        "%s/%s/%scluster_homon_l%d_%s.pdf" % (directory, algorithm, file_prefix, level, label))


def plot_labels_size(clustersinfo, label, level, directory, clustersinfo_shuffle=None, algorithm='topsbm'):
    """
    Scatter the number of labels of every cluster against the cluster size.

    When clustersinfo_shuffle is given its points are added, the axis ranges
    are reset using the shuffled maxima and the output file name gets the
    "shuffled" prefix.

    :param clustersinfo: output of get_clustersinfo
    :param label: name of the annotation, used in the file name
    :param level: hierarchy level, used in the file name
    :param directory: base folder of the analysis
    :param clustersinfo_shuffle: optional output of get_clustersinfo for shuffled labels
    :param algorithm: name of the sub-folder in which the figure is saved
    """
    fig = plt.figure(figsize=(10, 6))
    sizes = np.array(clustersinfo['sizes']).astype(float)
    n_labels = np.array(clustersinfo['nclasses']).astype(int)
    x_low = sizes.min() - 10
    y_low = n_labels.min() - 2
    plt.xlim(x_low, sizes.max() + 5)
    plt.ylim(y_low, n_labels.max() + 5)
    plt.scatter(sizes, n_labels, lw=10, alpha=0.9, label='clusters')

    has_shuffled = clustersinfo_shuffle is not None
    if has_shuffled:
        sizes_shuffle = np.array(clustersinfo_shuffle['sizes']).astype(float)
        n_labels_shuffle = np.array(clustersinfo_shuffle['nclasses']).astype(int)
        plt.scatter(sizes_shuffle, n_labels_shuffle, lw=10, alpha=0.9, label='clusters shuffled')
        # lower bounds still come from the real data, upper bounds from the shuffled one
        plt.xlim(x_low, sizes_shuffle.max() + 5)
        plt.ylim(y_low, n_labels_shuffle.max() + 8)

    plt.xlabel("cluster size", fontsize=35)
    plt.ylabel("number of labels", fontsize=35)
    plt.legend(loc='upper right', fontsize=35)
    for axis_name in ('xtick', 'ytick'):
        plt.rc(axis_name, labelsize=16)
    plt.show()

    file_prefix = "shuffled" if has_shuffled else ''
    fig.savefig(
        "%s/%s/%scluster_shuffle_label_size_l%d_%s.pdf" % (
            directory, algorithm, file_prefix, level, label))


def make_heatmap(fraction_sites, directory, label, level, shuffled=False, normalise=False, algorithm='topsbm'):
    """
    Save a heatmap with one row per label value and one column per cluster.

    Label values whose maximum over the clusters is 0 are left out.

    :param fraction_sites: mapping label value -> list with one entry per cluster
    :param directory: base folder of the analysis
    :param label: name of the annotation, used in the file name
    :param level: hierarchy level, used in the file name
    :param shuffled: adds the "shuffled" prefix to the file name
    :param normalise: adds the "fraction_" tag to the file name
    :param algorithm: name of the sub-folder in which the figure is saved
    """
    sns.set(font_scale=2)
    found_classes = [site for site, data in fraction_sites.items() if np.max(data) != 0]
    # the number of clusters is the length of the first list
    for values in fraction_sites.values():
        n_clusters = len(values)
        break
    tick_labels = np.arange(1, 1 + n_clusters)

    fig = plt.figure(figsize=(30, 10))
    fig.subplots(1)
    table = pd.DataFrame(data=fraction_sites).loc[:, found_classes].transpose()
    sns.heatmap(table, vmin=0, cmap="RdYlBu_r",
                xticklabels=tick_labels)

    file_prefix = "shuffled" if shuffled else ''
    fraction_tag = "fraction_" if normalise else ''
    fig.savefig("%s/%s/%sheatmap_cluster%s_l%d_%s.pdf" % (
        directory, algorithm, file_prefix, fraction_tag, int(level), label))


def get_file(sample, df_file):
    """
    Find the annotation row of a sample.

    The index of df_file is scanned in order and the first label that contains
    sample as a substring is used.

    :param sample: (part of) a sample name
    :param df_file: annotation table indexed by full sample name
    :return: the matching row of df_file, or None when no index label contains sample
    """
    match = next((name for name in df_file.index.values if sample in name), None)
    if match is None:
        return None
    return df_file.loc[match, :]


def define_labels(cluster, df_files, label='primary_site', verbose=False):
    """
    Build the true and predicted label lists of all the samples in the clusters.

    Samples whose label cannot be retrieved (not found by get_file, or label
    missing from the table) get '' both as true and as predicted label; the
    error is printed and the scan continues.

    :param cluster: mapping cluster key -> samples of the cluster
    :param df_files: annotation table indexed by sample name
    :param label: column of df_files holding the ground truth
    :param verbose: print every cluster key while scanning
    :return: (true_labels, predicted_labels): true labels encoded as the integer codes
        given by np.unique(..., return_inverse=True), and the list of cluster keys
    """
    true_labels = []
    predicted_labels = []
    for c in cluster:
        if verbose:
            print(c)
        for sample in cluster[c]:
            try:
                true_label = get_file(sample, df_files)[label]
            # Exception, not a bare except: KeyboardInterrupt and SystemExit must not be swallowed
            except Exception:
                true_labels.append('')
                predicted_labels.append('')
                print(*sys.exc_info())
                print("error searching %s in %s" % (label, sample))
            else:
                true_labels.append(true_label)
                predicted_labels.append(c)
    _, true_labels = np.unique(true_labels, return_inverse=True)
    return true_labels, predicted_labels


def add_score_lines(ax, scores, V="V", labels=None, h=False, c=False, alpha=0.8, **kwargs):
    '''
    add to ax lines in scores
    add homogeneity and completness if required by h and c

    :param ax: axes to draw on
    :param scores: mapping label -> dict with the keys 'xl', 'h', 'c' and the one named by V
    :param V: key of the main score to plot
    :param labels: labels to plot; every key of scores when None
    :param h: also plot scores[label]['h']
    :param c: also plot scores[label]['c']
    :param alpha: transparency of the homogeneity and completeness lines
    :param kwargs: forwarded to the plot call of the main score
    :raises ValueError: when the main score and 'xl' of a label have different lengths
    '''
    colors = {
        'primary_site': 'blue',
        'hsbm': 'blue',
        'secondary_site': 'red',
        'status': 'red',
        'hSBM': 'blue',
        'mixed': 'green',
        'hierhsbm': 'purple',
        'hsbm->hierachical': 'purple',
        'disease_type': 'red',
        'shuffle': 'orange',
        'tm': 'darkcyan',
        'cc': 'darkred',
        'disease_tissue': 'purple',
        'hierarchical': 'darkgreen',
        'lda': 'violet',
        'RPPA Clusters': 'red',
        'wgcna': 'purple',
        "Subtype_Selected": "red",
        "BRCA_Subtype_PAM50": "blue"
    }

    # the default used to crash on iteration: fall back to everything available
    if labels is None:
        labels = list(scores.keys())

    xl = None
    for label in labels:
        if label not in scores:
            print("No score for %s" % label)
            continue
        if label not in colors:
            colors[label] = next(color_iterator)
        xl = scores[label]['xl']
        if h:
            # NOTE(review): lw=150 is much larger than the widths of the other lines (10 and 20) - confirm it is intended
            ax.plot(xl, scores[label]['h'], ls='-.', c=colors[label], marker='x', lw=150, ms=45, alpha=alpha,
                    label='homogeneity - %s' % label)
        if c:
            ax.plot(xl, scores[label]['c'], ls=':', c=colors[label], marker='<', lw=10, ms=45, alpha=alpha,
                    label='completness - %s' % label)
        if len(scores[label][V]) == len(xl):
            ax.plot(xl, scores[label][V], label='%s' % label, ls='-', c=colors[label], marker='o', lw=20, ms=45,
                    **kwargs)
        else:
            raise ValueError("xl has got wrong lenght")
    # nothing was plotted when no requested label has scores: there is no x range to format
    if xl is not None:
        customize_metric_plot(ax, xl)


def customize_metric_plot(ax, xl):
    """
    Apply the common formatting of the score-versus-number-of-clusters plots.

    Sets tick sizes, axis titles, the y range (0, 1.1), a logarithmic x axis
    from 1 to 1.1 times the maximum of xl, shrinks the axes to 80% of their
    width and adds the legend.

    :param ax: axes to format
    :param xl: x values plotted on ax; only their maximum is used
    """
    major_ticks = dict(labelsize=35, width=8, length=20)
    minor_ticks = dict(which="minor", labelsize=35, width=5, length=15)
    ax.tick_params(**major_ticks)
    ax.tick_params(**minor_ticks)

    ax.set_xlabel("Number of clusters", fontsize=40)
    ax.set_ylabel("NMI score", fontsize=40)
    ax.set_ylim((0, 1.1))
    x_upper = np.max(xl) * 1.1
    ax.set_xlim(1, x_upper)
    ax.set_xscale('log')

    # Leave room on the right for the legend
    frame = ax.get_position()
    ax.set_position([frame.x0, frame.y0, frame.width * 0.8, frame.height])

    # Put a legend to the right of the current axis
    ax.legend(loc='best', bbox_to_anchor=(1, 0.85), fontsize=35, ncol=1)


def plot_topic_size(directory, l, algorithm='topsbm'):
    """
    Plot, in log-log scale, the distribution of the topic sizes at level l.

    The size of a topic is the number of non-missing entries of its column in
    ``directory/algorithm/algorithm_level_<l>_topics.csv``. The non-empty
    histogram bins are drawn together with the curve 1e4 / x**5, and the figure
    is saved as ``topic_size_level<l>.png`` in the same folder.

    :param directory: base folder of the analysis
    :param l: hierarchy level
    :param algorithm: name of the sub-folder (and file prefix) holding the data
    """
    df_topics = pd.read_csv("%s/%s/%s_level_%d_topics.csv" % (directory, algorithm, algorithm, l))
    sizes = [len(df_topics.loc[:, t].dropna()) for t in df_topics.columns]
    largest = np.max(sizes)
    bins = np.linspace(0.5, largest + 0.5, int((largest + 1) / (largest / 5)))
    # np.histogram only counts: plt.hist here also drew a stray histogram on an implicit
    # figure, because the figure below had not been created yet
    bin_counts, bin_edges = np.histogram(sizes, bins=bins)
    fig = plt.figure()
    ax = fig.subplots()
    ax.set_title("[%d topics, level: %d]" % (len(df_topics.columns), l))
    x = (bin_edges[:-1] + bin_edges[1:]) / 2
    filled = np.nonzero(bin_counts)
    ax.plot(x[filled], bin_counts[filled])
    ax.plot(x, 1e4 / np.power(x, 5))
    ax.set_xlabel("topic size\n(number of genes)", fontsize=35)
    ax.set_ylabel("number of topic", fontsize=35)
    ax.set_xscale('log')
    ax.set_yscale('log')
    plt.show()
    fig.savefig("%s/%s/topic_size_level%d.png" % (directory, algorithm, l))


def get_candles(directory, level, df_mv, ax, algorithm='topsbm'):
    """
    Compute candle-like summaries of the 'occurrence' values of every topic.

    For each column of ``directory/algorithm/algorithm_level_<level>_topics.csv``
    the 'occurrence' values of its entries are looked up in df_mv. The axes
    title, y title, x range and x tick labels are also set on ax.

    :param directory: base folder of the analysis
    :param level: hierarchy level
    :param df_mv: table indexed by the entries of the topics, with an 'occurrence' column
    :param ax: axes whose title, limits and ticks are configured
    :param algorithm: name of the sub-folder (and file prefix) holding the data
    :return: dict of lists, one entry per topic:
        'open' (0.75 quantile capped at 1), 'high' (mean + std capped at 1),
        'low' (mean - std floored at 0), 'close' (0.25 quantile floored at 0),
        'size' (number of values)
    """
    df_topics = pd.read_csv("%s/%s/%s_level_%d_topics.csv" % (directory, algorithm, algorithm, level))
    opens, highs, lows, closes, sizes = [], [], [], [], []
    for topic in df_topics.columns:
        occurrences = df_mv.loc[df_topics[topic].dropna(), :]['occurrence'].values
        mean = np.average(occurrences)
        spread = np.std(occurrences)
        lower_q, upper_q = np.quantile(occurrences, [0.25, 0.75])
        highs.append(np.min([1, mean + spread]))
        opens.append(np.min([upper_q, 1]))
        closes.append(np.max([lower_q, 0]))
        lows.append(np.max([0, mean - spread]))
        sizes.append(len(occurrences))

    n_topics = len(df_topics.columns)
    ax.set_title("[level: %d]" % level)
    ax.set_ylabel('$O_i$', fontsize=35)
    ax.set_xlim(-1, n_topics)
    tick_range = range(-1, n_topics)
    ax.set_xticks([i + 1 for i in tick_range])
    # only the first tick and every fifth topic get a text label
    tick_labels = ["Topic %d" % (i + 2) if ((i + 2) % 5 == 0 or i == -1) else '' for i in tick_range]
    ax.set_xticklabels(tick_labels, rotation=60)
    return {
        'open': opens,
        'high': highs,
        'low': lows,
        'close': closes,
        'size': sizes
    }


def get_tissue_style(tissue):
    """
    Choose marker, color and line style for a tissue name.

    Marker and line style depend on whether the name contains 'gtex' or
    'tcga'; the color is the one of the first fragment of the table below
    (in order) contained in the name, black ('k') when none matches.

    :param tissue: tissue / dataset name
    :return: tuple (marker, color, line style)
    """
    if 'gtex' in tissue:
        marker, ls = 'o', '-'
    elif 'tcga' in tissue:
        marker, ls = 'x', '--'
    else:
        marker, ls = '.', '-.'

    # order matters: the first fragment found in the name decides the color
    color_by_fragment = (
        ('reast', 'darkcyan'),
        ('olon', 'b'),
        ('hyroid', 'y'),
        ('terus', 'pink'),
        ('ladder', 'gray'),
        ('sophagus', 'brown'),
        ('ung', 'magenta'),
        ('tomach', 'lime'),
        ('kin', 'wheat'),
        ('ancreas', 'forestgreen'),
        ('Adrenal Gland', 'aqua'),
        ('Adipose Tissue', 'brown'),
        ('erve', 'royalblue'),
        ('lood', 'red'),
        ('idney', 'mediumslateblue'),
        ('eart', 'darkred'),
        ('rain', 'darkgray'),
        ('estis', 'darkkhaki'),
        ('LumA', 'pink'),
        ('LumB', 'purple'),
        ('Normal', 'blue'),
        ('Basal', 'darkred'),
        ('Her2', "green"),
    )
    c = next((color for fragment, color in color_by_fragment if fragment in tissue), 'k')
    return (marker, c, ls)


def topic_distr_sample(doc, df, ax=None):
    """
    Draw the pie chart of the topic distribution of one document.

    The first row of df whose 'doc' column equals doc is used; the columns
    from the third onwards are taken as the topic weights. Only weights of at
    least 0.05 get a text label, and only slices of at least 5% get a percentage.

    :param doc: value of the 'doc' column identifying the document
    :param df: table with a 'doc' column and the topic weights from the third column on
    :param ax: axes to draw on; a new figure is created when None
    """
    # identity test: == on an array of axes would be evaluated element-wise
    if ax is None:
        fig = plt.figure()
        ax = fig.subplots()
    ax.set_title("Topic distribution: %s" % doc)
    # select the rows of the document once instead of once per topic column
    doc_rows = df[df['doc'] == doc]
    labels = [l if doc_rows.loc[:, l].values[0] >= 0.05 else '' for l in df.columns[2:]]
    patches, texts, autotexts = ax.pie(doc_rows.values[0][2:], labels=labels,
                                       autopct=lambda p: '%.1f%s' % (p, '%') if p >= 5 else '',
                                       textprops={'fontsize': 20, 'color': 'white', 'wrap': True})
    for t in texts:
        t.set_fontsize(18)
        t.set_wrap(True)
        t.set_color('black')
    plt.show()


def topic_distr_isample(idoc, df, ax=None):
    """
    Draw the topic distribution of the document with the given 'i_doc' value.

    :param idoc: value of the 'i_doc' column identifying the document
    :param df: table with 'i_doc' and 'doc' columns, forwarded to topic_distr_sample
    :param ax: axes to draw on, forwarded to topic_distr_sample
    """
    doc = df[df['i_doc'] == idoc]['doc'].values[0]
    # df must be forwarded too: before, ax was passed in the position of df
    topic_distr_sample(doc, df, ax)


def add_tumor_location(df_files):
    """
    Add, in place, the column 'disease_tissue' formatted as 'primary_site[disease_type]'.

    The column is inserted in third position; when it already exists its
    values are overwritten instead, so the function can be called more than
    once on the same table.

    :param df_files: annotation table with the columns 'primary_site' and 'disease_type'
    """
    # built positionally, so duplicated index labels are not a problem
    values = ['%s[%s]' % pair for pair in zip(df_files['primary_site'], df_files['disease_type'])]
    if 'disease_tissue' in df_files.columns:
        df_files['disease_tissue'] = values
    else:
        df_files.insert(2, 'disease_tissue', values)


def get_scores(directory, labels, df_files=None, algorithm='topsbm', verbose=False, metric=metrics.cluster.v_measure_score):
    """
    Score the clusters of every available level against the given labels.

    For each label and each level the homogeneity, the completeness and
    ``metric`` are computed between the ground truth and the cluster
    assignment. Levels that raise an error are reported and skipped. Unless a
    level with a single cluster was already found, one more point is added in
    which all the samples share the same cluster.

    :param directory: base folder of the analysis
    :param labels: columns of the annotation table to compare with
    :param df_files: annotation table; read from ``directory/files.dat`` when None.
        When it has a 'disease_type' column but no 'disease_tissue' one, the
        latter is added to it in place.
    :param algorithm: name of the sub-folder (and file prefix) holding the data
    :param verbose: print every processed level
    :param metric: function (true_labels, predicted_labels) -> score stored under 'V'
    :return: dict label -> {'h': [...], 'c': [...], 'V': [...], 'xl': [number of clusters]};
        with two or more labels also 'mixed', built from the homogeneity of the
        first label and the completeness of the second
    """
    if df_files is None:
        df_files = pd.read_csv("%s/files.dat" % directory, index_col=[0], header=[0]).dropna(how='all', axis=0)
    # the second check allows calling get_scores again with the same table
    if 'disease_type' in df_files.columns and 'disease_tissue' not in df_files.columns:
        add_tumor_location(df_files)

    max_level = get_max_available_L(directory, algorithm)
    scores = {}
    for label in labels:
        xl = []
        scores[label] = {
            'h': [],
            'c': [],
            'V': [],
            'xl': []
        }
        for level in np.arange(max_level + 1):
            try:
                true_labels, predicted_labels = define_labels(get_cluster_given_l(level, directory, algorithm),
                                                              df_files, label=label)
                # compute everything before storing, so that a failure cannot leave lists of different lengths
                homogeneity = metrics.cluster.homogeneity_score(true_labels, predicted_labels)
                completeness = metrics.cluster.completeness_score(true_labels, predicted_labels)
                score = metric(true_labels, predicted_labels)
                n_clusters = len(np.unique(predicted_labels))
            except Exception:
                print(*sys.exc_info())
                print("Skipping level ", level)
                continue
            scores[label]['h'].append(homogeneity)
            scores[label]['c'].append(completeness)
            scores[label]['V'].append(score)
            xl.append(n_clusters)
            if verbose:
                print(level)

        # add the first point where all sample are in the same cluster by definition
        # (nothing to add to when every level was skipped)
        if xl and 1 not in xl:
            # keep xl monotonic: the single cluster goes on the side of the smallest values
            idx = 0 if xl[0] < xl[-1] else len(xl)
            true_labels, _ = define_labels(get_cluster_given_l(max_level, directory, algorithm), df_files,
                                           label=label)
            predicted_labels = np.ones_like(true_labels)
            scores[label]['h'].insert(idx, metrics.cluster.homogeneity_score(true_labels, predicted_labels))
            scores[label]['c'].insert(idx, metrics.cluster.completeness_score(true_labels, predicted_labels))
            scores[label]['V'].insert(idx, metric(true_labels, predicted_labels))
            xl.insert(idx, len(np.unique(predicted_labels)))

        scores[label]['xl'] = xl
    if len(labels) >= 2:
        h = np.array(scores[labels[0]]['h'])
        c = np.array(scores[labels[1]]['c'])
        scores['mixed'] = {
            'h': h,
            'c': c,
            'V': 2 * h * c / (h + c),
            # x values of the first label, so that 'mixed' has the same keys as the other entries
            'xl': list(scores[labels[0]]['xl'])
        }
    return scores


def shuffle_files(df_files, label, random_state=42):
    """
    Return a copy of df_files in which the values of one column are shuffled.

    :param df_files: annotation table
    :param label: column to shuffle
    :param random_state: seed forwarded to sklearn.utils.shuffle, so that the same
        seed always gives the same permutation
    :return: shuffled copy; df_files itself is not modified
    :raises AttributeError: when label is not a column of df_files
    """
    if label not in df_files.columns:
        raise AttributeError(f"{label} non available in:{df_files.columns}")
    df_files_shuffled = df_files.copy()
    # random_state used to be accepted but ignored
    df_files_shuffled[label] = shuffle(df_files_shuffled[label].values, random_state=random_state)
    return df_files_shuffled


def get_scores_shuffled(directory, df_files, algorithm='topsbm', label='primary_site', verbose=False, metric=metrics.cluster.v_measure_score):
    """
    Score the clusters of every available level against a shuffled version of label.

    The ground truth comes from a copy of df_files whose label column is
    shuffled with seed 42, the cluster assignment from the original table.
    Levels whose cluster file cannot be loaded are skipped; any other error
    stops the scan and is reported. Unless no level could be scored, a final
    point in which all the samples share the same cluster is always added.

    :param directory: base folder of the analysis
    :param df_files: annotation table indexed by sample name
    :param algorithm: name of the sub-folder (and file prefix) holding the data
    :param label: column of df_files to shuffle and compare with
    :param verbose: print every processed level
    :param metric: function (true_labels, predicted_labels) -> score stored under 'V'
    :return: dict {'h': [...], 'c': [...], 'V': [...], 'xl': [number of clusters]}
    """
    scores = {
        'h': [],
        'c': [],
        'V': [],
        'xl': []
    }
    xl = []
    max_level = get_max_available_L(directory, algorithm)
    # last level visited by the scan; used below for the single-cluster point
    level = max_level
    df_files_shuffled = shuffle_files(df_files.copy(), label, random_state=42)
    try:
        for level in np.arange(0, max_level + 1):
            try:
                if verbose:
                    print(level)
                clusters = get_cluster_given_l(level, directory, algorithm=algorithm)
            except Exception:
                print("Skipping shuffled level ", level)
                continue
            _, predicted_labels = define_labels(clusters, df_files, label=label)
            true_labels, _ = define_labels(clusters,
                                           df_files_shuffled,
                                           label=label)
            # compute everything before storing, so that a failure cannot leave lists of different lengths
            homogeneity = metrics.cluster.homogeneity_score(true_labels, predicted_labels)
            completeness = metrics.cluster.completeness_score(true_labels, predicted_labels)
            score = metric(true_labels, predicted_labels)
            scores['h'].append(homogeneity)
            scores['c'].append(completeness)
            scores['V'].append(score)
            xl.append(len(np.unique(predicted_labels)))
    except Exception:
        print(*sys.exc_info())
        print("shuffled files not found")

    # nothing could be scored: return the empty result instead of failing on xl[0]
    if not xl:
        scores['xl'] = xl
        return scores

    # add the first point where all sample are in the same cluster by definition
    idx = 0 if xl[0] < xl[-1] else len(xl)
    true_labels, _ = define_labels(get_cluster_given_l(level, directory, algorithm), df_files, label=label)
    predicted_labels = np.ones_like(true_labels)
    scores['h'].insert(idx, metrics.cluster.homogeneity_score(true_labels, predicted_labels))
    scores['c'].insert(idx, metrics.cluster.completeness_score(true_labels, predicted_labels))
    scores['V'].insert(idx, metric(true_labels, predicted_labels))
    xl.insert(idx, len(np.unique(predicted_labels)))

    scores['xl'] = xl
    return scores


def getclustersizesarray(directory, l=3, algorithm='topsbm'):
    """
    Return the number of clusters found at every level.

    Levels 0..l are tried first; when any of them cannot be read, the result
    contains instead the levels among 1..l that can be read, in increasing
    order.

    :param directory: base folder of the analysis
    :param l: highest level to look for
    :param algorithm: name of the sub-folder (and file prefix) holding the data
    :return: list with the number of clusters per level
    """
    # a missing file raises OSError, an empty or malformed one a ValueError subclass
    unreadable = (OSError, ValueError)
    top_level = int(l)

    def n_clusters(level):
        return len(get_cluster_given_l(level, directory, algorithm=algorithm))

    try:
        return [n_clusters(level) for level in range(top_level + 1)]
    except unreadable:
        pass

    xl = []
    for level in range(1, top_level + 1):
        try:
            xl.append(n_clusters(level))
        except unreadable:
            # best effort: levels that are not available are left out
            continue
    return xl


def gettopicsizesarray(directory, l=3, algorithm='topsbm'):
    """
    Return the number of topics found at every level.

    Levels 0..l are tried first; when any of them cannot be read, the result
    contains instead the levels among 1..l that can be read, in increasing
    order.

    :param directory: base folder of the analysis
    :param l: highest level to look for
    :param algorithm: name of the sub-folder (and file prefix) holding the data
    :return: list with the number of topics per level
    """
    # a missing file raises OSError, an empty or malformed one a ValueError subclass
    unreadable = (OSError, ValueError)
    top_level = int(l)

    def n_topics(level):
        return len(get_topic_given_l(level, directory, algorithm=algorithm))

    try:
        return [n_topics(level) for level in range(top_level + 1)]
    except unreadable:
        pass

    xl = []
    for level in range(1, top_level + 1):
        try:
            xl.append(n_topics(level))
        except unreadable:
            # best effort: levels that are not available are left out
            continue
    return xl


def plot_sizes(level, directory, algorithm, ax=None):
    """
    Draw, save and show the histogram of the cluster sizes at a level.

    The current pyplot figure is saved as ``sizes_distr_level<level>.pdf`` in
    ``directory/algorithm``.

    :param level: hierarchy level
    :param directory: base folder of the analysis
    :param algorithm: name of the sub-folder (and file prefix) holding the data
    :param ax: axes to draw on; a new 10x6 figure is created when None
    """
    cluster = get_cluster_given_l(level, directory, algorithm=algorithm)
    if ax is None:
        ax = plt.figure(figsize=(10, 6)).subplots()
    sizes = [len(members) for _, members in cluster.items()]
    ax.set_xlabel("size", fontsize=24)
    ax.set_ylabel("number of clusters", fontsize=24)
    ax.set_title("Cluster sizes at level %d" % level)
    ax.hist(sizes, histtype='step', lw=4)
    output_path = "%s/%s/sizes_distr_level%d.pdf" % (directory, algorithm, level)
    plt.savefig(output_path)
    plt.show()


def clusteranalysis(directory, labels, algorithm='topsbm') -> None:
    """
    Perform analyses of an algorithm output

    For every label and every level the composition plots are produced, both
    with the real annotations and with a shuffled copy of them (written to
    ``directory/files_shuffles.dat``). Errors raised while processing a level
    are printed and the analysis goes on with the next step. Finally the
    numbers of clusters and topics per level and the cluster assignment of the
    samples at every level are written to disk.

    :param directory: where to search the data
    :param labels: ground truth label to search. This should be in a file called directory/files.dat
    :param algorithm: name of the folder in which data are stored
    """
    l_max = get_max_available_L(directory, algorithm)
    # reading the top-level cluster table up front makes a missing file fail before any plotting starts
    pd.read_csv("%s/%s/%s_level_%d_clusters.csv" % (directory, algorithm, algorithm, l_max), header=[0])
    df_files = pd.read_csv("%s/files.dat" % directory, index_col=[0], header=[0]).dropna(axis=1, how='all').dropna(
        axis=0, how='all')
    samples = pd.read_csv("%s/%s/%s_level_0_clusters.csv" % (directory, algorithm, algorithm), header=[0]).astype(
        str).values.ravel()
    samples = samples[samples != "nan"]
    # keep only the annotated samples that appear in the clusters
    df_files = df_files.reindex(index=samples).dropna(how="all", axis=0).fillna("unknown")
    shuffled_path = "%s/files_shuffles.dat" % directory
    for normalise in [True, False]:
        for label in labels:
            for level in np.arange(l_max + 1)[::-1]:
                print(normalise, label, level)
                # reset, so that a failure below cannot leave the data of another level around
                cluster = None
                clustersinfo = None
                try:
                    cluster = get_cluster_given_l(level, directory, algorithm=algorithm)
                    fraction_sites = get_fraction_sites(cluster, df_files=df_files, label=label, normalise=normalise)
                    clustersinfo = get_clustersinfo(cluster, fraction_sites)
                    plot_cluster_composition(fraction_sites, directory, level, label=label, normalise=normalise,
                                             algorithm=algorithm)
                    make_heatmap(fraction_sites, directory, label, level, normalise=normalise, algorithm=algorithm)
                    if not normalise:
                        plot_maximum(clustersinfo, cluster, label, level, directory, algorithm=algorithm)
                        plot_maximum_size(clustersinfo, label, level, directory, algorithm=algorithm)
                        plot_maximum_label(clustersinfo, label, level, directory, algorithm=algorithm)
                        plot_sizes(level, directory, algorithm=algorithm)
                except Exception:
                    print(*sys.exc_info())
                if cluster is None:
                    # the clusters of this level could not be loaded: nothing to compare the shuffle with
                    continue
                try:
                    shuffle_files(df_files, label).to_csv(shuffled_path, index=True)
                    fraction_sites_shuffle = get_fraction_sites(cluster,
                                                                df_files=pd.read_csv(shuffled_path, index_col=[0]),
                                                                label=label, normalise=normalise)
                    clustersinfo_shuffle = get_clustersinfo(cluster, fraction_sites_shuffle)
                    plot_cluster_composition(fraction_sites_shuffle, directory, level, label=label, shuffled=True,
                                             normalise=normalise, algorithm=algorithm)
                    if not normalise:
                        plot_maximum(clustersinfo, cluster, label, level, directory, clustersinfo_shuffle,
                                     algorithm=algorithm)
                        plot_maximum_size(clustersinfo, label, level, directory, clustersinfo_shuffle,
                                          algorithm=algorithm)
                        plot_maximum_label(clustersinfo, label, level, directory, clustersinfo_shuffle,
                                           algorithm=algorithm)
                        plot_labels_size(clustersinfo, label, level, directory, clustersinfo_shuffle,
                                         algorithm=algorithm)
                except Exception:
                    print(*sys.exc_info())
    ##define scores
    # NOTE(review): the returned scores are not used below - confirm whether this call is still needed
    get_scores(directory, labels, algorithm=algorithm)
    try:
        # algorithm must be forwarded, otherwise the sizes are always read from the 'topsbm' folder
        xl = getclustersizesarray(directory, l_max, algorithm=algorithm)
        with open("%s/clustersizes.txt" % directory, 'w') as f:
            f.writelines("%d\n" % x for x in xl)
    except Exception:
        print("cannot save clustersizes.txt")

    try:
        xl = gettopicsizesarray(directory, l_max, algorithm=algorithm)
        with open("%s/topicsizes.txt" % directory, 'w') as f:
            f.writelines("%d\n" % x for x in xl)
    except Exception:
        print("cannot save topicsizes.txt")

    # save files for R analisys
    for level in np.arange(l_max + 1):
        predicted_labels = define_labels(get_cluster_given_l(level, directory, algorithm=algorithm), df_files,
                                         label=labels[0])[1]
        pd.DataFrame(data=predicted_labels, columns=['l%d' % level]).to_csv(
            "%s/%s/%s_level_%d_labels.csv" % (directory, algorithm, algorithm, level),
            header=True, index=False)



def get_max_available_L(directory, algorithm='topsbm'):
    """
    Get maximum layer available for algorithm

    The level is read from the names of the files in ``directory/algorithm``
    that contain "level_" (e.g. ``topsbm_level_3_clusters.csv``).

    :param directory: base folder of the analysis
    :param algorithm: name of the sub-folder holding the data
    :return: highest level found, as a numpy integer
    :raises ValueError: when no file name contains "level_" or the level is not a number
    """
    levels = []
    for el in os.listdir("%s/%s" % (directory, algorithm)):
        if "level_" not in el:
            continue
        # take what follows the last "level_" rather than a fixed position in the name,
        # which breaks as soon as the algorithm name contains an underscore
        levels.append(el.rsplit("level_", 1)[1].split("_")[0])
    return np.array(levels, dtype=int).max()


def out_to_file(out, index, name='new_method', l=0):
    """
    Save a cluster assignment as ``<name>_level_<l>_clusters.csv``.

    The file has one column per cluster ("Cluster 1", "Cluster 2", ...)
    listing the names of its members; shorter columns are padded with missing
    values.

    :param out: array with the cluster number (0, 1, ...) assigned to every object
    :param index: names of the objects, in the same order as out
    :param name: prefix (and path) of the output file
    :param l: hierarchy level written in the file name
    """
    print("saving clusters")
    assignments = np.asarray(out)
    names = pd.Index(index).to_numpy()
    # a 1-d positional selection replaces the 2-d np.argwhere indexing of the index,
    # which pandas >= 2 rejects; columns of different lengths are aligned by DataFrame
    columns = {
        "Cluster %d" % (c + 1): pd.Series(names[np.flatnonzero(assignments == c)])
        for c in range(int(assignments.max()) + 1)
    }
    df_clusters = pd.DataFrame(columns)
    df_clusters.to_csv("%s_level_%d_clusters.csv" % (name, l), index=False, header=True)


# normalise to hsbm
def normalise_score(scores: dict, base_algorithm="hsbm", operation=lambda x, y: x / y, epsilon=1e-6) -> None:
    """
    Save scaled data to scores[algorithm]["norm_V"].

    The "V" curve of base_algorithm, without its last point and reversed, is
    interpolated at the "xl" values of every algorithm; "norm_V" is then
    operation(V + epsilon, interpolated + epsilon).

    :param scores: mapping algorithm -> dict with the keys "xl" and "V"; modified in place
    :param base_algorithm: key of scores used as reference
    :param operation: function combining the scores with the interpolated reference
    :param epsilon: small offset added to both arguments of operation
    """
    for name in scores:
        entry = scores[name]
        reference = scores[base_algorithm]
        # the first point is always constructed and np.interp wants sorted data so[:-1:-1]
        reference_x = reference["xl"][:-1][::-1]
        reference_v = reference["V"][:-1][::-1]
        baseline = np.interp(entry["xl"], reference_x, reference_v)
        shifted = np.array(entry["V"]) + epsilon
        entry["norm_V"] = operation(shifted, baseline + epsilon)