Source code for topicpy.hsbmpy.hsbmpy

#  Copyright (c) 2020 fvalle
#
#  Permission is hereby granted, free of charge, to any person
#  obtaining a copy of this software and associated documentation
#  files (the "Software"), to deal in the Software without
#  restriction, including without limitation the rights to use,
#  copy, modify, merge, publish, distribute, sublicense, and/or sell
#  copies of the Software, and to permit persons to whom the
#  Software is furnished to do so, subject to the following
#  conditions:
#
#  The above copyright notice and this permission notice shall be
#  included in all copies or substantial portions of the Software.
#
#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
#  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
#  OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
#  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
#  HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
#  WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
#  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
#  OTHER DEALINGS IN THE SOFTWARE.
import os

import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from matplotlib import pyplot as plt
import sys
import seaborn as sns

sns.set()
sns.set_context("paper")
from sklearn import metrics

class painter():
    """
    Endless iterator over a fixed list of colours.

    Once the last colour has been handed out the iterator wraps around and
    starts again from the first one, so ``next`` never raises
    ``StopIteration``.

    Usage
    =====
    p = painter()
    next(p)
    """

    def __init__(self):
        # Palette generated with https://medialab.github.io/iwanthue/
        # (alternatively http://phrogz.net/css/distinct-colors.html).
        self.colors_cycle = [
            "#a257d4", "#e090bf", "#64c9a3", "#4b68ae", "#dc8c2f",
            "#cd41a7", "#d9344f", "#bc599a", "#afa1e8", "#48c1d8",
            "#b54545", "#919233", "#9a78be", "#59602a", "#4e8e2c",
            "#9db935", "#9b563c", "#e482df", "#5995d3", "#6a5198",
            "#b05f84", "#b563c3", "#5f6b18", "#a55c21", "#5754c2",
            "#277257", "#4f9b5e", "#8b6b29", "#b8381c", "#ad2f62",
            "#97ba6d", "#45c37c", "#5fc250", "#8c4c7b", "#e06e87",
            "#e2672a", "#db7756", "#974858", "#35743b", "#bbaf6c",
            "#8c4099", "#e44586", "#ed5c4c", "#389c84", "#cfae3d",
            "#eda377", "#778749", "#c5935a", "#de8784", "#757eec",
        ]
        self.available_colors = len(self.colors_cycle)
        # -1 so that the first call to next() yields the colour at position 0
        self._index = -1

    def __iter__(self):
        """Rewind to the beginning of the palette and return the iterator."""
        self._index = -1
        return self

    def __next__(self):
        """Return the next colour, wrapping around at the end of the palette."""
        self._index = (self._index + 1) % self.available_colors
        return self.colors_cycle[self._index]
# Module-wide colour iterator, used by add_score_lines for labels that have
# no hard-coded colour.
color_iterator = painter()


def _maximum_fractions(clustersinfo):
    """Return the numeric part of ``clustersinfo['maximum']`` as a float array."""
    return np.array([entry[0] for entry in clustersinfo['maximum']], dtype=float)


def _trivial_partition_scores(true_labels, metric):
    """Return (homogeneity, completeness, metric, n_clusters) when every sample is put in one cluster."""
    predicted_labels = np.ones_like(true_labels)
    return (
        metrics.cluster.homogeneity_score(true_labels, predicted_labels),
        metrics.cluster.completeness_score(true_labels, predicted_labels),
        metric(true_labels, predicted_labels),
        len(np.unique(predicted_labels)),
    )


def _sizes_per_level(getter, directory, l, algorithm):
    """Return ``len(getter(level, ...))`` for levels 0..l, falling back to the readable levels in 1..l."""
    try:
        return [len(getter(li, directory, algorithm=algorithm)) for li in np.arange(l + 1)]
    except Exception:
        # level 0 (or some other level) is not available: keep what can be read
        pass
    sizes = []
    for li in np.arange(1, l + 1):
        try:
            sizes.append(len(getter(li, directory, algorithm=algorithm)))
        except Exception:
            continue
    return sizes


def plot_cluster_composition(fraction_sites, directory, level, normalise=False, label='primary_site', shuffled=False,
                             algorithm='topsbm'):
    """
    Draw and save a stacked bar plot of the label composition of each cluster.

    :param fraction_sites: dict label value -> list with one entry per cluster
    :param directory: base directory of the analysis
    :param level: hierarchy level; selects the clusters file and the output name
    :param normalise: only changes the y label and the output file name
    :param label: name of the label, used in title and file name
    :param shuffled: whether the data come from shuffled labels (title/file name)
    :param algorithm: sub-folder and file prefix of the algorithm output
    """
    sns.set(font_scale=0.8)
    df_clusters = pd.read_csv("%s/%s/%s_level_%d_clusters.csv" % (directory, algorithm, algorithm, level),
                              header=[0])
    # one bar per cluster, numbered from 1
    x = np.arange(1, 1 + len(df_clusters.columns))
    fig = plt.figure(figsize=(25, 15))
    ax = fig.subplots()
    fraction_bar_plot(x, fraction_sites, ax)
    ax.set_xlabel("cluster", fontsize=35)
    ax.set_ylabel("fraction of nodes" if normalise else "number of nodes", fontsize=35)
    ax.set_title("%s%s distribution across clusters" % ("Shuffled " if shuffled else '', label), fontsize=35)
    # shrink the axis to leave room for the legend on its right
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
    n_labels = len(fraction_sites)
    n_col = round(n_labels / 20) if n_labels > 20 else 1
    ax.legend(loc='best', bbox_to_anchor=(1, 0.99), fontsize=35, ncol=n_col)
    ax.tick_params(axis='both', labelsize=35)
    plt.show()
    fig.savefig("%s/%s/%s%sclustercomposition_l%d_%s.pdf" % (
        directory, algorithm, "shuffled" if shuffled else '', "fraction_" if normalise else '', int(level), label))


def fraction_bar_plot(x, fraction_sites, ax=None):
    """
    Draw stacked bars, one layer per key of ``fraction_sites``.

    Layers whose maximum is 0 are skipped. A new figure is created when ``ax``
    is None.

    :return: the axis that was drawn on
    """
    # NOTE(review): this global has no reader in the visible source; it is kept
    # so that the module-level side effect of the original is preserved.
    global current_color
    current_color = -1
    if ax is None:
        fig = plt.figure(figsize=(15, 8))
        ax = fig.subplots()
    bottom = np.zeros(len(x))
    # a fresh painter so every plot starts from the first colour
    bar_colors = painter()
    for site, data in fraction_sites.items():
        if np.max(data) == 0:
            continue
        ax.bar(x, data, label=site, bottom=bottom, color=next(bar_colors))
        bottom = bottom + data
    return ax


def get_Palette(site):
    """
    Return the palette name of the first known tissue contained in ``site``.

    :return: palette name, or None when no known tissue is a substring of ``site``
    """
    # NOTE(review): 'YlRd' does not look like a standard matplotlib colormap
    # name (cf. 'YlOrRd') - confirm before relying on the 'Stomach' entry.
    palette_map = {
        'Brain': 'Blues',
        'Breast': 'Reds',
        'Kidney': 'Greens',
        'Lung': 'Oranges',
        'Thyroid': 'Greys',
        'Uterus': 'Purples',
        'Prostate': 'BuGn',
        'Ovary': 'BuPu',
        'Lymph Nodes': 'OrRd',
        'Soft Tissue': 'PuRd',
        'Esophagus': 'YlGn',
        'Stomach': 'YlRd',
        'Bone Marrow': 'PuBuGn',
        'Skin': 'YlOrRd',
        'Adipose Tissue': 'YlOrBr',
        'Blood': 'RdPu',
        'Pancreas': 'OrRd',
        'Testis': 'GnBu',
    }
    for tissue, palette in palette_map.items():
        if tissue in site:
            return palette
    return None


def get_cluster_given_l(l, directory, algorithm='topsbm'):
    """
    Read the clusters file of level ``l``.

    :return: dict column position -> array of the non-null entries of that column
    """
    df_clusters = pd.read_csv("%s/%s/%s_level_%d_clusters.csv" % (directory, algorithm, algorithm, l), header=[0],
                              index_col=None)
    return {i: df_clusters[c].dropna().values for i, c in enumerate(df_clusters.columns)}


def get_topic_given_l(l, directory, algorithm='topsbm'):
    """
    Read the topics file of level ``l``.

    :return: dict column position -> array of the non-null entries of that column
    """
    df_topics = pd.read_csv("%s/%s/%s_level_%d_topics.csv" % (directory, algorithm, algorithm, l), header=[0])
    return {i: df_topics[c].dropna().values for i, c in enumerate(df_topics.columns)}


def get_fraction_sites(cluster, df_files, label='primary_site', normalise=False):
    """
    Count, for every cluster, how many samples carry each value of ``label``.

    Samples not found in ``df_files`` (or whose label value is missing) are
    counted as ``'unknown'``.

    :param cluster: dict with keys 0..n-1 mapping to the samples of each cluster
    :param df_files: DataFrame indexed by sample name with a ``label`` column
    :param normalise: divide the counts by the cluster size
    :return: dict label value -> list of counts (or fractions); columns are in
        alphabetical order and rows are sorted by value, not by cluster index
    """
    fraction_sites = {}
    c_fraction_site = {}
    for site in np.concatenate([df_files[label].dropna().unique(), ["unknown"]]):
        fraction_sites[site] = []
        c_fraction_site[site] = 0
    for i, _ in enumerate(cluster):
        members = cluster[i]
        for sample in members:
            foundsample = get_file(sample, df_files)
            site = 'unknown' if foundsample is None else foundsample[label]
            if site not in c_fraction_site:
                # e.g. NaN label: was a KeyError, now counted as unknown
                site = 'unknown'
            c_fraction_site[site] += 1
        norm = float(len(members)) if normalise else 1
        for site in fraction_sites:
            if norm > 0:
                fraction_sites[site].append(c_fraction_site[site] / norm)
            else:
                fraction_sites[site].append(np.nan)
            # reset the counter for the next cluster
            c_fraction_site[site] = 0
    df = pd.DataFrame(data=fraction_sites).dropna(how='all', axis=0)
    # put first the columns that have high values in average (over non-zero entries)
    avgs = df.apply(lambda x: np.average(x.to_numpy()[x.to_numpy().nonzero()[0]]), axis=0)
    df = df.transpose()
    df.insert(0, 'avg', avgs)
    df = df.sort_values(by=['avg'], axis=0, ascending=False).drop('avg', axis=1).transpose()
    df = df.sort_values(by=list(df.columns), axis=0, ascending=False)
    # axis must be passed by keyword: pandas 2 no longer accepts it positionally
    return df.sort_index(axis=1).to_dict(orient='list')


def get_clustersinfo(cluster, fraction_sites):
    """
    Summarise every cluster from the output of ``get_fraction_sites``.

    :return: dict with, per cluster, 'maximum' ([largest share, its label]),
        'homogeneity' (1 minus sum of -x*log(x) over entries in (0, 1]),
        'sizes' (sum of the entries) and 'nclasses' (number of non-zero entries)
    """
    clustersinfo = {
        "maximum": [],
        "homogeneity": [],
        "sizes": [],
        "nclasses": []
    }
    for icluster in cluster:
        maximum = 0
        homo = 0
        size = 0
        nclass = 0
        site_maximum = ''
        cumulative = 0
        for site, data in fraction_sites.items():
            cdata = data[icluster]
            cumulative += cdata
            if cdata > maximum:
                maximum = cdata
                site_maximum = site
            if cdata > 0:
                nclass += 1
                # the entropy term is meaningful only for normalised fractions
                if cdata <= 1:
                    homo -= cdata * np.log(cdata)
            size += cdata
        if cumulative > 0:
            clustersinfo['maximum'].append([float(maximum) / cumulative, site_maximum])
        else:
            clustersinfo['maximum'].append([0, site_maximum])
        clustersinfo['sizes'].append(size)
        clustersinfo['nclasses'].append(nclass)
        clustersinfo['homogeneity'].append(1 - homo)
    return clustersinfo


def plot_maximum(clustersinfo, cluster, label, level, directory, clustersinfo_shuffle=None, algorithm='topsbm'):
    """Plot the sorted maximum fractions and their histogram, optionally against shuffled data, and save the figure."""
    fig = plt.figure(figsize=(15, 6))
    ax = fig.subplots(1, 2)
    bins = 10
    real = _maximum_fractions(clustersinfo)
    ax[0].plot(np.sort(real), marker='o', ms=25, ls='')
    ax[1].hist(np.sort(real), histtype='step', bins=bins, lw=4, density=True, range=(0.05, 1.05))
    shuffled = clustersinfo_shuffle is not None
    if shuffled:
        shuffled_fractions = _maximum_fractions(clustersinfo_shuffle)
        ax[0].plot(np.sort(shuffled_fractions), marker='o', ls='', ms=25)
        ax[1].hist(np.sort(shuffled_fractions), histtype='step', bins=bins, lw=4, density=True, range=(0.05, 1.05))
    # horizontal reference line at 0.8
    ax[0].plot(np.arange(len(cluster)), [0.8] * len(cluster), visible=True, ls='--')
    for axi in ax:
        axi.tick_params(axis='both', labelsize=20)
    ax[0].set_xlabel("cluster", fontsize=35)
    ax[0].set_ylabel("maximum fraction\nwith same %s" % label, fontsize=35)
    ax[0].set_ylim((0, 1.1))
    ax[1].set_xlabel("maximum fraction\nwith same %s" % label, fontsize=35)
    ax[1].set_ylabel("pdf", fontsize=35)
    plt.rc('xtick', labelsize=18)
    plt.rc('ytick', labelsize=18)
    plt.show()
    fig.savefig(
        "%s/%s/%scluster_maximum_l%d_%s.pdf" % (directory, algorithm, "shuffled" if shuffled else '', level, label))


def plot_maximum_size(clustersinfo, label, level, directory, clustersinfo_shuffle=None, algorithm='topsbm'):
    """Scatter the maximum fraction of each cluster against its size and save the figure."""
    fig = plt.figure(figsize=(15, 6))
    x = np.array(clustersinfo['sizes']).astype(int)
    y = _maximum_fractions(clustersinfo)
    plt.scatter(x, y, lw=10, label='clusters')
    plt.xlim(0, np.max(x) + np.max(x) / 10)
    # 1/size reference curve
    plt.plot(np.linspace(0.5, x.max()), 1. / np.linspace(0.5, x.max()), label='uniform')
    shuffled = clustersinfo_shuffle is not None
    if shuffled:
        x_shuffle = np.array(clustersinfo_shuffle['sizes']).astype(int)
        y_shuffle = _maximum_fractions(clustersinfo_shuffle)
        plt.scatter(x_shuffle, y_shuffle, lw=10, label='clusters shuffled')
        plt.xlim(0, np.max(x_shuffle) + np.max(x_shuffle) / 10)
    plt.xlabel("cluster size", fontsize=35)
    plt.ylabel("maximum fraction\nwith same %s" % label, fontsize=35)
    plt.ylim((0, 1.1))
    plt.legend(loc='best', fontsize=35)
    plt.rc('xtick', labelsize=18)
    plt.rc('ytick', labelsize=18)
    plt.show()
    fig.savefig(
        "%s/%s/%sclusterhomosize_l%d_%s.pdf" % (directory, algorithm, "shuffled" if shuffled else '', level, label))


def plot_maximum_label(clustersinfo, label, level, directory, clustersinfo_shuffle=None, algorithm='topsbm'):
    """Scatter the maximum fraction of each cluster against its number of labels and save the figure."""
    fig = plt.figure(figsize=(10, 6))
    x = np.array(clustersinfo['nclasses']).astype(int)
    y = _maximum_fractions(clustersinfo)
    plt.scatter(x, y, lw=10, alpha=0.9, label='clusters')
    # 1/n reference curve
    plt.plot(np.arange(1, np.max(x) + 2), 1. / np.arange(1, np.max(x) + 2), ls='--', c='cyan', label='uniform')
    plt.xlim(0.95, np.max(x) + 0.5)
    shuffled = clustersinfo_shuffle is not None
    if shuffled:
        x_shuffle = np.array(clustersinfo_shuffle['nclasses']).astype(int)
        y_shuffle = _maximum_fractions(clustersinfo_shuffle)
        plt.scatter(x_shuffle, y_shuffle, lw=10, alpha=0.9, label='clusters shuffled')
        plt.plot(np.arange(1, np.max(x_shuffle) + 2), 1. / np.arange(1, np.max(x_shuffle) + 2), ls='--', c='cyan',
                 label='')
        plt.xlim(0.95, np.max(x_shuffle) + 0.5)
    plt.xlabel("number of labels", fontsize=35)
    plt.ylabel("maximum fraction\nwith same %s" % label, fontsize=35)
    plt.ylim((0, 1.1))
    plt.rc('xtick', labelsize=16)
    plt.rc('ytick', labelsize=16)
    plt.legend(loc='lower right', fontsize=35)
    plt.show()
    fig.savefig(
        "%s/%s/%scluster_homon_l%d_%s.pdf" % (directory, algorithm, "shuffled" if shuffled else '', level, label))


def plot_labels_size(clustersinfo, label, level, directory, clustersinfo_shuffle=None, algorithm='topsbm'):
    """Scatter the number of labels of each cluster against its size and save the figure."""
    fig = plt.figure(figsize=(10, 6))
    x = np.array(clustersinfo['sizes']).astype(float)
    y = np.array(clustersinfo['nclasses']).astype(int)
    plt.xlim(x.min() - 10, x.max() + 5)
    plt.ylim(y.min() - 2, y.max() + 5)
    plt.scatter(x, y, lw=10, alpha=0.9, label='clusters')
    shuffled = clustersinfo_shuffle is not None
    if shuffled:
        x_shuffle = np.array(clustersinfo_shuffle['sizes']).astype(float)
        y_shuffle = np.array(clustersinfo_shuffle['nclasses']).astype(int)
        plt.scatter(x_shuffle, y_shuffle, lw=10, alpha=0.9, label='clusters shuffled')
        plt.xlim(x.min() - 10, x_shuffle.max() + 5)
        plt.ylim(y.min() - 2, y_shuffle.max() + 8)
    plt.xlabel("cluster size", fontsize=35)
    plt.ylabel("number of labels", fontsize=35)
    plt.legend(loc='upper right', fontsize=35)
    plt.rc('xtick', labelsize=16)
    plt.rc('ytick', labelsize=16)
    plt.show()
    fig.savefig(
        "%s/%s/%scluster_shuffle_label_size_l%d_%s.pdf" % (
            directory, algorithm, "shuffled" if shuffled else '', level, label))


def make_heatmap(fraction_sites, directory, label, level, shuffled=False, normalise=False, algorithm='topsbm'):
    """Draw and save a heatmap (labels x clusters) of ``fraction_sites``, skipping labels that are 0 everywhere."""
    sns.set(font_scale=2)
    found_classes = [site for site, data in fraction_sites.items() if np.max(data) != 0]
    # number of clusters = length of any of the lists
    n_clusters = len(next(iter(fraction_sites.values())))
    x = np.arange(1, 1 + n_clusters)
    fig = plt.figure(figsize=(30, 10))
    fig.subplots(1)
    sns.heatmap(pd.DataFrame(data=fraction_sites).loc[:, found_classes].transpose(), vmin=0, cmap="RdYlBu_r",
                xticklabels=x)
    fig.savefig("%s/%s/%sheatmap_cluster%s_l%d_%s.pdf" % (
        directory, algorithm, "shuffled" if shuffled else '', "fraction_" if normalise else '', int(level), label))


def get_file(sample, df_file):
    """
    Return the row of ``df_file`` whose index contains ``sample`` as a substring.

    :return: the first matching row, or None when nothing matches
    """
    for fullsample in df_file.index.values:
        if sample in fullsample:
            return df_file.loc[fullsample, :]
    return None


def define_labels(cluster, df_files, label='primary_site', verbose=False):
    """
    Build the true / predicted label lists for all the samples in ``cluster``.

    Samples that cannot be looked up get an empty string in both lists.

    :param cluster: dict cluster id -> samples
    :return: (true labels encoded as integers by ``np.unique``, list of cluster ids)
    """
    true_labels = []
    predicted_labels = []
    for c in cluster:
        if verbose:
            print(c)
        for sample in cluster[c]:
            try:
                true_labels.append(get_file(sample, df_files)[label])
                predicted_labels.append(c)
            except Exception:
                # sample (or label column) not found: keep the lists aligned
                true_labels.append('')
                predicted_labels.append('')
                print(*sys.exc_info())
                print("error searching %s in %s" % (label, sample))
    _, true_labels = np.unique(true_labels, return_inverse=True)
    return true_labels, predicted_labels


def add_score_lines(ax, scores, V="V", labels=None, h=False, c=False, alpha=0.8, **kwargs):
    '''
    Add to ax the lines stored in scores.

    Homogeneity and completeness are added if required by h and c.

    :param scores: dict label -> dict with at least 'xl' and the key given by ``V``
    :param labels: labels to draw; every key of ``scores`` when None
    :raises ValueError: when the scores and 'xl' of a label differ in length
    '''
    colors = {
        'primary_site': 'blue',
        'hsbm': 'blue',
        'secondary_site': 'red',
        'status': 'red',
        'hSBM': 'blue',
        'mixed': 'green',
        'hierhsbm': 'purple',
        'hsbm->hierachical': 'purple',
        'disease_type': 'red',
        'shuffle': 'orange',
        'tm': 'darkcyan',
        'cc': 'darkred',
        'disease_tissue': 'purple',
        'hierarchical': 'darkgreen',
        'lda': 'violet',
        'RPPA Clusters': 'red',
        'wgcna': 'purple',
        "Subtype_Selected": "red",
        "BRCA_Subtype_PAM50": "blue"
    }
    if labels is None:
        # the default used to crash when iterated; draw everything instead
        labels = list(scores)
    xl = None
    for label in labels:
        if label not in scores:
            print("No score for %s" % label)
            continue
        if label not in colors:
            colors[label] = next(color_iterator)
        xl = scores[label]['xl']
        if h:
            # NOTE(review): lw=150 looks like a typo next to lw=10 / lw=20 below - confirm
            ax.plot(xl, scores[label]['h'], ls='-.', c=colors[label], marker='x', lw=150, ms=45, alpha=alpha,
                    label='homogeneity - %s' % label)
        if c:
            ax.plot(xl, scores[label]['c'], ls=':', c=colors[label], marker='<', lw=10, ms=45, alpha=alpha,
                    label='completness - %s' % label)
        if len(scores[label][V]) != len(xl):
            raise ValueError("xl has got wrong lenght")
        ax.plot(xl, scores[label][V], label='%s' % label, ls='-', c=colors[label], marker='o', lw=20, ms=45,
                **kwargs)
    if xl is not None:
        # nothing to customise when no label was drawn
        customize_metric_plot(ax, xl)


def customize_metric_plot(ax, xl):
    """Apply ticks, axis labels, limits, log x scale and legend used by the score plots."""
    ax.tick_params(labelsize=35, width=8, length=20)
    ax.tick_params(which="minor", labelsize=35, width=5, length=15)
    ax.set_xlabel("Number of clusters", fontsize=40)
    ax.set_ylabel("NMI score", fontsize=40)
    ax.set_ylim((0, 1.1))
    ax.set_xlim(1, np.max(xl) * 1.1)
    ax.set_xscale('log')
    # shrink the axis to leave room for the legend on its right
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
    ax.legend(loc='best', bbox_to_anchor=(1, 0.85), fontsize=35, ncol=1)


def plot_topic_size(directory, l, algorithm='topsbm'):
    """Plot, in log-log scale, the distribution of the topic sizes of level ``l`` and save it as png."""
    df_topics = pd.read_csv("%s/%s/%s_level_%d_topics.csv" % (directory, algorithm, algorithm, l))
    sizes = [len(df_topics.loc[:, t].dropna()) for t in df_topics.columns]
    bins = np.linspace(0.5, np.max(sizes) + 0.5, int((np.max(sizes) + 1) / (np.max(sizes) / 5)))
    # NOTE(review): this histogram is drawn on whatever figure is current,
    # before the new figure below is created - confirm that this is intended.
    bin_counts, bin_edges, _ = plt.hist(sizes, histtype='step', lw=2, bins=bins)
    fig = plt.figure()
    ax = fig.subplots()
    ax.set_title("[%d topics, level: %d]" % (len(df_topics.columns), l))
    x = (bin_edges[:-1] + bin_edges[1:]) / 2
    filled = np.nonzero(bin_counts)
    ax.plot(x[filled], bin_counts[filled])
    # power-law reference curve
    ax.plot(x, 1e4 / np.power(x, 5))
    ax.set_xlabel("topic size\n(number of genes)", fontsize=35)
    ax.set_ylabel("number of topic", fontsize=35)
    ax.set_xscale('log')
    ax.set_yscale('log')
    plt.show()
    fig.savefig("%s/%s/topic_size_level%d.png" % (directory, algorithm, l))


def get_candles(directory, level, df_mv, ax, algorithm='topsbm'):
    """
    Compute candlestick-like statistics of ``df_mv['occurrence']`` for every topic.

    high/low are mean +/- std clipped to [0, 1]; open/close are the 75% / 25%
    quantiles clipped to [0, 1]. The axis title, labels and ticks are set on ``ax``.

    :return: dict with the lists 'open', 'high', 'low', 'close' and 'size'
    """
    df_topics = pd.read_csv("%s/%s/%s_level_%d_topics.csv" % (directory, algorithm, algorithm, level))
    candles = {
        'open': [],
        'high': [],
        'low': [],
        'close': [],
        'size': []
    }
    for topic in df_topics.columns:
        subarr = df_mv.loc[df_topics[topic].dropna(), :]['occurrence'].values
        avg = np.average(subarr)
        std = np.std(subarr)
        q = np.quantile(subarr, [0.25, 0.75])
        candles['high'].append(np.min([1, avg + std]))
        candles['open'].append(np.min([q[1], 1]))
        candles['close'].append(np.max([q[0], 0]))
        candles['low'].append(np.max([0, avg - std]))
        candles['size'].append(len(subarr))
    n_topics = len(df_topics.columns)
    ax.set_title("[level: %d]" % level)
    ax.set_ylabel('$O_i$', fontsize=35)
    ax.set_xlim(-1, n_topics)
    ax.set_xticks([i + 1 for i in range(-1, n_topics)])
    # label only the first tick and every fifth topic
    ax.set_xticklabels(
        ["Topic %d" % (i + 2) if ((i + 2) % 5 == 0 or i == -1) else '' for i in range(-1, n_topics)],
        rotation=60)
    return candles


def get_tissue_style(tissue):
    """
    Return the plot style associated with a tissue / subtype name.

    :return: tuple (marker, colour, line style); the colour is 'k' when no
        known fragment is contained in ``tissue``
    """
    if 'gtex' in tissue:
        marker, ls = 'o', '-'
    elif 'tcga' in tissue:
        marker, ls = 'x', '--'
    else:
        marker, ls = '.', '-.'
    # ordered: the first fragment found in the name decides the colour
    colour_by_fragment = (
        ('reast', 'darkcyan'),
        ('olon', 'b'),
        ('hyroid', 'y'),
        ('terus', 'pink'),
        ('ladder', 'gray'),
        ('sophagus', 'brown'),
        ('ung', 'magenta'),
        ('tomach', 'lime'),
        ('kin', 'wheat'),
        ('ancreas', 'forestgreen'),
        ('Adrenal Gland', 'aqua'),
        ('Adipose Tissue', 'brown'),
        ('erve', 'royalblue'),
        ('lood', 'red'),
        ('idney', 'mediumslateblue'),
        ('eart', 'darkred'),
        ('rain', 'darkgray'),
        ('estis', 'darkkhaki'),
        ('LumA', 'pink'),
        ('LumB', 'purple'),
        ('Normal', 'blue'),
        ('Basal', 'darkred'),
        ('Her2', "green"),
    )
    c = next((colour for fragment, colour in colour_by_fragment if fragment in tissue), 'k')
    return (marker, c, ls)


def topic_distr_sample(doc, df, ax=None):
    """
    Draw a pie chart of the topic distribution of document ``doc``.

    :param df: DataFrame with a 'doc' column; the values from the third column
        on are used as the slices
    :param ax: axis to draw on; a new figure is created when None
    """
    if ax is None:
        fig = plt.figure()
        ax = fig.subplots()
    ax.set_title("Topic distribution: %s" % doc)
    doc_rows = df[df['doc'] == doc]
    # name only the slices that weigh at least 5%
    labels = [l if doc_rows.loc[:, l].values[0] >= 0.05 else '' for l in df.columns[2:]]
    patches, texts, autotexts = ax.pie(doc_rows.values[0][2:], labels=labels,
                                       autopct=lambda p: '%.1f%s' % (p, '%') if p >= 5 else '',
                                       textprops={'fontsize': 20, 'color': 'white', 'wrap': True})
    for t in texts:
        t.set_fontsize(18)
        t.set_wrap(True)
        t.set_color('black')
    plt.show()


def topic_distr_isample(idoc, df, ax=None):
    """Same as ``topic_distr_sample`` but the document is selected by its 'i_doc' value."""
    # df has to be forwarded: it used to be dropped, so ax ended up in its place
    topic_distr_sample(df[df['i_doc'] == idoc]['doc'].values[0], df, ax)


def add_tumor_location(df_files):
    """
    Add in place a 'disease_tissue' column built as ``primary_site[disease_type]``.

    The column is inserted as third column, or overwritten when it already
    exists (so the function can be called more than once on the same frame).
    """
    values = ['%s[%s]' % pair for pair in zip(df_files['primary_site'], df_files['disease_type'])]
    if 'disease_tissue' in df_files.columns:
        df_files['disease_tissue'] = values
    else:
        df_files.insert(min(2, df_files.shape[1]), 'disease_tissue', values)


def get_scores(directory, labels, df_files=None, algorithm='topsbm', verbose=False,
               metric=metrics.cluster.v_measure_score):
    """
    Score the clusters of every available level against each label.

    :param directory: base directory of the analysis
    :param labels: columns of the files table to use as ground truth
    :param df_files: files table; read from ``directory/files.dat`` when None.
        When it has a 'disease_type' column a 'disease_tissue' column is added in place.
    :param algorithm: sub-folder and file prefix of the algorithm output
    :param verbose: print the levels that were scored
    :param metric: callable(true, predicted) stored under 'V'
    :return: dict label -> {'h', 'c', 'V', 'xl'} where 'xl' is the number of
        clusters of each point; with two or more labels a 'mixed' entry combines
        the homogeneity of the first label with the completeness of the second
    """
    if df_files is None:
        df_files = pd.read_csv("%s/files.dat" % directory, index_col=[0], header=[0]).dropna(how='all', axis=0)
    if df_files.columns.isin(['disease_type']).any():
        add_tumor_location(df_files)
    scores = {}
    for label in labels:
        xl = []
        scores[label] = {
            'h': [],
            'c': [],
            'V': [],
            'xl': []
        }
        level = get_max_available_L(directory, algorithm)
        for level in np.arange(level + 1):
            try:
                true_labels, predicted_labels = define_labels(get_cluster_given_l(level, directory, algorithm),
                                                              df_files, label=label)
                homogeneity = metrics.cluster.homogeneity_score(true_labels, predicted_labels)
                completeness = metrics.cluster.completeness_score(true_labels, predicted_labels)
                v_score = metric(true_labels, predicted_labels)
                n_clusters = len(np.unique(predicted_labels))
            except Exception:
                print(*sys.exc_info())
                print("Skipping level ", level)
                continue
            # append only once everything succeeded, so the lists stay aligned
            scores[label]['h'].append(homogeneity)
            scores[label]['c'].append(completeness)
            scores[label]['V'].append(v_score)
            xl.append(n_clusters)
            if verbose:
                print(level)
        # add the first point where all samples are in the same cluster by definition
        if 1 not in xl:
            # keep xl monotonic: put the point at the side with fewer clusters
            idx = 0 if (xl and xl[0] < xl[-1]) else len(xl)
            true_labels, _ = define_labels(get_cluster_given_l(level, directory, algorithm), df_files, label=label)
            homogeneity, completeness, v_score, n_clusters = _trivial_partition_scores(true_labels, metric)
            scores[label]['h'].insert(idx, homogeneity)
            scores[label]['c'].insert(idx, completeness)
            scores[label]['V'].insert(idx, v_score)
            xl.insert(idx, n_clusters)
        scores[label]['xl'] = xl
    if len(labels) >= 2:
        h = np.array(scores[labels[0]]['h'])
        c = np.array(scores[labels[1]]['c'])
        scores['mixed'] = {
            'h': h,
            'c': c,
            'V': 2 * h * c / (h + c),
            # without 'xl' the entry could be neither plotted nor normalised
            'xl': list(scores[labels[0]]['xl'])
        }
    return scores


def shuffle_files(df_files, label, random_state=42):
    """
    Return a copy of ``df_files`` with the values of column ``label`` shuffled.

    :param random_state: seed forwarded to ``sklearn.utils.shuffle``
    :raises AttributeError: when ``label`` is not a column of ``df_files``
    """
    if label not in df_files.columns:
        raise AttributeError(f"{label} non available in:{df_files.columns}")
    df_files_shuffled = df_files.copy()
    # the seed used to be ignored, which made every call non reproducible
    df_files_shuffled[label] = shuffle(df_files_shuffled[label].values, random_state=random_state)
    return df_files_shuffled


def get_scores_shuffled(directory, df_files, algorithm='topsbm', label='primary_site', verbose=False,
                        metric=metrics.cluster.v_measure_score):
    """
    Score the clusters of every available level against a shuffled version of ``label``.

    :return: dict with the lists 'h', 'c', 'V' and 'xl' (number of clusters);
        the point where all the samples are in one cluster is always added
    """
    scores = {
        'h': [],
        'c': [],
        'V': [],
        'xl': []
    }
    xl = []
    level = get_max_available_L(directory, algorithm)
    df_files_shuffled = shuffle_files(df_files.copy(), label, random_state=42)
    try:
        for level in np.arange(0, level + 1):
            try:
                if verbose:
                    print(level)
                clusters = get_cluster_given_l(level, directory, algorithm=algorithm)
            except Exception:
                print("Skipping shuffled level ", level)
                continue
            _, predicted_labels = define_labels(clusters, df_files, label=label)
            true_labels, _ = define_labels(clusters, df_files_shuffled, label=label)
            homogeneity = metrics.cluster.homogeneity_score(true_labels, predicted_labels)
            completeness = metrics.cluster.completeness_score(true_labels, predicted_labels)
            v_score = metric(true_labels, predicted_labels)
            # append only once everything succeeded, so the lists stay aligned
            scores['h'].append(homogeneity)
            scores['c'].append(completeness)
            scores['V'].append(v_score)
            xl.append(len(np.unique(predicted_labels)))
    except Exception:
        print(*sys.exc_info())
        print("shuffled files not found")
    # add the first point where all samples are in the same cluster by definition
    idx = 0 if (xl and xl[0] < xl[-1]) else len(xl)
    true_labels, _ = define_labels(get_cluster_given_l(level, directory, algorithm), df_files, label=label)
    homogeneity, completeness, v_score, n_clusters = _trivial_partition_scores(true_labels, metric)
    scores['h'].insert(idx, homogeneity)
    scores['c'].insert(idx, completeness)
    scores['V'].insert(idx, v_score)
    xl.insert(idx, n_clusters)
    scores['xl'] = xl
    return scores


def getclustersizesarray(directory, l=3, algorithm='topsbm'):
    """
    Return the number of clusters of each level from 0 to ``l``.

    When some level cannot be read, level 0 is dropped and only the readable
    levels between 1 and ``l`` are returned.
    """
    return _sizes_per_level(get_cluster_given_l, directory, l, algorithm)


def gettopicsizesarray(directory, l=3, algorithm='topsbm'):
    """
    Return the number of topics of each level from 0 to ``l``.

    When some level cannot be read, level 0 is dropped and only the readable
    levels between 1 and ``l`` are returned.
    """
    return _sizes_per_level(get_topic_given_l, directory, l, algorithm)


def plot_sizes(level, directory, algorithm, ax=None):
    """Plot the histogram of the cluster sizes of ``level`` and save it as pdf."""
    cluster = get_cluster_given_l(level, directory, algorithm=algorithm)
    if ax is None:
        fig = plt.figure(figsize=(10, 6))
        ax = fig.subplots()
    sizes = [len(members) for members in cluster.values()]
    ax.set_xlabel("size", fontsize=24)
    ax.set_ylabel("number of clusters", fontsize=24)
    ax.set_title("Cluster sizes at level %d" % level)
    ax.hist(sizes, histtype='step', lw=4)
    # save the figure that owns ax, which is not necessarily the current one
    ax.figure.savefig("%s/%s/sizes_distr_level%d.pdf" % (directory, algorithm, level))
    plt.show()
def clusteranalysis(directory, labels, algorithm='topsbm') -> None:
    """
    Perform analyses of an algorithm output

    For every label and every level the cluster composition, the heatmap and
    the maximum/size/label plots are produced, both for the real labels and
    for shuffled ones. Afterwards the cluster and topic sizes are written to
    ``directory/clustersizes.txt`` and ``directory/topicsizes.txt`` and, for
    each level, the cluster id of every sample is saved to
    ``directory/algorithm/algorithm_level_<l>_labels.csv``.

    :param directory: where to search the data
    :param labels: ground truth label to search. This should be in a file called directory/files.dat
    :param algorithm: name of the folder in which data are stored
    """
    l_max = get_max_available_L(directory, algorithm)
    # fail early when the clusters file of the highest level is missing
    pd.read_csv("%s/%s/%s_level_%d_clusters.csv" % (directory, algorithm, algorithm, l_max), header=[0])
    df_files = pd.read_csv("%s/files.dat" % directory, index_col=[0], header=[0]).dropna(axis=1, how='all').dropna(
        axis=0, how='all')
    # keep only the samples that were actually clustered
    samples = pd.read_csv("%s/%s/%s_level_0_clusters.csv" % (directory, algorithm, algorithm), header=[0]).astype(
        str).values.ravel()
    samples = samples[samples != "nan"]
    df_files = df_files.reindex(index=samples).dropna(how="all", axis=0).fillna("unknown")
    for normalise in [True, False]:
        for label in labels:
            for level in np.arange(l_max + 1)[::-1]:
                print(normalise, label, level)
                try:
                    cluster = get_cluster_given_l(level, directory, algorithm=algorithm)
                    fraction_sites = get_fraction_sites(cluster, df_files=df_files, label=label,
                                                        normalise=normalise)
                    clustersinfo = get_clustersinfo(cluster, fraction_sites)
                except Exception:
                    print(*sys.exc_info())
                    # nothing valid to plot: do not reuse the data of another level
                    continue
                try:
                    plot_cluster_composition(fraction_sites, directory, level, label=label, normalise=normalise,
                                             algorithm=algorithm)
                    make_heatmap(fraction_sites, directory, label, level, normalise=normalise, algorithm=algorithm)
                    if not normalise:
                        plot_maximum(clustersinfo, cluster, label, level, directory, algorithm=algorithm)
                        plot_maximum_size(clustersinfo, label, level, directory, algorithm=algorithm)
                        plot_maximum_label(clustersinfo, label, level, directory, algorithm=algorithm)
                        plot_sizes(level, directory, algorithm=algorithm)
                except Exception:
                    print(*sys.exc_info())
                try:
                    # the shuffled table goes through a file on disk
                    shuffle_files(df_files, label).to_csv("%s/files_shuffles.dat" % directory, index=True)
                    fraction_sites_shuffle = get_fraction_sites(
                        cluster,
                        df_files=pd.read_csv("%s/files_shuffles.dat" % directory, index_col=[0]),
                        label=label,
                        normalise=normalise)
                    clustersinfo_shuffle = get_clustersinfo(cluster, fraction_sites_shuffle)
                    plot_cluster_composition(fraction_sites_shuffle, directory, level, label=label, shuffled=True,
                                             normalise=normalise, algorithm=algorithm)
                    if not normalise:
                        plot_maximum(clustersinfo, cluster, label, level, directory, clustersinfo_shuffle,
                                     algorithm=algorithm)
                        plot_maximum_size(clustersinfo, label, level, directory, clustersinfo_shuffle,
                                          algorithm=algorithm)
                        plot_maximum_label(clustersinfo, label, level, directory, clustersinfo_shuffle,
                                           algorithm=algorithm)
                        plot_labels_size(clustersinfo, label, level, directory, clustersinfo_shuffle,
                                         algorithm=algorithm)
                except Exception:
                    print(*sys.exc_info())
    # define scores
    # NOTE(review): the result is not used below; the call is kept because it
    # was part of the original flow (it prints and may raise) - confirm whether
    # it can be removed.
    get_scores(directory, labels, algorithm=algorithm)
    try:
        xl = getclustersizesarray(directory, l_max)
        with open("%s/clustersizes.txt" % directory, 'w') as f:
            f.writelines("%d\n" % x for x in xl)
    except Exception:
        print("cannot save clustersizes.txt")
    try:
        xl = gettopicsizesarray(directory, l_max)
        with open("%s/topicsizes.txt" % directory, 'w') as f:
            f.writelines("%d\n" % x for x in xl)
    except Exception:
        print("cannot save topicsizes.txt")
    # save files for R analysis: one file per level with the cluster of each sample
    for level in np.arange(l_max + 1):
        predicted_labels = define_labels(get_cluster_given_l(level, directory, algorithm=algorithm), df_files,
                                         label=labels[0])[1]
        pd.DataFrame(data=predicted_labels, columns=['l%d' % level]).to_csv(
            "%s/%s/%s_level_%d_labels.csv" % (directory, algorithm, algorithm, level), header=True, index=False)
def get_max_available_L(directory, algorithm='topsbm'):
    """
    Get maximum layer available for algorithm

    The level is the integer that follows ``level_`` in the names of the files
    found in ``directory/algorithm``, so the algorithm name itself may contain
    underscores. Files without a numeric level are ignored.

    :param directory: base directory of the analysis
    :param algorithm: name of the sub-folder to scan
    :return: the highest level found
    :raises ValueError: when no file carries a level
    """
    levels = []
    for entry in os.listdir("%s/%s" % (directory, algorithm)):
        _, marker, tail = entry.partition("level_")
        if not marker:
            continue
        level = tail.split("_")[0]
        if level.isdigit():
            levels.append(level)
    if not levels:
        raise ValueError("no level files found in %s/%s" % (directory, algorithm))
    return np.array(levels, dtype=int).max()


def out_to_file(out, index, name='new_method', l=0):
    """
    Save a flat clustering in the ``<name>_level_<l>_clusters.csv`` format.

    The file has one column per cluster ("Cluster 1", "Cluster 2", ...) holding
    the names of its members; shorter columns are padded with empty cells.

    :param out: 1-D array with the cluster id (0-based) of every object
    :param index: names of the objects, aligned with ``out``
    :param name: prefix (may include a path) of the output file
    :param l: level written in the file name
    """
    print("saving clusters")
    names = np.asarray(index)
    n_objects = len(index)
    df_clusters = pd.DataFrame(index=np.arange(n_objects))
    # clusters are inserted at position 0 in reverse order, so that the
    # columns end up sorted by cluster id
    for c in np.arange(out.max() + 1)[::-1]:
        # flat positional indexing: 2-D indexing of an Index is rejected by pandas 2
        c_objects = names[np.flatnonzero(out == c)]
        padding = np.full(n_objects - len(c_objects), np.nan)
        df_clusters.insert(0, "Cluster %d" % (c + 1), np.concatenate((c_objects, padding)))
    df_clusters.dropna(axis=0, how='all', inplace=True)
    df_clusters.to_csv("%s_level_%d_clusters.csv" % (name, l), index=False, header=True)


# normalise to hsbm
def normalise_score(scores: dict, base_algorithm="hsbm", operation=lambda x, y: x / y, epsilon=1e-6) -> None:
    """
    Save scaled data to scores[algorithm]["norm_V"].

    For every algorithm the "V" scores are compared with the ones of
    ``base_algorithm`` interpolated at the same number of clusters ("xl").

    :param scores: dict algorithm -> dict with the lists "xl" and "V"
    :param base_algorithm: key of ``scores`` used as reference
    :param operation: callable(scores, baseline) producing the scaled values
    :param epsilon: added to both operands, e.g. to avoid dividing by zero
    """
    # the last point of the reference is dropped and the rest reversed before
    # interpolating; per the original note, the constructed point is left out
    # and np.interp wants sorted data
    base_xl = scores[base_algorithm]["xl"][:-1][::-1]
    base_v = scores[base_algorithm]["V"][:-1][::-1]
    for algorithm in scores:
        baseline = np.interp(scores[algorithm]["xl"], base_xl, base_v)
        scores[algorithm]["norm_V"] = operation(np.array(scores[algorithm]["V"]) + epsilon, baseline + epsilon)