# Source code for topicpy.TCGA_files.TCGA_files

#  Copyright (c) 2020 fvalle
#
#  Permission is hereby granted, free of charge, to any person
#  obtaining a copy of this software and associated documentation
#  files (the "Software"), to deal in the Software without
#  restriction, including without limitation the rights to use,
#  copy, modify, merge, publish, distribute, sublicense, and/or sell
#  copies of the Software, and to permit persons to whom the
#  Software is furnished to do so, subject to the following
#  conditions:
#
#  The above copyright notice and this permission notice shall be
#  included in all copies or substantial portions of the Software.
#
#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
#  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
#  OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
#  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
#  HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
#  WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
#  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
#  OTHER DEALINGS IN THE SOFTWARE.

import json
import textwrap

import numpy as np
import pandas as pd
import requests as rq
from matplotlib import pyplot as plt

# URL of the "cases" endpoint of the API hosted at api.gdc.cancer.gov.
cases_endpt = 'https://api.gdc.cancer.gov/cases'

# Comma-separated list of the case attributes requested from the endpoint;
# it is sent as the "fields" query parameter and is also split back into
# column names when an empty result table is created.
fields = ','.join((
    "submitter_id",
    "case_id",
    "primary_site",
    "disease_type",
    "project.project_id",
    "diagnoses.tumor_stage",
    "diagnoses.tumor_grade",
    "diagnoses.primary_diagnosis",
    "diagnoses.classification_of_tumor",
    "annotations.classification",
    "samples.tumor_code",
    "samples.tumor_descriptor",
    "annotations.case_id",
))

def queryFile(idFile, timeout=60):
    """
    Get information for file

    Sends one GET request to ``cases_endpt`` asking, in TSV format, for the
    attributes listed in ``fields`` of the case whose ``files.file_name``
    equals *idFile*, and turns the first returned record into a table.

    :param str idFile: file TCGA-id
    :param float timeout: seconds to wait for the server before the request
        is abandoned (without it an unresponsive server blocks forever)
    :return: one-row :class:`pandas.DataFrame` (index ``0``) whose columns
        are the header cells of the TSV response
    :raises requests.HTTPError: if the server answers with an error status
    :raises IndexError: if the response contains no record after the header
    """
    filters = {
        "op": "in",
        "content": {
            "field": "files.file_name",
            "value": [idFile]
        }
    }
    params = {
        "fields": fields,
        "filters": json.dumps(filters),
        "format": "TSV",
        "size": "1"
    }
    print("quering...%s"%idFile)
    response = rq.get(cases_endpt, params=params, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error body as if it were TSV.
    response.raise_for_status()
    # Records are separated on '\r'; the '\n' left at the start of every
    # following record is stripped below.
    records = response.content.decode("utf-8").split('\r')
    if len(records) < 2:
        raise IndexError("no record returned for file %s" % idFile)
    header = records[0].split('\t')
    values = np.array(records[1].replace('\n', '').split('\t'))
    return pd.DataFrame(data=values.reshape(1, len(values)), columns=header, index=[0])
def _draw_pies(df, whatToLookFor):
    """
    Draw one pie chart per entry of *whatToLookFor*, side by side.

    :param df: table (or mapping) indexed by the names in *whatToLookFor*
    :param whatToLookFor: sequence of column names, one pie per name
    :return: the matplotlib figure holding the pies (not saved, not closed)
    """
    fig = plt.figure(figsize=(60, 15))
    # squeeze=False always yields a 2-D array of axes; with the default
    # squeeze a single column comes back as a bare Axes and cannot be indexed.
    axes = fig.subplots(1, len(whatToLookFor), squeeze=False)[0]
    for ax, lookFor in zip(axes, whatToLookFor):
        column = df[lookFor]
        # Use the underlying array when the column has one, else the column itself.
        datatotestarr = getattr(column, "values", column)
        utype, counts = np.unique(datatotestarr, return_counts=True)
        total = len(datatotestarr)
        # Break long category names into lines of at most 20 characters.
        labels = ['\n'.join(textwrap.wrap(str(label), 20)) for label in utype]
        ax.set_title(lookFor, fontsize=44)
        # autopct receives a percentage; convert it back to an absolute count.
        _, texts, _ = ax.pie(
            counts,
            labels=labels,
            autopct=lambda p, total=total: '#:%.0f' % (p * total / 100),
            textprops={'fontsize': 30, 'color': 'white', 'wrap': True},
        )
        for t in texts:
            t.set_fontsize(24)
            t.set_wrap(True)
            t.set_color('black')
    return fig


def makePie(df, level, c, whatToLookFor=('disease_type',)):
    """
    Save pie charts of the category counts found in the given columns.

    The image is written to ``cluster_pie_level_<level>_cluster_<c>.png``
    in the current working directory.

    :param df: table (or mapping) indexed by the names in *whatToLookFor*
    :param int level: number inserted in the output file name
    :param int c: number inserted in the output file name
    :param whatToLookFor: sequence of column names, one pie per name
    """
    # NOTE(review): the figure is left open, as before; callers producing many
    # charts in a loop may want to close figures themselves.
    fig = _draw_pies(df, whatToLookFor)
    fig.savefig("cluster_pie_level_%d_cluster_%d.png"%(level, c))


def makeTopicPie(df, level, whatToLookFor=('disease_type',)):
    """
    Save pie charts of the category counts found in the given columns.

    The image is written to ``topic_pie_level_<level>.png`` in the current
    working directory.

    :param df: table (or mapping) indexed by the names in *whatToLookFor*
    :param int level: number inserted in the output file name
    :param whatToLookFor: sequence of column names, one pie per name
    """
    fig = _draw_pies(df, whatToLookFor)
    fig.savefig("topic_pie_level_%d.png"%level)
def queryFiles(files):
    """
    Get info for a list of files

    Calls :func:`queryFile` once per entry of *files* and stacks the
    resulting one-row tables.

    :param list files: list of TCGA-ids
    :return: :class:`pandas.DataFrame` with one row per file and a fresh
        0..n-1 index; when *files* is empty, an empty table whose columns
        are the names listed in ``fields``
    """
    # The empty seed frame makes every requested field appear as a column
    # even when no response carries it.
    frames = [pd.DataFrame(columns=fields.split(','))]
    frames.extend(queryFile(f) for f in files)
    if len(frames) == 1:
        return frames[0]
    # One concat instead of repeated DataFrame.append: append was removed in
    # pandas 2.0 and copied the whole table on every iteration.
    return pd.concat(frames, ignore_index=True, sort=True)
def get_tcga_tissue(sample, files_path="/Users/filippo/Developer/tesi/results/fpkm_all/files.dat"):
    """
    Get the metadata row of a tcga sample

    Reads the CSV table at *files_path* (first row as header, first column
    as index) and returns the row of the first index entry that contains
    *sample* as a substring. The whole row is returned, not only a
    ``primary_site`` value.

    :param str sample: sample id, or any part of the full id used as index
    :param str files_path: location of the CSV table; the default is the
        path originally hard-coded in this function
    :return: the matching row (:class:`pandas.Series`), or ``None`` when no
        index entry contains *sample*
    """
    samples = pd.read_csv(files_path, index_col=[0], header=0)
    for fullsample in samples.index.values:
        if sample in fullsample:
            return samples.loc[fullsample, :]
    return None