Source code for topicpy.TCGA_files.TCGA_files

#  Copyright (c) 2020 fvalle
#
#  Permission is hereby granted, free of charge, to any person
#  obtaining a copy of this software and associated documentation
#  files (the "Software"), to deal in the Software without
#  restriction, including without limitation the rights to use,
#  copy, modify, merge, publish, distribute, sublicense, and/or sell
#  copies of the Software, and to permit persons to whom the
#  Software is furnished to do so, subject to the following
#  conditions:
#
#  The above copyright notice and this permission notice shall be
#  included in all copies or substantial portions of the Software.
#
#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
#  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
#  OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
#  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
#  HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
#  WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
#  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
#  OTHER DEALINGS IN THE SOFTWARE.

import requests as rq
import json
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

cases_endpt = 'https://api.gdc.cancer.gov/cases'

fields = [
    "submitter_id",
    "case_id",
    "primary_site",
    "disease_type",
    "project.project_id",
    "diagnoses.tumor_stage",
    "diagnoses.tumor_grade",
    "diagnoses.primary_diagnosis",
    "diagnoses.classification_of_tumor",
    "annotations.classification",
    "samples.tumor_code",
    "samples.tumor_descriptor",
    "annotations.case_id",
    ]

fields = ','.join(fields)

[docs]def queryFile(idFile):
    """
    Get information for file
    
    :param str idFile: file TCGA-id
    """
    filters = {
    "op": "in",
    "content":{
        "field": "files.file_name",
        "value": [idFile]
        }
    }
    params = {
    "fields": fields,
    "filters": json.dumps(filters),
    "format": "TSV",
    "size": "1"
    }
    print("quering...%s"%idFile)
    response = rq.get(cases_endpt, params = params)
    #print(response.content.decode('utf-8'))
    r = response.content.decode("utf-8").split('\r')
    data = np.array(r[1].replace('\n','').split('\t'))
    data = data.reshape(1,len(data))
    return pd.DataFrame(data=data, columns=r[0].split('\t'), index=[0])

def makePie(df, level, c, whatToLookFor = ['disease_type']):
    fig = plt.figure(figsize=(60,15))
    ax = fig.subplots(1, len(whatToLookFor))
    for i,lookFor in enumerate(whatToLookFor):
        try:
            datatotestarr = df[lookFor].values
        except:
            datatotestarr = df[lookFor]
        utype, counts = np.unique(datatotestarr, return_counts=True)
        total = len(datatotestarr)
        try:
            labels = ['\n'.join(wrap(str(l), 20)) for l in utype]
        except:
            labels = utype
        ax[i].set_title(lookFor, fontsize=44)
        patches, texts, autotexts = ax[i].pie(counts,
                                              labels=labels,
                                              autopct=lambda p: '#:%.0f'%(p * total / 100),
                                              textprops={'fontsize':30, 'color':'white', 'wrap':True})
        for t in texts:
            t.set_fontsize(24)
            t.set_wrap(True)
            t.set_color('black')
    fig.savefig("cluster_pie_level_%d_cluster_%d.png"%(level, c))

def makeTopicPie(df, level, whatToLookFor = ['disease_type']):
    fig = plt.figure(figsize=(60,15))
    ax = fig.subplots(1, len(whatToLookFor))
    for i,lookFor in enumerate(whatToLookFor):
        datatotestarr = df[lookFor].values
        utype, counts = np.unique(datatotestarr, return_counts=True)
        total = len(datatotestarr)
        try:
            labels = ['\n'.join(wrap(str(l), 20)) for l in utype]
        except:
            labels = utype
        ax[i].set_title(lookFor, fontsize=44)
        patches, texts, autotexts = ax[i].pie(counts,
                                              labels=labels,
                                              autopct=lambda p: '#:%.0f'%(p * total / 100),
                                              textprops={'fontsize':30, 'color':'white', 'wrap':True})
        for t in texts:
            t.set_fontsize(24)
            t.set_wrap(True)
            t.set_color('black')
    fig.savefig("topic_pie_level_%d.png"%level)

[docs]def queryFiles(files):
    """
    Get infor for a list of files
    
    :param list files: list of TCGA-ids
    """
    df = pd.DataFrame(columns=fields.split(','))
    for i,f  in enumerate(files):
        df = df.append(queryFile(f), ignore_index=True, sort=True)
    #print(df.head())
    return df


[docs]def get_tcga_tissue(sample):
    """
    Get primary_site of tcga sample
    
    :param str sample: sample id
    """
    samples = pd.read_csv("/Users/filippo/Developer/tesi/results/fpkm_all/files.dat", index_col=[0], header=0)
    for fullsample in samples.index.values:
        if sample in fullsample:
            return samples.loc[fullsample,:]