###
# Purpose:
#
# Generates the heatmap files for GBM analysis runs. The files are the following:
# transcripts_for_genes.csv, genes_for_transcripts.csv, gene_fpkm_table.csv, transcript_fpkm_table.csv
#
# Usage:
#
# python generate_gbm_heatmap.py input.json
#
# Input:
#
# input.json
#
# {
# "transcripts_for_genes_output":"/tmp/transcripts_for_genes.csv",
# "genes_for_transcripts_output":"/tmp/genes_for_transcripts.csv",
# "gene_fpkm_table_output":"/tmp/gene_fpkm_table.csv",
# "transcript_fpkm_table_output":"/tmp/transcript_fpkm_table.csv",
# "columns_samples_output":"/tmp/columns_samples.csv",
# "analysis_run_records":"/tmp/analysis_run_records.json",
# "sample_metadata_records":"/tmp/sample_metadata_records.json"
# }
#
# Output:
#
# Creates the specified csv files
import json
import sys
import numpy as np
import pandas as pd
[docs]def create_transcripts_for_genes(analysis_run_gene_file):
""" Creates a list that contains the associated transcript for each gene sorted by entrez_id """
transcripts_for_genes = np.genfromtxt(analysis_run_gene_file["analysis_run_gene_path"], usecols=[0, 1], skip_header=1,
dtype='str').tolist()
data = sorted(transcripts_for_genes, key=lambda row: int(row[0]))
header = ['gene_id', 'transcript_id(s)']
data.insert(0, header)
data = pd.DataFrame(data)
return data
[docs]def create_genes_for_transcripts(analysis_run_transcript_file):
""" Creates a list that contains the associated gene for each transcript sorted alphabetically """
genes_for_transcripts = np.genfromtxt(analysis_run_transcript_file["analysis_run_transcript_path"], usecols=[0, 1]
, skip_header=1, dtype='str').tolist()
data = sorted(genes_for_transcripts, key=lambda row: row[0].lower())
header = ['transcript_id', 'gene_id']
data.insert(0, header)
data = pd.DataFrame(data)
return data
[docs]def create_gene_fpkm_table(analysis_run_records):
""" Creates a a matrix ("rows x columns = genes x samples") of fpkm gene expression values for each particular
(gene, sample) pair. Rows are sorted by entrez_id and columns are by rna_well_id """
gene_fpkm = []
rna_well_ids = []
for record in analysis_run_records:
gene_fpkm.append(np.genfromtxt(record["analysis_run_gene_path"], usecols=[-1]
, skip_header=1, dtype='str'))
rna_well_ids.append(record["rna_well_id"])
entrez_ids = np.genfromtxt(analysis_run_records[0]["analysis_run_gene_path"], usecols=[0], skip_header=1
, dtype='str').tolist()
entrez_ids_int = list(map(int, entrez_ids))
gene_fpkm_table = np.column_stack(gene_fpkm)
df = pd.DataFrame(gene_fpkm_table, columns=rna_well_ids, index=entrez_ids_int)
df = df.sort_index()
rna_well_ids_sorted = sorted(list(map(int, rna_well_ids)))
data = df[rna_well_ids_sorted]
return data
[docs]def create_transcript_fpkm_table(analysis_run_records):
""" Creates a a matrix ("rows x columns = transcripts x samples") of fpkm gene expression values for each particular
(transcript, sample) pair. Rows are sorted by transcript id and columns are by rna_well_id """
transcript_fpkm = []
rna_well_ids = []
for record in analysis_run_records:
transcript_fpkm.append(np.genfromtxt(record["analysis_run_transcript_path"], usecols=[-1]
, skip_header=1, dtype='str'))
rna_well_ids.append(record["rna_well_id"])
transcript_ids = np.genfromtxt(analysis_run_records[0]["analysis_run_transcript_path"], usecols=[0], skip_header=1
, dtype='str').tolist()
transcript_fpkm_table = np.column_stack(transcript_fpkm)
df = pd.DataFrame(transcript_fpkm_table, columns=rna_well_ids, index=transcript_ids)
df = df.sort_index()
rna_well_ids_sorted = sorted(list(map(int, rna_well_ids)))
data = df[rna_well_ids_sorted]
return data
[docs]def main():
input_file = sys.argv[1]
data = json.load(open(input_file))
transcripts_for_genes_output = data["transcripts_for_genes_output"]
genes_for_transcripts_output = data["genes_for_transcripts_output"]
gene_fpkm_table_output = data["gene_fpkm_table_output"]
transcript_fpkm_table_output = data["transcript_fpkm_table_output"]
columns_samples_output = data["columns_samples_output"]
analysis_run_records = json.load(open(data["analysis_run_records"]))
sample_metadata_records = json.load(open(data["sample_metadata_records"]))
create_transcripts_for_genes(analysis_run_records["analysis_run_records"][0]).to_csv(transcripts_for_genes_output,
index=False, header=False)
create_genes_for_transcripts(analysis_run_records["analysis_run_records"][0]).to_csv(genes_for_transcripts_output,
index=False, header=False)
create_gene_fpkm_table(analysis_run_records["analysis_run_records"]).to_csv(gene_fpkm_table_output)
create_transcript_fpkm_table(analysis_run_records["analysis_run_records"]).to_csv(transcript_fpkm_table_output)
create_sample_metadata(sample_metadata_records).to_csv(columns_samples_output, index=False)
if __name__ == '__main__':
main()