Source code for allensdk.internal.pipeline_modules.gbm.generate_gbm_heatmap

###
# Purpose:
#
#   Generates the heatmap files for GBM analysis runs. The files are the following:
#   transcripts_for_genes.csv, genes_for_transcripts.csv, gene_fpkm_table.csv, transcript_fpkm_table.csv
#
# Usage:
#
#   python generate_gbm_heatmap.py input.json
#
# Input:
#
#   input.json
#
#   {
#       "transcripts_for_genes_output":"/tmp/transcripts_for_genes.csv",
#       "genes_for_transcripts_output":"/tmp/genes_for_transcripts.csv",
#       "gene_fpkm_table_output":"/tmp/gene_fpkm_table.csv",
#       "transcript_fpkm_table_output":"/tmp/transcript_fpkm_table.csv",
#       "columns_samples_output":"/tmp/columns_samples.csv",
#       "analysis_run_records":"/tmp/analysis_run_records.json",
#       "sample_metadata_records":"/tmp/sample_metadata_records.json"
#   }
#
# Output:
#
#   Creates the specified csv files

import json
import sys
import numpy as np
import pandas as pd


[docs]def create_transcripts_for_genes(analysis_run_gene_file): """ Creates a list that contains the associated transcript for each gene sorted by entrez_id """ transcripts_for_genes = np.genfromtxt(analysis_run_gene_file["analysis_run_gene_path"], usecols=[0, 1], skip_header=1, dtype='str').tolist() data = sorted(transcripts_for_genes, key=lambda row: int(row[0])) header = ['gene_id', 'transcript_id(s)'] data.insert(0, header) data = pd.DataFrame(data) return data
[docs]def create_genes_for_transcripts(analysis_run_transcript_file): """ Creates a list that contains the associated gene for each transcript sorted alphabetically """ genes_for_transcripts = np.genfromtxt(analysis_run_transcript_file["analysis_run_transcript_path"], usecols=[0, 1] , skip_header=1, dtype='str').tolist() data = sorted(genes_for_transcripts, key=lambda row: row[0].lower()) header = ['transcript_id', 'gene_id'] data.insert(0, header) data = pd.DataFrame(data) return data
[docs]def create_gene_fpkm_table(analysis_run_records): """ Creates a a matrix ("rows x columns = genes x samples") of fpkm gene expression values for each particular (gene, sample) pair. Rows are sorted by entrez_id and columns are by rna_well_id """ gene_fpkm = [] rna_well_ids = [] for record in analysis_run_records: gene_fpkm.append(np.genfromtxt(record["analysis_run_gene_path"], usecols=[-1] , skip_header=1, dtype='str')) rna_well_ids.append(record["rna_well_id"]) entrez_ids = np.genfromtxt(analysis_run_records[0]["analysis_run_gene_path"], usecols=[0], skip_header=1 , dtype='str').tolist() entrez_ids_int = list(map(int, entrez_ids)) gene_fpkm_table = np.column_stack(gene_fpkm) df = pd.DataFrame(gene_fpkm_table, columns=rna_well_ids, index=entrez_ids_int) df = df.sort_index() rna_well_ids_sorted = sorted(list(map(int, rna_well_ids))) data = df[rna_well_ids_sorted] return data
[docs]def create_transcript_fpkm_table(analysis_run_records): """ Creates a a matrix ("rows x columns = transcripts x samples") of fpkm gene expression values for each particular (transcript, sample) pair. Rows are sorted by transcript id and columns are by rna_well_id """ transcript_fpkm = [] rna_well_ids = [] for record in analysis_run_records: transcript_fpkm.append(np.genfromtxt(record["analysis_run_transcript_path"], usecols=[-1] , skip_header=1, dtype='str')) rna_well_ids.append(record["rna_well_id"]) transcript_ids = np.genfromtxt(analysis_run_records[0]["analysis_run_transcript_path"], usecols=[0], skip_header=1 , dtype='str').tolist() transcript_fpkm_table = np.column_stack(transcript_fpkm) df = pd.DataFrame(transcript_fpkm_table, columns=rna_well_ids, index=transcript_ids) df = df.sort_index() rna_well_ids_sorted = sorted(list(map(int, rna_well_ids))) data = df[rna_well_ids_sorted] return data
[docs]def create_sample_metadata(sample_metadata_records): """ Creates a table of sample metadata sorted by rna_well_id """ df = pd.DataFrame.from_dict(sample_metadata_records, orient='columns') data = df.sort_values(by=['rna_well_id']).reset_index(drop=True) rna_well_id = data['rna_well_id'] data.drop(labels=['rna_well_id'], axis=1, inplace=True) data.insert(0, 'rna_well_id', rna_well_id) return data
[docs]def main(): input_file = sys.argv[1] data = json.load(open(input_file)) transcripts_for_genes_output = data["transcripts_for_genes_output"] genes_for_transcripts_output = data["genes_for_transcripts_output"] gene_fpkm_table_output = data["gene_fpkm_table_output"] transcript_fpkm_table_output = data["transcript_fpkm_table_output"] columns_samples_output = data["columns_samples_output"] analysis_run_records = json.load(open(data["analysis_run_records"])) sample_metadata_records = json.load(open(data["sample_metadata_records"])) create_transcripts_for_genes(analysis_run_records["analysis_run_records"][0]).to_csv(transcripts_for_genes_output, index=False, header=False) create_genes_for_transcripts(analysis_run_records["analysis_run_records"][0]).to_csv(genes_for_transcripts_output, index=False, header=False) create_gene_fpkm_table(analysis_run_records["analysis_run_records"]).to_csv(gene_fpkm_table_output) create_transcript_fpkm_table(analysis_run_records["analysis_run_records"]).to_csv(transcript_fpkm_table_output) create_sample_metadata(sample_metadata_records).to_csv(columns_samples_output, index=False)
if __name__ == '__main__': main()