############################################################################################################### # # # A collection of scripts for pre-processing data prior to running Gene Set Enrichment Analysis (GSEA) and # # converting the results into a representation for Latent Dirichlet Allocation (LDA). # # The functions here are intended to be used with the Broad Institute's GSEA software package, available from # # # # http://www.broadinstitute.org/gsea/ , # # # # along with M. Steyvers and T. Griffiths Topic Modeling Toolbox, available from # # # # http://psiexp.ss.uci.edu/research/programs_data/toolbox.htm . # # # # For questions or troubleshooting, please contact José Caldas at jose.caldas@tkk.fi . # # # ############################################################################################################### ################################ # # # Preparing the input for GSEA.# # # ################################ # # Create a formatted expression file from an expression matrix. # INPUT # - exprMat: A genes (rows) * conditions (columns) expression matrix. The matrix is assumed to # contain gene labels as its rows names and condition labels as its column names. # - gctFn: File to write the results to. # gsea.write.gct <- function(exprMat, gctFn){ nGenes = nrow(exprMat) nConds = ncol(exprMat) write("#1.2", file = gctFn, append = F) # All .gct files require this dummy header line. write(paste(nGenes, nConds, sep = "\t"), file = gctFn, append = T) write(paste("Name", "Description", paste(colnames(exprMat), collapse = "\t"), sep = "\t"), file = gctFn, append = T) # The second column of the .gct file, "Description", is filled out with "na"'s. rownames(exprMat) = paste(rownames(exprMat), "na", sep = "\t") # Append "\tna" to every gene name. write.table(exprMat, file = gctFn, append = T, quote = F, sep = "\t", na = "", row.names = T, col.names = F) } # # Create a phenotype label file. # INPUT # - classLabels: An array of single-character labels. There should be 2 and only 2 unique labels. # - clsFn: Output file. # gsea.write.cls <- function(classLabels, clsFn){ nConds = length(classLabels) write(paste(nConds, "2", "1", sep = " "), file = clsFn, append = F) # The dummy "2 1" is always required. uniqueLabels = unique(classLabels) firstLabel = classLabels[1] # The second line in the file indicates what labels are there. The first label in this line should be # the same label as the first label in the third line. Otherwise, GSEA will write report files that use one # label to refer to the other one. This is a messy property of the format of .cls files. # The if-else below deals with that. Of course, this problem is avoided if the second and third lines # use disparate labelling symbols for the phenotypes (e.g. line 2 uses "A B" while line 3 uses "0 1"). # However, this function does not give the option to use different symbols. if(uniqueLabels[1] == firstLabel) write(paste("#", uniqueLabels[1], uniqueLabels[2], sep = " "), file = clsFn, append = T) else write(paste("#", uniqueLabels[2], uniqueLabels[1], sep = " "), file = clsFn, append = T) write(classLabels, file = clsFn, append = T, sep = " ", ncolumns = nConds) } # # Write a script file to call java GSEA with a default parameterization. # Assumes probesets have already been converted to HUGO gene symbols. # INPUT # - shFn: Output script filename. # - gseaJar: GSEA Jar filename. # - gctFn: gct filename. See the function 'gsea.write.gct'. # - clsFn: cls filename. See the function 'gsea.write.cls'. # - gmtFn: Gene set filename, downloadable from the MSigDb web page. # - studyDir: Directory to store the results. # - studyLabel: Study label used for naming reports. # gsea.write.sh.script <- function(shFn, gseaJar, gctFn, clsFn, gmtFn, studyDir, studyLabel) { str = paste("java -cp ", gseaJar, " -Xmx2048m xtools.gsea.Gsea ", sep = "") str = paste(str, "-res ", gctFn, " ", sep = "") str = paste(str, "-cls ", clsFn, " ", sep = "") str = paste(str, "-gmx ", gmtFn, " ", sep = "") str = paste(str, "-out ", studyDir, " ", sep = "") str = paste(str, "-rpt_label ", studyLabel, " ", sep = "") str = paste(str, "-norm meandiv -nperm 1000 -scoring_scheme weighted ") str = paste(str, "-collapse false -mode Max_probe ") str = paste(str, "-permute phenotype -rnd_type no_balance ") str = paste(str, "-metric Signal2Noise -sort real -order descending -make_sets true ") str = paste(str, "-plot_top_x 50 -rnd_seed timestamp -set_max 500 -set_min 15 -zip_report false -gui false ") write(str, file = shFn, append = F) system(paste('chmod +x', shFn, sep = " ")) } # # Write a script file to call java GSEA with pre-ranked data and a default parameterization. # Assumes probesets have already been converted to HUGO gene symbols. # Permutation has to be gene-wise (information about the conditions has already been lost). # INPUT # - shFn: Output script filename. # - gseaJar: GSEA Jar filename. # - gmtFn: Gene set filename, downloadable from the MSigDb web page. # - rnkFn: File with rankings. Is tab-delimited, has two columns (\t), # and includes a header. # - studyDir: Directory to store the results. # - studyLabel: Study label used for naming reports. # gsea.preranked.write.sh.script <- function(shFn, gseaJar, gmtFn, rnkFn, studyDir, studyLabel) { # Setting virtual memory usage to 500m, because otherwise java crashes. str = paste("java -cp ", gseaJar, " -Xmx2048m xtools.gsea.GseaPreranked ", sep = "") str = paste(str, "-gmx ", gmtFn, " ", sep = "") str = paste(str, "-rnk ", rnkFn, " ", sep = "") str = paste(str, "-rpt_label ", studyLabel, " ", sep = "") str = paste(str, "-out ", studyDir, " ", sep = "") str = paste(str, "-collapse false -permute gene_set", " ", sep = "") # Don't use a chip file and randomize gene sets. str = paste(str, "-norm meandiv -nperm 1000 -scoring_scheme weighted", " ", sep = "") str = paste(str, "-make_sets true -plot_top_x 50 -rnd_seed timestamp", " ", sep = "") str = paste(str, "-set_max 500 -set_min 15 -zip_report false -gui false") write(str, file = shFn, append = F) system(paste('chmod +x', shFn, sep = " ")) } #################################### # # # Processing the output from GSEA. # # # #################################### # # Write a summary file for a GSEA run. # Each line in the tab-delimited summary file will contain four columns: gene set label, +-1 # indicating over or under-expression of the gene set in the phenotype mentioned in labels[1], # genes in the leading edge subset (comma-separated), # and genes not in the leading edge subset but probed in the array platform (comma-separated). # INPUT # - reportDir: Directory where the GSEA results are stored. # - label: A 2-element array with the phenotype labels. # - nGeneSets: Top-scoring gene sets to keep (w.r.t. absolute value of normalized enrichment score (NES)). # - summaryFn: Filename for the output summary file. # OUTPUT: # True if everything went well. False if at least one of the report files could not be found. # gsea.write.summary.file <- function(reportDir, labels, nGeneSets, summaryFn) { report_l1 = system(paste("ls ", reportDir, "gsea_report_for_", labels[1], "_*.xls", sep = ""), intern = T)[1] report_l2 = system(paste("ls ", reportDir, "gsea_report_for_", labels[2], "_*.xls", sep = ""), intern = T)[1] if(is.na(report_l1) || is.na(report_l2)){ return(F) # This means that the report files could not be found. } report = rbind(read.table(report_l1, header = T, sep = "\t"), read.table(report_l2, header = T, sep = "\t")) report = report[, c("NAME", "NES")] # We are only interested in the "name" and "nes" fields. report = report[order(abs(report[,"NES"]), decreasing = T),] # Order data frame in decreasing order of abs(NES). report = report[1:min(nGeneSets, nrow(report)),] # Only the top-scoring gene sets are kept. write(paste("Gene set\tOver-expression in phenotype ", labels[1], "\tGenes in LES\tOther genes", sep = ""), file = summaryFn, append = F) for(i in 1:nrow(report)){ gs = report[i, "NAME"] # Gene set label. # 'reportFn' is a detailed report file for the current gene set. reportFn = system(paste("ls ", reportDir, gs, ".xls", sep = ""), intern = T, ignore.stderr = T) if(length(reportFn) == 0){ # Couldn't find report file. print(paste("Can't find report file for ", gs, " in ", reportDir, sep = "")) next } if(report[i, "NES"] >= 0) sign = "1" # This means the gene set was over-expressed in phenotype A. else sign = "-1" # This means the gene set was under-expressed in phenotype A. detailedReport = read.table(reportFn, header = T, sep = "\t")[,c("PROBE","CORE.ENRICHMENT")] coreGenes = paste(as.character(detailedReport[which(detailedReport[,"CORE.ENRICHMENT"] == "Yes"), "PROBE"]), collapse = ",") otherGenes = paste(as.character(detailedReport[which(detailedReport[,"CORE.ENRICHMENT"] == "No"), "PROBE"]), collapse = ",") write(paste(gs, sign, coreGenes, otherGenes, sep = "\t"), file = summaryFn, append = T) } return(T) } ############################################################# # # # Converting summary files to an LDA-amenable input format. # # # ############################################################# # # Write a tab-delimited file, where the first column is a gene set label and # the second column is an integer. Indexing starts at 1, not 0. # Gene sets that do not appear in any summary file do not receive an id. # INPUT # - outputFn: File to be written. Contents are overwritten. # - summaryDir: Directory where all summary files are kept. # write.gene.set.id.file <- function(outputFn, summaryDir) { geneSets = c() summaries = system(paste('ls ', summaryDir), intern = T) for(summaryFn in summaries){ lines = readLines(paste(summaryDir, summaryFn, sep = "")) lines = lines[2:length(lines)] # Ignore header line. for(line in lines){ # For each differential gene set. line = strsplit(line, split = "\t") geneSetLabel = line[[1]][1] if(! (geneSetLabel %in% geneSets)) geneSets = c(geneSets, geneSetLabel) } } # Indexing starts at 1, not 0 (see "id" variable below). df = data.frame(geneSet = geneSets, id = 1:length(geneSets)) write("Gene set\tId", file = outputFn, append = F) write.table(df, file = outputFn, append = T, quote = F, sep = "\t", row.names = F, col.names = F) } # # Write a two-column tab-delimited file, where the 1st column is a study label and the 2nd column is an integer. # Indexing starts at 1, not 0. # The study label will be the same as the summary filename (apart from the .txt extension). # INPUT # - outputFn: Filename for output tab-delimited file. # - summaryDir: Directory where all summary files are stored. # write.study.id.file <- function(outputFn, summaryDir) { write("Study\tId", file = outputFn, append = F) summaries = system(paste('ls ', summaryDir), intern = T) # Summary filenames. id = 1 # Indexing starts at 1, not 0. for(summaryFn in summaries){ studyLabel = substr(summaryFn, 1, nchar(summaryFn) - 4) write(paste(studyLabel, id, sep = "\t"), file = outputFn, append = T) id = id + 1 } } # # Convert the summary files to a representation suitable for the LDA implementation of the Topic Modeling Toolbox. # Recall that in the current application of LDA, GSEA runs are regarded as documents, while gene sets # are regarded as words taken from a common vocabulary. The number of occurences of a word in a document amounts # to the number of genes in the leading edge subset of the corresponding gene set in that GSEA run. # Notice that the sign of differential expression is ignored. # Caveat: The document and word indices start at 1, not 0. # INPUT # - geneSetIdFn: Filename for tab-delimited file mapping gene set labels to unique identifiers. # See the function 'write.gene.set.id.file'. # - studyIdFn: Filename for tab-delimited file mapping study id labels to unique integer identifiers. # See the function 'write.study.id.file'. # - summaryDir: Directory where all summary files are stored. See the function 'gsea.write.summary.file'. # - ldaWordFn: Filename for the output file that will contain "word" indices, one token per line. # - ldaDocFn: Filename for the output file that will contain "document" indices, one token per line. # write.LDA.input.files <- function(geneSetIdFn, studyIdFn, summaryDir, ldaWordFn, ldaDocFn) { geneSetDf = read.delim(geneSetIdFn, row.names = 1) # Gene set labels are row names. Ignore header by default. studyDf = read.delim(studyIdFn, row.names = 1) # Study labels are row names. Ignore header by default. summaries = system(paste('ls', summaryDir, sep = " "), intern = T) # Summary filenames. unlink(ldaWordFn) unlink(ldaDocFn) for(summaryFn in summaries){ studyLabel = substr(summaryFn, 1, nchar(summaryFn) - 4) # Discard ".txt" extension. docId = studyDf[studyLabel, 1] # Integer id for current GSEA run / document. lines = readLines(paste(summaryDir, summaryFn, sep = "")) # Read current summary file. lines = lines[2:length(lines)] # Ignore header line. for(line in lines){ # For each differential gene set. line = strsplit(line, split = "\t") geneSetLabel = line[[1]][1] coreGenes = line[[1]][3] nCoreGenes = length(strsplit(coreGenes, split = ",")[[1]]) geneSetId = geneSetDf[geneSetLabel, 1] wordStr = as.character(rep(geneSetId, nCoreGenes)) docStr = as.character(rep(docId, nCoreGenes)) write(paste(wordStr, collapse = "\n"), file = ldaWordFn, append = T) write(paste(docStr, collapse = "\n"), file = ldaDocFn, append = T) } } }