The workflow consists on two main steps:
GAMMA
SOM
Example reading a table downloaded from GEO (i.e. Alzheimer’s dataset)
GSE28146 <- read.table("GSE28146_series_matrix.txt", skip = 69, nrow=54746-69-1, stringsAsFactors=FALSE)
GSE28146 <- as.matrix(GSE28146)
colnames(GSE28146) <- GSE28146[1,]
rownames(GSE28146) <- GSE28146[,1]
GSE28146 <- GSE28146[,-1]
GSE28146 <- GSE28146[-1,]
mode(GSE28146) <- "numeric"
dim(GSE28146)
## [1] 54675 30
Optional: map the probesets to genes.
(In the original analysis, the probesets were mapped to genes AFTER calculating Gamma. Here, the expression matrix rows are renamed for simplicity in the workflow, but beware of duplicated row names)
library(hgu133plus2.db)
gSymbols <- select(hgu133plus2.db, keys=rownames(GSE28146), columns="SYMBOL", keytype="PROBEID")
## Warning in .generateExtraRows(tab, keys, jointype): 'select' resulted in
## 1:many mapping between keys and return rows
gSymbolsUnique <- gSymbols[!is.na(gSymbols[,2]),]
gSymbolsUnique <- gSymbolsUnique[table(gSymbolsUnique[,1])==1,]
gSymbolsUnique <- sort(setNames(gSymbolsUnique[,2], gSymbolsUnique[,1]))
missing <- rownames(GSE28146)[!rownames(GSE28146) %in% names(gSymbolsUnique)]
GSE28146 <- GSE28146[names(gSymbolsUnique),]
rownames(GSE28146) <- unname(gSymbolsUnique)
dim(GSE28146)
## [1] 44979 30
GSE28146[100:105,1:6]
## GSM697308 GSM697309 GSM697310 GSM697311 GSM697312 GSM697313
## ABCC10 499.1 268.7 274.7 731.8 66.8 195.8
## ABCC11 252.6 49.5 52.3 27.7 56.5 151.6
## ABCC11 35.1 25.4 51.0 77.8 39.8 14.1
## ABCC12 163.7 16.9 109.7 235.2 479.1 124.1
## ABCC13 828.1 64.2 79.6 694.3 16.1 300.2
## ABCC13 135.9 74.7 183.4 23.9 248.4 5.7
GSE28146_pData <- readLines("GSE28146_series_matrix.txt")[33:34]
GSE28146_pData <- do.call(cbind, unname(sapply(gsub("\"","", GSE28146_pData), function(x) strsplit(x, '\t'))))
colnames(GSE28146_pData) <- c("SampleTitle", "GeoID")
GSE28146_pData <- cbind(GSE28146_pData[-1,], Stages=sapply(GSE28146_pData[-1,1], function(x) strsplit(as.character(x), " ", fixed=TRUE)[[1]][1]))
rownames(GSE28146_pData) <- GSE28146_pData[,"GeoID"]
head(GSE28146_pData)
## SampleTitle GeoID Stages
## GSM697308 "Control 976" "GSM697308" "Control"
## GSM697309 "Control 1003" "GSM697309" "Control"
## GSM697310 "Control 1008" "GSM697310" "Control"
## GSM697311 "Control 1012" "GSM697311" "Control"
## GSM697312 "Control 1015" "GSM697312" "Control"
## GSM697313 "Control 1018" "GSM697313" "Control"
Set a value for exprMatrix
and stages
. This example is run only on 50 probesets/genes:
exprMatrix <- GSE28146[c(1:45, 9114, 17394, 18767, 31423, 39619),]
dim(exprMatrix)
## [1] 50 30
(Make sure the levels of the factor is correct)
stages <- factor(GSE28146_pData[colnames(exprMatrix),"Stages"])
table(stages)
## stages
## Control Incipient Moderate Severe
## 8 7 8 7
# cbind(stages, as.numeric(stages))
Calculate Gamma correlation. It will take several minutes/hours depending on the computer and dataset, but can be easily parallelized.
library(rococo)
system.time(gammaResult <- apply(exprMatrix, 1, function(geneExpr) rococo.test(as.numeric(stages), geneExpr)))
## user system elapsed
## 0.754 0.000 0.754
save(gammaResult, file="resultGAMMA.RData")
This output contains the main correlation information. It can be transformed into a table and formatted to suit the user needs. In this example we extract the Gamma value and the estimated p-value. Then the p-value is adjusted for multiple testing, and the final table sorted decreasingly by absolute value of Gamma:
gammaTable <- t(sapply(gammaResult, function(x) c(gamma=x@sample.gamma, p.value=x@p.value)))
gammaTable <- cbind(gammaTable, pAdj=p.adjust(p=gammaTable[,"p.value"], method = "fdr", n=nrow(gammaTable)))
gammaTable <- gammaTable[order(abs(gammaTable[,"gamma"]), decreasing=T),] # Sort
significantGamma <- rownames(gammaTable)[which(gammaTable[,"pAdj"]<0.05)]
par(mfrow=c(2,2))
for(gene in significantGamma[1:4]) boxplot(split(exprMatrix[gene,], stages[colnames(exprMatrix)]), main=gene)
Normalize rows
exprsNorm <- t(apply(exprMatrix[significantGamma,, drop=FALSE], 1, function(g) (g - mean(g))/sd(g)))
exprsNorm[1:5,1:5]
## GSM697308 GSM697309 GSM697310 GSM697311 GSM697312
## DEFB125 -0.7438202 -0.5916006 -0.6191576 -0.6650860 -0.5391111
## IL1B -0.7720754 0.9401308 0.8371785 1.8057295 1.0041011
## KIF1B -1.3320185 -1.1406926 -0.3352262 -2.3645460 -1.6480489
## PTPN13 -1.2253220 0.4706501 -0.7451533 -1.7777167 -1.3905054
## TMC7 -0.4019777 -0.8889182 -0.4411454 -0.6087697 -0.9619427
Sort genes increasingly/decreasingly (according to Gamma)
gSplit <- split(significantGamma, gammaTable[significantGamma,"gamma"]>0)
exprUp <- t(apply(exprsNorm[gSplit$"TRUE", , drop=FALSE], 1, sort))
exprDown <- t(apply(exprsNorm[gSplit$"FALSE", , drop=FALSE], 1, function(x) sort(x, decreasing=TRUE)))
exprMerged <- rbind(exprUp, exprDown)
Run SOM. In the paper examples we use a 3x3 rectangular grid.
library(kohonen)
set.seed(2)
somOutput <- som(exprMerged, somgrid(xdim=3, ydim=3, topo = "rectangular"))
save(somOutput, file="resultSOM.RData")
Explore the SOM output (i.e. patterns and number of genes per pattern)
load("resultSOM.RData")
par(mfrow = c(2, 2))
plot(somOutput)
som.hc <- cutree(hclust(dist(somOutput$codes)), 5)
add.cluster.boundaries(somOutput, som.hc)
plot(somOutput, type="count")
plot(somOutput, type="quality")
patterns <- cbind(gen=rownames(somOutput$data), group=somOutput$unit.classif)
patterns <- split(patterns[,1], patterns[,2])
sapply(patterns, length)
## 1 3 4 6 7 9
## 21 46 38 32 12 14
Assign name/label/order to the patterns. The SOM cluster numbering starts in the botton-left corner:
patternsNamed <- list(p1=patterns$"7",
p3=patterns$"3",
p2=patterns$"1",
p4=patterns$"9",
intUp=patterns$"4",
intDw=patterns$"6")
Run using rococo_1.1.2 and kohonen_2.0.18