Gene disease patterns

Workflow to find patterns associated to the progressive stages of a disease

The workflow consists on two main steps:

  1. GAMMA

  2. SOM

Step 0: Read expression matrix and the disease stages

Example reading a table downloaded from GEO (i.e. Alzheimer’s dataset)

GSE28146 <- read.table("GSE28146_series_matrix.txt", skip = 69, nrow=54746-69-1, stringsAsFactors=FALSE)
GSE28146 <- as.matrix(GSE28146) 
colnames(GSE28146) <- GSE28146[1,]
rownames(GSE28146) <- GSE28146[,1]
GSE28146 <- GSE28146[,-1]
GSE28146 <- GSE28146[-1,]
mode(GSE28146) <- "numeric"
dim(GSE28146)
## [1] 54675    30

Optional: map the probesets to genes.

(In the original analysis, the probesets were mapped to genes AFTER calculating Gamma. Here, the expression matrix rows are renamed for simplicity in the workflow, but beware of duplicated row names)

library(hgu133plus2.db)
gSymbols <- select(hgu133plus2.db, keys=rownames(GSE28146), columns="SYMBOL", keytype="PROBEID")
## Warning in .generateExtraRows(tab, keys, jointype): 'select' resulted in
## 1:many mapping between keys and return rows
gSymbolsUnique <- gSymbols[!is.na(gSymbols[,2]),] 
gSymbolsUnique <- gSymbolsUnique[table(gSymbolsUnique[,1])==1,] 
gSymbolsUnique <- sort(setNames(gSymbolsUnique[,2], gSymbolsUnique[,1]))
missing <- rownames(GSE28146)[!rownames(GSE28146) %in% names(gSymbolsUnique)]
GSE28146 <- GSE28146[names(gSymbolsUnique),]
rownames(GSE28146) <- unname(gSymbolsUnique)
dim(GSE28146)
## [1] 44979    30
GSE28146[100:105,1:6]
##        GSM697308 GSM697309 GSM697310 GSM697311 GSM697312 GSM697313
## ABCC10     499.1     268.7     274.7     731.8      66.8     195.8
## ABCC11     252.6      49.5      52.3      27.7      56.5     151.6
## ABCC11      35.1      25.4      51.0      77.8      39.8      14.1
## ABCC12     163.7      16.9     109.7     235.2     479.1     124.1
## ABCC13     828.1      64.2      79.6     694.3      16.1     300.2
## ABCC13     135.9      74.7     183.4      23.9     248.4       5.7
GSE28146_pData <- readLines("GSE28146_series_matrix.txt")[33:34]
GSE28146_pData <- do.call(cbind, unname(sapply(gsub("\"","", GSE28146_pData), function(x) strsplit(x, '\t'))))
colnames(GSE28146_pData) <- c("SampleTitle", "GeoID")
GSE28146_pData <- cbind(GSE28146_pData[-1,], Stages=sapply(GSE28146_pData[-1,1], function(x) strsplit(as.character(x), " ", fixed=TRUE)[[1]][1]))
rownames(GSE28146_pData) <- GSE28146_pData[,"GeoID"]
head(GSE28146_pData)
##           SampleTitle    GeoID       Stages   
## GSM697308 "Control 976"  "GSM697308" "Control"
## GSM697309 "Control 1003" "GSM697309" "Control"
## GSM697310 "Control 1008" "GSM697310" "Control"
## GSM697311 "Control 1012" "GSM697311" "Control"
## GSM697312 "Control 1015" "GSM697312" "Control"
## GSM697313 "Control 1018" "GSM697313" "Control"

Step 1: GAMMA

Calculate “GAMMA rank correlation” to identify genes whose expression correlates with a categorical variable that represents the stages of the disease.

Set a value for exprMatrix and stages. This example is run only on 50 probesets/genes:

exprMatrix <- GSE28146[c(1:45, 9114, 17394, 18767, 31423, 39619),]
dim(exprMatrix)
## [1] 50 30

(Make sure the levels of the factor is correct)

stages <- factor(GSE28146_pData[colnames(exprMatrix),"Stages"])
table(stages)
## stages
##   Control Incipient  Moderate    Severe 
##         8         7         8         7
# cbind(stages, as.numeric(stages))

Calculate Gamma correlation. It will take several minutes/hours depending on the computer and dataset, but can be easily parallelized.

library(rococo)
system.time(gammaResult <- apply(exprMatrix, 1, function(geneExpr) rococo.test(as.numeric(stages), geneExpr)))
##    user  system elapsed 
##   0.754   0.000   0.754
save(gammaResult, file="resultGAMMA.RData")

This output contains the main correlation information. It can be transformed into a table and formatted to suit the user needs. In this example we extract the Gamma value and the estimated p-value. Then the p-value is adjusted for multiple testing, and the final table sorted decreasingly by absolute value of Gamma:

gammaTable <- t(sapply(gammaResult, function(x) c(gamma=x@sample.gamma, p.value=x@p.value)))
gammaTable <- cbind(gammaTable, pAdj=p.adjust(p=gammaTable[,"p.value"], method = "fdr", n=nrow(gammaTable))) 
gammaTable <- gammaTable[order(abs(gammaTable[,"gamma"]), decreasing=T),] # Sort
significantGamma <- rownames(gammaTable)[which(gammaTable[,"pAdj"]<0.05)]
par(mfrow=c(2,2))
for(gene in significantGamma[1:4]) boxplot(split(exprMatrix[gene,], stages[colnames(exprMatrix)]), main=gene)

Step 2: SOM

Apply “Self Organizing Map” (SOM) to cluster the genes according to their progressive profiles and identify specific patterns.

Normalize rows

exprsNorm <- t(apply(exprMatrix[significantGamma,, drop=FALSE], 1, function(g) (g - mean(g))/sd(g)))
exprsNorm[1:5,1:5]
##          GSM697308  GSM697309  GSM697310  GSM697311  GSM697312
## DEFB125 -0.7438202 -0.5916006 -0.6191576 -0.6650860 -0.5391111
## IL1B    -0.7720754  0.9401308  0.8371785  1.8057295  1.0041011
## KIF1B   -1.3320185 -1.1406926 -0.3352262 -2.3645460 -1.6480489
## PTPN13  -1.2253220  0.4706501 -0.7451533 -1.7777167 -1.3905054
## TMC7    -0.4019777 -0.8889182 -0.4411454 -0.6087697 -0.9619427

Sort genes increasingly/decreasingly (according to Gamma)

gSplit <- split(significantGamma, gammaTable[significantGamma,"gamma"]>0)

exprUp <- t(apply(exprsNorm[gSplit$"TRUE", , drop=FALSE], 1, sort))
exprDown <- t(apply(exprsNorm[gSplit$"FALSE", , drop=FALSE], 1, function(x) sort(x, decreasing=TRUE)))
exprMerged <- rbind(exprUp, exprDown)

Run SOM. In the paper examples we use a 3x3 rectangular grid.

library(kohonen)
set.seed(2)
somOutput <- som(exprMerged, somgrid(xdim=3, ydim=3, topo = "rectangular"))
save(somOutput, file="resultSOM.RData")

Explore the SOM output (i.e. patterns and number of genes per pattern)

load("resultSOM.RData")
par(mfrow = c(2, 2))
plot(somOutput)
som.hc <- cutree(hclust(dist(somOutput$codes)), 5)
add.cluster.boundaries(somOutput, som.hc)
plot(somOutput, type="count")
plot(somOutput, type="quality")

patterns <- cbind(gen=rownames(somOutput$data), group=somOutput$unit.classif)
patterns <- split(patterns[,1], patterns[,2])
sapply(patterns, length)
##  1  3  4  6  7  9 
## 21 46 38 32 12 14

Assign name/label/order to the patterns. The SOM cluster numbering starts in the botton-left corner:

patternsNamed <- list(p1=patterns$"7",
                       p3=patterns$"3",
                       p2=patterns$"1",
                       p4=patterns$"9",
                       intUp=patterns$"4",
                       intDw=patterns$"6")

Run using rococo_1.1.2 and kohonen_2.0.18