########################################################################################## ## ## ## R-scripts for the manuscript ## ## ## ## Comparison of scores for bimodality of gene expression ## ## distributions and genome-wide evaluation of the prognostic ## ## relevance of high-scoring genes ## ## ## ## by ## ## Birte Hellwig, Jan G. Hengstler, Marcus Schmidt, ## ## Mathias C. Gehrmann, Wiebke Schormann and Jörg Rahnenführer ## ## ## ########################################################################################## # bimodality.scores returns a list containing the values of the bimodality scores VRS, WVRS and # the dip statistic, the partitioning obtained by k-means is also given # expr.values = a vector or matrix containing gene expression values, if a matrix the scores are # calculated for each row of the matrix library(diptest) bimodality.scores <- function(expr.values){ if(is.null(dim(expr.values))){ dip.stat <- dip(expr.values) n <- length(expr.values) # k-means with minimum and maximum observed expression value as initial centers cl.res <- kmeans(expr.values, 2, centers=c(min(expr.values), max(expr.values))) # calculating VRS and WVRS vrs <- (sum(cl.res$withinss) / (n-1)) / var(expr.values) wvrs <- (sum(cl.res$withinss / cl.res$size) / 2) / ( ((n-1)/n) * var(expr.values)) groups <- cl.res$cluster } else{ n <- ncol(expr.values) ngenes <- nrow(expr.values) dip.stat <- NULL vrs <- NULL wvrs <- NULL groups <- matrix(ncol=n, nrow=ngenes) for(i in 1:ngenes){ vec <- expr.values[i,] cl.res <- kmeans(vec, 2, centers=c(min(vec), max(vec))) dip.stat[i] <- dip(vec) vrs[i] <- (sum(cl.res$withinss) / (n-1)) / var(vec) wvrs[i] <- (sum(cl.res$withinss / cl.res$size) / 2) / ( ((n-1)/n) * var(vec)) groups[i,] <- cl.res$cluster} rownames(groups) <- rownames(expr.values) colnames(groups) <- colnames(expr.values)} invisible(list(VRS=vrs, WVRS=wvrs, dip=dip.stat, cluster=groups))} # number of patients np <- ncol(expr.values) # number of genes ng <- nrow(expr.values) ## Kurtosis ## library(e1071) K <- apply(expr.values, 1, kurtosis, type=2) ## Likelihood Ratio ## # Model-based clustering -> Likelihood Ratio library(mclust) # log Likelihood for 1 cluster l1 <- apply(expr.values, 1, function(x) Mclust(x, G = 1)$loglik) # log Likelihood for 2 cluster l2 <- numeric(length = ng) # matrix containing the clustering results VMCL <- matrix(nrow = ng, ncol = np) for(i in 1:ng){ # fitting a mixed model with unequal variances res <- Mclust(expr.values[i,], G = 2, modelNames = "V") # if there is just one patient in the outlier group, a model with equal variances is fitted if(is.na(res$BIC)){ res2 <- Mclust(expr.values[i,], G = 2, modelNames = "E") l2[i] <- res2$loglik VMCL[i,] <- res2$classification} else{ l2[i] <- res$loglik VMCL[i,] <- res$classification}} # Likelihood Ratio LR <- exp(l2) / exp(l1) ## Bimodality Index ## library(ClassDiscovery) BI <- bimodalIndex(expr.values)$BI # Model-based clustering # two cluster with equal variances library(mclust) EMCL <- t(apply(expr.values, 1, function(x) Mclust(x, G = 2, modelNames = "E")$c)) ## outlier-sum statistic ## # standardized expression values exprs.mad <- (expr.values - apply(expr.values, 1, median) * matrix(1, nrow = ng, ncol = np)) / (apply(expr.values, 1, mad) * matrix(1, nrow = ng, ncol = np)) # 25th percentile q25 <- apply(exprs.mad, 1, function(x) quantile(x, 0.25)) # 75th percentile q75 <- apply(exprs.mad, 1, function(x) quantile(x, 0.75)) # interquartile range iqr <- q75 - q25 # calculating the outlier-sum statistics for positive outliers (w1) and negative outliers (w2) ind1 <- exprs.mad > q75 + iqr ind2 <- exprs.mad < q25 - iqr s1 <- rowSums(ind1) w1 <- rowSums(ind1 * exprs.mad) s2 <- rowSums(ind2) w2 <- rowSums(ind2 * exprs.mad) # outlier-sum statistic OS <- apply(abs(cbind(w1, w2)), 1, max) dec <- apply(abs(cbind(w1, w2)), 1, which.max) # defining the two groups os.groups <- matrix(ncol = np, nrow = ng) for(i in 1:ng){ if(dec[i]==1){ os.groups[i,] <- ind1[i,] + 1} else{ os.groups[i,] <- abs(ind2[i,] - 2)} } # function for calculating p-values and corresponding q-values of the logrank test # groups = vector or matrix containing group labels, if a matrix the p-value is calculated for each row # time = a vector of survival times # event = the status indicator, 0=alive or right censored, 1=dead library(survival) lr.test <- function(groups, time, event){ np <- length(time) surv <- function(groups){ if(sum(groups)==np | sum(groups)==2*np | sum(is.na(groups))==np){ lr.stat <- NA} else{ lr.stat <- survdiff(Surv(time, event) ~ groups)$chisq}} if(is.null(dim(groups))){ lr.stat <- surv(groups) lr.p <- 1-pchisq(lr.stat, df = 1) lr.q <- lr.p} else{ lr.stat <- apply(groups, 1, surv) lr.p <- 1-pchisq(lr.stat, df = 1) lr.q <- p.adjust(lr.p, "fdr")} invisible(list(p.value = lr.p, q.value = lr.q))}