##########################################################################################
##                                                                                      ##
##                             R-scripts for the manuscript                             ##
##                                                                                      ##
##                  Comparison of scores for bimodality of gene expression              ##
##                distributions and genome-wide evaluation of the prognostic            ##
##                             relevance of high-scoring genes                          ##
##                                                                                      ##
##                                          by                                          ##
##                      Birte Hellwig, Jan G. Hengstler, Marcus Schmidt,                ##
##                 Mathias C. Gehrmann, Wiebke Schormann and Jörg Rahnenführer          ##
##                                                                                      ##
##########################################################################################




# bimodality.scores returns a list containing the values of the bimodality scores VRS, WVRS and 
# the dip statistic, the partitioning obtained by k-means is also given
# expr.values = a vector or matrix containing gene expression values, if a matrix the scores are
# calculated for each row of the matrix


library(diptest)

bimodality.scores <- function(expr.values){


    if(is.null(dim(expr.values))){
        dip.stat <- dip(expr.values)
        n <- length(expr.values)
        # k-means with minimum and maximum observed expression value as initial centers
        cl.res <- kmeans(expr.values, 2, centers=c(min(expr.values), max(expr.values)))

        # calculating VRS and WVRS
        vrs <- (sum(cl.res$withinss) / (n-1)) / var(expr.values)
        wvrs <- (sum(cl.res$withinss / cl.res$size) / 2) / ( ((n-1)/n) * var(expr.values))
        groups <- cl.res$cluster
        }
    else{
        n <- ncol(expr.values)
        ngenes <- nrow(expr.values)
        dip.stat <- NULL
        vrs <- NULL
        wvrs <- NULL
        groups <- matrix(ncol=n, nrow=ngenes)
        for(i in 1:ngenes){
            vec <- expr.values[i,]
            cl.res <- kmeans(vec, 2, centers=c(min(vec), max(vec))) 
            dip.stat[i] <- dip(vec)
            vrs[i] <- (sum(cl.res$withinss) / (n-1)) / var(vec)
            wvrs[i] <- (sum(cl.res$withinss / cl.res$size) / 2) / ( ((n-1)/n) * var(vec))
            groups[i,] <- cl.res$cluster}
            rownames(groups) <- rownames(expr.values)
            colnames(groups) <- colnames(expr.values)}
    invisible(list(VRS=vrs, WVRS=wvrs, dip=dip.stat, cluster=groups))}




# number of patients
np <- ncol(expr.values)

# number of genes
ng <- nrow(expr.values)


## Kurtosis ##

library(e1071)
K <- apply(expr.values, 1, kurtosis, type=2)


## Likelihood Ratio ##

# Model-based clustering -> Likelihood Ratio

library(mclust)

# log Likelihood for 1 cluster
l1 <- apply(expr.values, 1, function(x) Mclust(x, G = 1)$loglik)

# log Likelihood for 2 cluster
l2 <- numeric(length = ng)

# matrix containing the clustering results
VMCL <- matrix(nrow = ng, ncol = np)

for(i in 1:ng){
    # fitting a mixed model with unequal variances
    res <- Mclust(expr.values[i,], G = 2, modelNames = "V")
    # if there is just one patient in the outlier group, a model with equal variances is fitted
    if(is.na(res$BIC)){
        res2 <- Mclust(expr.values[i,], G = 2, modelNames = "E")
        l2[i] <- res2$loglik
        VMCL[i,] <- res2$classification}
    else{
        l2[i] <- res$loglik
        VMCL[i,] <- res$classification}}

    
# Likelihood Ratio
LR <- exp(l2) / exp(l1)


## Bimodality Index ##

library(ClassDiscovery)
BI <- bimodalIndex(expr.values)$BI

# Model-based clustering
# two cluster with equal variances

library(mclust)
        
EMCL <- t(apply(expr.values, 1, function(x) Mclust(x, G = 2, modelNames = "E")$c))



## outlier-sum statistic ##

# standardized expression values

exprs.mad <- (expr.values - apply(expr.values, 1, median) * matrix(1, nrow = ng, ncol = np)) / 
(apply(expr.values, 1, mad) * matrix(1, nrow = ng, ncol = np))

# 25th percentile
q25 <- apply(exprs.mad, 1, function(x) quantile(x, 0.25))

# 75th percentile
q75 <- apply(exprs.mad, 1, function(x) quantile(x, 0.75))

# interquartile range
iqr <- q75 - q25

# calculating the outlier-sum statistics for positive outliers (w1) and negative outliers (w2)
ind1 <- exprs.mad > q75 + iqr
ind2 <- exprs.mad < q25 - iqr
s1 <- rowSums(ind1)
w1 <- rowSums(ind1 * exprs.mad)
s2 <- rowSums(ind2)
w2 <- rowSums(ind2 * exprs.mad)

# outlier-sum statistic
OS <- apply(abs(cbind(w1, w2)), 1, max)
dec <- apply(abs(cbind(w1, w2)), 1, which.max)

# defining the two groups
os.groups <- matrix(ncol = np, nrow = ng)
for(i in 1:ng){
    if(dec[i]==1){
        os.groups[i,] <- ind1[i,] + 1}
    else{
        os.groups[i,] <- abs(ind2[i,] - 2)}
        }        


# function for calculating p-values and corresponding q-values of the logrank test
# groups = vector or matrix containing group labels, if a matrix the p-value is calculated for each row
# time = a vector of survival times
# event = the status indicator, 0=alive or right censored, 1=dead

library(survival)

lr.test <- function(groups, time, event){
    np <- length(time)
    surv <- function(groups){
        if(sum(groups)==np | sum(groups)==2*np | sum(is.na(groups))==np){
            lr.stat <- NA}
        else{
            lr.stat <- survdiff(Surv(time, event) ~ groups)$chisq}}
    if(is.null(dim(groups))){
        lr.stat <- surv(groups)
        lr.p <- 1-pchisq(lr.stat, df = 1)
        lr.q <- lr.p}
    else{
        lr.stat <- apply(groups, 1, surv)
        lr.p <- 1-pchisq(lr.stat, df = 1)
        lr.q <- p.adjust(lr.p, "fdr")}
    invisible(list(p.value = lr.p, q.value = lr.q))}