#########################################################################################
##                                                                                     ##
##                             R-scripts for the manuscript                            ##
##                                                                                     ##
##  Retention Time Alignment Algorithms for LC/MS Data must consider Nonlinear Shifts  ##
##                                                                                     ##
##                                          by                                         ##
##         Katharina Podwojski, Arno Fritsch, Daniel C. Chamrad, Wolfgang Paul,        ##
##     Barbara Sitek, Kai Stühler, Petra Mutzel, Christian Stephan, Helmut E. Meyer,   ##
##                  Wolfgang Urfer, Katja Ickstadt, Jörg Rahnenführer                  ##
##                                                                                     ##
#########################################################################################




################################################
####                                        ####
#### functions for retention-time alignment ####
####                                        ####
################################################

# distance function of mass-to-charge (in ppm) for cluster analysis that penalizes 
# a strong retention-time deviation (default = 1 min)
# or a strong deviation in intensity (default = factor 10) 
# data is matrix with m/z in column 1, retention time in column 2, and intensity in column 3

distmat.ppm2 <- function(data, diff.rt=1, diff.int=1){
  peak.dist <- function(x,y){
    abs(x[1]-y[1])/max(x[1],y[1])*10^6 + 1000*(abs(x[2]-y[2])>diff.rt)+1000*(abs(x[3]-y[3])>diff.int)}

  comp.dist <- function(i,data){
     apply(data,1,peak.dist,y=data[i,])
  }

  n <- dim(data)[1]
  as.dist(apply(matrix(1:n),1,comp.dist,data=data))
}




# good.group returns list of indices of observations in 'well-behaved' clusters.
# this function is launched within good.group.all
# group = vector with cluster information
# samples = vector with sample information
# ndup = allowed number of samples that have two compounds in a cluster
# nmiss = allowed number of samples that have no compound in a cluster

good.group <- function(group, samples, pick, ndup =0 , nmiss=1){
  find.good <- function(x){
      (sum(x==0) <= nmiss & sum(x==2) <= ndup & sum(x>2)==0)}
  good <- as.list(which(apply(table(group,samples),1, find.good)))
  list.good <- lapply(good, function(x) pick[which(group==x)])
  list.good}




# good.group.all returns a list of indices of observations in 'well-behaved' clusters.
# data = matrix with columns m/z, retention time, and intensity 
# width = width of mass windows, that are used for searching 'well-behaved' clusters
# max.obs = if the window contains less than max.obs observations, these are used instead for
#           the search for 'well-behaved' clusters. If more than max.obs observations are contained
#           in the window, only the most intense peaks are used for alignment
# over = overlap of mass windows
# h = hight to cut cluster tree (should be set to  .5 * mass accuracy)
# diff.rt = maximal allowed retention-time deviation between runs

good.group.all2 <- function(data, width=20, over=1, max.obs= 400, nmiss=1, ndup=0, h=50, method="average", diff.rt=1, diff.int=1){
    n <- dim(data)[1]
    mz <- data$MZ
    intens <- data$Intens
    samp <- data[,"Sample"]
    names.samp <- as.numeric(names(table(samp)))
    nsamp <- length(names.samp)
    npick <- floor(max.obs/nsamp) ## number of observations per sample, if more than max.obs are in the window
    begin <- ppick <- 1
    list.good <- NULL
    repeat{
      nslice <- sum(mz >= mz[begin] & mz < mz[begin]+width) ## number of observations in window
      if(nslice <= max.obs){
          pick <- begin:(begin + max.obs-1)} else{
          ppick <- which(mz >= mz[begin] & mz < mz[begin]+width)
          ppick <-  split(ppick, samp[ppick])
          ppick.sort <-   lapply(ppick, function(x) x[order(intens[x], decreasing=TRUE)])
          pick <- unlist(lapply(ppick.sort, function(x) x[1:(min(npick, length(x)))]))
          }
      temp.dist <- distmat.ppm2(cbind(data[pick,"MZ"],data[pick,"RT"],log10(data[pick,"Intens"])),diff.rt=diff.rt, diff.int=diff.int)
      temp.group <- cutree(hclust(temp.dist,method=method),h=h)
      list.good <- c(list.good,good.group(temp.group, data[pick,"Sample"], pick=pick, nmiss=nmiss,ndup=ndup))
    begin <- min(which(mz > mz[max(c(pick,unlist(ppick)))]-over))
    if(n-begin < max.obs) break
    }
    pick <- begin:n
    temp.dist <- distmat.ppm2(cbind(data[pick,"MZ"],data[pick,"RT"],log10(data[pick,"Intens"])),diff.rt=diff.rt, diff.int=diff.int)
    temp.group <- cutree(hclust(temp.dist,method=method),h=h)
    list.good <- c(list.good,good.group(temp.group, data[pick,"Sample"], pick=pick, nmiss=nmiss,ndup=ndup))

    dup.groups <-  duplicated(unlist(lapply(list.good,function(x) sum(log(x)))))
    if(any(dup.groups)){
                list.good <- list.good[-which(dup.groups)]}
                ## if there are any duplicated 'well-behaved' clusters, these are deleted
  list.good
}




# function to calculate several model-selection criteria for loess model
# x = loess.modell   
# method = type of criterion

loess.aic <- function(x,method=c("AICc","AIC","AICc1","GCV","BIC")) {           
  method <- match.arg(method)
  n <- x$n
  sigma2 <- x$s^2
	delta1 <- x$one.delta
	delta2 <- x$two.delta
	trace <- x$trace.hat
	enp <- x$enp
  if (method=="AIC") { aic <- log(sigma2) + 2*trace/n }
  if (method=="AICc") { aic <- log(sigma2) + 1 + 2*(trace+1)/(n-trace-2) }
  if (method=="AICc1") { aic <- n*log(sigma2) + n*(delta1/delta2*(n+enp))/(delta1^2/delta2-2) }
  if (method=="GCV") { aic <- log(sigma2) - 2*log(1-trace/n) }
  if (method=="BIC") { aic <- log(sigma2) + log(n)*trace/n }
  return(aic)
}




# function for calculating the retention-time correction with loess-regression
# list.good= output of good.group.all2
# family, degree = parameter for loess
# method = which model selection criterion should be used to select the correct span

comp.RTcor <- function(list.good, data, family="gaussian",degree=1,method="BIC"){
    devmed <- function(x) x - median(x)

    fill.flat <- function(pred.rtdev,RT.group){
      ## fills in constant values in loess prediction outside of observed retention-time points
      maxRT <- max(RT.group[!is.na(pred.rtdev)])
      minRT <- min(RT.group[!is.na(pred.rtdev)])
      pred.rtdev[is.na(pred.rtdev) & RT.group > maxRT] <- pred.rtdev[which(RT.group==maxRT)[1]]
      pred.rtdev[is.na(pred.rtdev) & RT.group < minRT] <- pred.rtdev[which(RT.group==minRT)[1]]
      pred.rtdev
    }

    samp <- data[,"Sample"]
    names.samp <- as.numeric(names(table(data[,"Sample"])))
    nsamp <- length(names.samp)
    RT <- data[,"RT"]

    rtdev <- cbind(RT=unlist(lapply(list.good,function(x) RT[x])),
        Sample=unlist(lapply(list.good, function(x) data[x,"Sample"])),
        rtdev=unlist(lapply(list.good, function(x) devmed(RT[x]))))
    opt.span <- vector(length=nsamp)
    RTcor <- numeric(nrow(data))

    for(j in 1:nsamp){
        pick <- which(rtdev[,"Sample"] == names.samp[j])
        n <- length(pick)
        spans <- round(c(10/n,1),2)
        fit <- loess(rtdev[pick,3]~rtdev[pick,1],family=family,degree=degree, span=spans[1])

        opti.aic <- function(span) {
          fit2 <- update(fit,span=span)
          return(loess.aic(fit2,method=method))
        }
        if (spans[1]<1) {
          opt.span[j] <- optimize(opti.aic,spans)$minimum
          opt.span[j] <- ceiling(opt.span[j]*10000)/10000
          }
        else opt.span[j] <- 1
        fit2 <- update(fit,family=family,degree=degree,span=opt.span[j])
        RT.group <- RT[samp==names.samp[j]]
        pred.rtdev <-  predict(fit2, newdata= RT.group)
        pred.rtdev <- fill.flat(pred.rtdev,RT.group)
        RTcor[samp==names.samp[j]] <- RT.group- pred.rtdev
    }
    list(opt.span=opt.span,RTcor=RTcor,rtdev=rtdev)
}




# Plots of the retention-time deviation curves fitted with loess-regression of the individual samples
# rtdev is output from comp.RTcor

plot.rtdev <- function(rtdev,family="gaussian",degree=1,spans){
  fill.flat <- function(pred.rtdev,RT.group){
      ## fills in constant values in loess prediction outside of observed retention-time points
      maxRT <- max(RT.group[!is.na(pred.rtdev)])
      minRT <- min(RT.group[!is.na(pred.rtdev)])
      pred.rtdev[is.na(pred.rtdev) & RT.group > maxRT] <- pred.rtdev[which(RT.group==maxRT)[1]]
      pred.rtdev[is.na(pred.rtdev) & RT.group < minRT] <- pred.rtdev[which(RT.group==minRT)[1]]
      pred.rtdev
    }
    samp <- rtdev[,"Sample"]
    names.samp <- as.numeric(names(table(samp)))
    nsamp <- length(names.samp)

    for(i in 1:nsamp){
      pick <- which(samp == names.samp[i])
      loess.fit <- loess(rtdev[pick,3]~ rtdev[pick,1],degree=degree,family=family, span=spans[i])
      x <- seq(min(rtdev[pick,1])-1, max(rtdev[pick,1]+1), by=0.1)
      y <- predict(loess.fit,newdata=x)
      y <- fill.flat(y,x)
      plot(rtdev[pick,1], rtdev[pick,3], xlab="RT",ylab="RT_Dev",xlim=range(x), main=paste("Sample ",names.samp[i],"\nSpan = ",spans[i],sep=""),cex=.8)
    lines(x,y,col="red")
    }
}




# function for calculating the retention-time correction with linear regression
# list.good= output of good.group.all2

comp.RTcor.global <- function(list.good, data){
    devmed <- function(x) x - median(x)

    fill.flat <- function(pred.rtdev,RT.group){
      ## fills in constant values in prediction outside of observed retention-time points
      maxRT <- max(RT.group[!is.na(pred.rtdev)])
      minRT <- min(RT.group[!is.na(pred.rtdev)])
      pred.rtdev[is.na(pred.rtdev) & RT.group > maxRT] <- pred.rtdev[which(RT.group==maxRT)[1]]
      pred.rtdev[is.na(pred.rtdev) & RT.group < minRT] <- pred.rtdev[which(RT.group==minRT)[1]]
      pred.rtdev
    }

    samp <- data[,"Sample"]
    names.samp <- as.numeric(names(table(data[,"Sample"])))
    nsamp <- length(names.samp)
    RT <- data[,"RT"]

    rtdev <- cbind(RT=unlist(lapply(list.good,function(x) RT[x])),
        Sample=unlist(lapply(list.good, function(x) data[x,"Sample"])),
        rtdev=unlist(lapply(list.good, function(x) devmed(RT[x]))))
    RTcor <- numeric(nrow(data))

    for(j in 1:nsamp){
        pick <- which(rtdev[,"Sample"] == names.samp[j])
        n <- length(pick)
        fit <- lm(rtdev[pick,3]~rtdev[pick,1])
        RT.group <- RT[samp==names.samp[j]]
        pred.rtdev <-  fit$coef[1] + fit$coef[2]*RT.group
        pred.rtdev <- fill.flat(pred.rtdev,RT.group)
        RTcor[samp==names.samp[j]] <- RT.group- pred.rtdev
    }
    list(RTcor=RTcor,rtdev=rtdev)
}




# Plots of the retention-time deviation curves fitted with linear regression of the individual samples
# rtdev is output from comp.RTcor

plot.rtdev.global <- function(rtdev){
  fill.flat <- function(pred.rtdev,RT.group){
      ## fills in constant values in prediction outside of observed retention-time points
      maxRT <- max(RT.group[!is.na(pred.rtdev)])
      minRT <- min(RT.group[!is.na(pred.rtdev)])
      pred.rtdev[is.na(pred.rtdev) & RT.group > maxRT] <- pred.rtdev[which(RT.group==maxRT)[1]]
      pred.rtdev[is.na(pred.rtdev) & RT.group < minRT] <- pred.rtdev[which(RT.group==minRT)[1]]
      pred.rtdev
    }
    samp <- rtdev[,"Sample"]
    names.samp <- as.numeric(names(table(samp)))
    nsamp <- length(names.samp)

    for(i in 1:nsamp){
      pick <- which(samp == names.samp[i])
      lm.fit <- lm(rtdev[pick,3]~ rtdev[pick,1])
      x <- seq(min(rtdev[pick,1])-1, max(rtdev[pick,1]+1), by=0.1)
      y <- lm.fit$coef[1] + lm.fit$coef[2]*x
      y <- fill.flat(y,x)
      plot(rtdev[pick,1], rtdev[pick,3], xlab="RT",ylab="RT_Dev",xlim=range(x), main=paste("Sample ",names.samp[i],sep=""),cex=.8)
    lines(x,y,col="red")
    }
}