######################################################################################### ## ## ## R-scripts for the manuscript ## ## ## ## Retention Time Alignment Algorithms for LC/MS Data must consider Nonlinear Shifts ## ## ## ## by ## ## Katharina Podwojski, Arno Fritsch, Daniel C. Chamrad, Wolfgang Paul, ## ## Barbara Sitek, Kai Stühler, Petra Mutzel, Christian Stephan, Helmut E. Meyer, ## ## Wolfgang Urfer, Katja Ickstadt, Jörg Rahnenführer ## ## ## ######################################################################################### ################################################ #### #### #### functions for retention-time alignment #### #### #### ################################################ # distance function of mass-to-charge (in ppm) for cluster analysis that penalizes # a strong retention-time deviation (default = 1 min) # or a strong deviation in intensity (default = factor 10) # data is matrix with m/z in column 1, retention time in column 2, and intensity in column 3 distmat.ppm2 <- function(data, diff.rt=1, diff.int=1){ peak.dist <- function(x,y){ abs(x[1]-y[1])/max(x[1],y[1])*10^6 + 1000*(abs(x[2]-y[2])>diff.rt)+1000*(abs(x[3]-y[3])>diff.int)} comp.dist <- function(i,data){ apply(data,1,peak.dist,y=data[i,]) } n <- dim(data)[1] as.dist(apply(matrix(1:n),1,comp.dist,data=data)) } # good.group returns list of indices of observations in 'well-behaved' clusters. # this function is launched within good.group.all # group = vector with cluster information # samples = vector with sample information # ndup = allowed number of samples that have two compounds in a cluster # nmiss = allowed number of samples that have no compound in a cluster good.group <- function(group, samples, pick, ndup =0 , nmiss=1){ find.good <- function(x){ (sum(x==0) <= nmiss & sum(x==2) <= ndup & sum(x>2)==0)} good <- as.list(which(apply(table(group,samples),1, find.good))) list.good <- lapply(good, function(x) pick[which(group==x)]) list.good} # good.group.all returns a list of indices of observations in 'well-behaved' clusters. # data = matrix with columns m/z, retention time, and intensity # width = width of mass windows, that are used for searching 'well-behaved' clusters # max.obs = if the window contains less than max.obs observations, these are used instead for # the search for 'well-behaved' clusters. If more than max.obs observations are contained # in the window, only the most intense peaks are used for alignment # over = overlap of mass windows # h = hight to cut cluster tree (should be set to .5 * mass accuracy) # diff.rt = maximal allowed retention-time deviation between runs good.group.all2 <- function(data, width=20, over=1, max.obs= 400, nmiss=1, ndup=0, h=50, method="average", diff.rt=1, diff.int=1){ n <- dim(data)[1] mz <- data$MZ intens <- data$Intens samp <- data[,"Sample"] names.samp <- as.numeric(names(table(samp))) nsamp <- length(names.samp) npick <- floor(max.obs/nsamp) ## number of observations per sample, if more than max.obs are in the window begin <- ppick <- 1 list.good <- NULL repeat{ nslice <- sum(mz >= mz[begin] & mz < mz[begin]+width) ## number of observations in window if(nslice <= max.obs){ pick <- begin:(begin + max.obs-1)} else{ ppick <- which(mz >= mz[begin] & mz < mz[begin]+width) ppick <- split(ppick, samp[ppick]) ppick.sort <- lapply(ppick, function(x) x[order(intens[x], decreasing=TRUE)]) pick <- unlist(lapply(ppick.sort, function(x) x[1:(min(npick, length(x)))])) } temp.dist <- distmat.ppm2(cbind(data[pick,"MZ"],data[pick,"RT"],log10(data[pick,"Intens"])),diff.rt=diff.rt, diff.int=diff.int) temp.group <- cutree(hclust(temp.dist,method=method),h=h) list.good <- c(list.good,good.group(temp.group, data[pick,"Sample"], pick=pick, nmiss=nmiss,ndup=ndup)) begin <- min(which(mz > mz[max(c(pick,unlist(ppick)))]-over)) if(n-begin < max.obs) break } pick <- begin:n temp.dist <- distmat.ppm2(cbind(data[pick,"MZ"],data[pick,"RT"],log10(data[pick,"Intens"])),diff.rt=diff.rt, diff.int=diff.int) temp.group <- cutree(hclust(temp.dist,method=method),h=h) list.good <- c(list.good,good.group(temp.group, data[pick,"Sample"], pick=pick, nmiss=nmiss,ndup=ndup)) dup.groups <- duplicated(unlist(lapply(list.good,function(x) sum(log(x))))) if(any(dup.groups)){ list.good <- list.good[-which(dup.groups)]} ## if there are any duplicated 'well-behaved' clusters, these are deleted list.good } # function to calculate several model-selection criteria for loess model # x = loess.modell # method = type of criterion loess.aic <- function(x,method=c("AICc","AIC","AICc1","GCV","BIC")) { method <- match.arg(method) n <- x$n sigma2 <- x$s^2 delta1 <- x$one.delta delta2 <- x$two.delta trace <- x$trace.hat enp <- x$enp if (method=="AIC") { aic <- log(sigma2) + 2*trace/n } if (method=="AICc") { aic <- log(sigma2) + 1 + 2*(trace+1)/(n-trace-2) } if (method=="AICc1") { aic <- n*log(sigma2) + n*(delta1/delta2*(n+enp))/(delta1^2/delta2-2) } if (method=="GCV") { aic <- log(sigma2) - 2*log(1-trace/n) } if (method=="BIC") { aic <- log(sigma2) + log(n)*trace/n } return(aic) } # function for calculating the retention-time correction with loess-regression # list.good= output of good.group.all2 # family, degree = parameter for loess # method = which model selection criterion should be used to select the correct span comp.RTcor <- function(list.good, data, family="gaussian",degree=1,method="BIC"){ devmed <- function(x) x - median(x) fill.flat <- function(pred.rtdev,RT.group){ ## fills in constant values in loess prediction outside of observed retention-time points maxRT <- max(RT.group[!is.na(pred.rtdev)]) minRT <- min(RT.group[!is.na(pred.rtdev)]) pred.rtdev[is.na(pred.rtdev) & RT.group > maxRT] <- pred.rtdev[which(RT.group==maxRT)[1]] pred.rtdev[is.na(pred.rtdev) & RT.group < minRT] <- pred.rtdev[which(RT.group==minRT)[1]] pred.rtdev } samp <- data[,"Sample"] names.samp <- as.numeric(names(table(data[,"Sample"]))) nsamp <- length(names.samp) RT <- data[,"RT"] rtdev <- cbind(RT=unlist(lapply(list.good,function(x) RT[x])), Sample=unlist(lapply(list.good, function(x) data[x,"Sample"])), rtdev=unlist(lapply(list.good, function(x) devmed(RT[x])))) opt.span <- vector(length=nsamp) RTcor <- numeric(nrow(data)) for(j in 1:nsamp){ pick <- which(rtdev[,"Sample"] == names.samp[j]) n <- length(pick) spans <- round(c(10/n,1),2) fit <- loess(rtdev[pick,3]~rtdev[pick,1],family=family,degree=degree, span=spans[1]) opti.aic <- function(span) { fit2 <- update(fit,span=span) return(loess.aic(fit2,method=method)) } if (spans[1]<1) { opt.span[j] <- optimize(opti.aic,spans)$minimum opt.span[j] <- ceiling(opt.span[j]*10000)/10000 } else opt.span[j] <- 1 fit2 <- update(fit,family=family,degree=degree,span=opt.span[j]) RT.group <- RT[samp==names.samp[j]] pred.rtdev <- predict(fit2, newdata= RT.group) pred.rtdev <- fill.flat(pred.rtdev,RT.group) RTcor[samp==names.samp[j]] <- RT.group- pred.rtdev } list(opt.span=opt.span,RTcor=RTcor,rtdev=rtdev) } # Plots of the retention-time deviation curves fitted with loess-regression of the individual samples # rtdev is output from comp.RTcor plot.rtdev <- function(rtdev,family="gaussian",degree=1,spans){ fill.flat <- function(pred.rtdev,RT.group){ ## fills in constant values in loess prediction outside of observed retention-time points maxRT <- max(RT.group[!is.na(pred.rtdev)]) minRT <- min(RT.group[!is.na(pred.rtdev)]) pred.rtdev[is.na(pred.rtdev) & RT.group > maxRT] <- pred.rtdev[which(RT.group==maxRT)[1]] pred.rtdev[is.na(pred.rtdev) & RT.group < minRT] <- pred.rtdev[which(RT.group==minRT)[1]] pred.rtdev } samp <- rtdev[,"Sample"] names.samp <- as.numeric(names(table(samp))) nsamp <- length(names.samp) for(i in 1:nsamp){ pick <- which(samp == names.samp[i]) loess.fit <- loess(rtdev[pick,3]~ rtdev[pick,1],degree=degree,family=family, span=spans[i]) x <- seq(min(rtdev[pick,1])-1, max(rtdev[pick,1]+1), by=0.1) y <- predict(loess.fit,newdata=x) y <- fill.flat(y,x) plot(rtdev[pick,1], rtdev[pick,3], xlab="RT",ylab="RT_Dev",xlim=range(x), main=paste("Sample ",names.samp[i],"\nSpan = ",spans[i],sep=""),cex=.8) lines(x,y,col="red") } } # function for calculating the retention-time correction with linear regression # list.good= output of good.group.all2 comp.RTcor.global <- function(list.good, data){ devmed <- function(x) x - median(x) fill.flat <- function(pred.rtdev,RT.group){ ## fills in constant values in prediction outside of observed retention-time points maxRT <- max(RT.group[!is.na(pred.rtdev)]) minRT <- min(RT.group[!is.na(pred.rtdev)]) pred.rtdev[is.na(pred.rtdev) & RT.group > maxRT] <- pred.rtdev[which(RT.group==maxRT)[1]] pred.rtdev[is.na(pred.rtdev) & RT.group < minRT] <- pred.rtdev[which(RT.group==minRT)[1]] pred.rtdev } samp <- data[,"Sample"] names.samp <- as.numeric(names(table(data[,"Sample"]))) nsamp <- length(names.samp) RT <- data[,"RT"] rtdev <- cbind(RT=unlist(lapply(list.good,function(x) RT[x])), Sample=unlist(lapply(list.good, function(x) data[x,"Sample"])), rtdev=unlist(lapply(list.good, function(x) devmed(RT[x])))) RTcor <- numeric(nrow(data)) for(j in 1:nsamp){ pick <- which(rtdev[,"Sample"] == names.samp[j]) n <- length(pick) fit <- lm(rtdev[pick,3]~rtdev[pick,1]) RT.group <- RT[samp==names.samp[j]] pred.rtdev <- fit$coef[1] + fit$coef[2]*RT.group pred.rtdev <- fill.flat(pred.rtdev,RT.group) RTcor[samp==names.samp[j]] <- RT.group- pred.rtdev } list(RTcor=RTcor,rtdev=rtdev) } # Plots of the retention-time deviation curves fitted with linear regression of the individual samples # rtdev is output from comp.RTcor plot.rtdev.global <- function(rtdev){ fill.flat <- function(pred.rtdev,RT.group){ ## fills in constant values in prediction outside of observed retention-time points maxRT <- max(RT.group[!is.na(pred.rtdev)]) minRT <- min(RT.group[!is.na(pred.rtdev)]) pred.rtdev[is.na(pred.rtdev) & RT.group > maxRT] <- pred.rtdev[which(RT.group==maxRT)[1]] pred.rtdev[is.na(pred.rtdev) & RT.group < minRT] <- pred.rtdev[which(RT.group==minRT)[1]] pred.rtdev } samp <- rtdev[,"Sample"] names.samp <- as.numeric(names(table(samp))) nsamp <- length(names.samp) for(i in 1:nsamp){ pick <- which(samp == names.samp[i]) lm.fit <- lm(rtdev[pick,3]~ rtdev[pick,1]) x <- seq(min(rtdev[pick,1])-1, max(rtdev[pick,1]+1), by=0.1) y <- lm.fit$coef[1] + lm.fit$coef[2]*x y <- fill.flat(y,x) plot(rtdev[pick,1], rtdev[pick,3], xlab="RT",ylab="RT_Dev",xlim=range(x), main=paste("Sample ",names.samp[i],sep=""),cex=.8) lines(x,y,col="red") } }