################################################################################# ## ## ## R-scripts for the manuscript ## ## ## ## Comparison and evaluation of clustering algorithms ## ## for tandem mass spectra ## ## ## ## ## ## by ## ## ## ## Vera Rieder, Karin U. Schork, Laura Kerschke, Bernhard Blank-Landeshammer, ## ## Albert Sickmann and Jörg Rahnenführer ## ## ## ################################################################################# ## load R packages library(BBmisc) library(checkmate) library(clusteval) library(igraph) library(stringr) ## Calculate distance/similarity matrix # calculateTotalSimilarity calculates the entire similarity vector # # Input: ## data - dataset with MS/MS spectr (RData) ## binning - logical, Is binning to be performed? default TRUE ## binsize - Parameter for binning, default 0.1 ## topn - logical, Reduce to top n intensities? default TRUE ## topn.n - natural number, number of top intensities # Output: ## similarity vector ## Warning! For larger datasets split calculation of cosine similarity calculateTotalSimilarity = function(data, binning = TRUE, binsize = 0.1, topn = TRUE, topn.n = 100) { ## preproces spectra (binning, select top n, etc.) spec = preprocessSpectra(data = data, binning = binning, binsize = binsize, topn = topn, n = topn.n) n = length(spec) ## original index vector (of column) index1 = rep(1:(n - 1), (n - 1) : 1) ## original index vector (of rows) index2 = unlist(lapply(2 : n, function(k) k : n)) ## calculate j-th subset of similarity vector s = vapply(1:(n * (n - 1)/2), function(k) calculateCosineSimilarity(spec[[index1[k]]], spec[[index2[k]]]), double(1L)) return(s) } # preprocessSpectra preprocesses spectra # # Input: ## data - dataset with MS/MS spectra (RData) ## binning - logical, Is binning to be performed? default TRUE ## binsize - Parameter for binning, default 0.1 ## topn - logical, Reduce to top n intensities? default TRUE ## n - natural number, number of top intensities # Output: ## list of preprocessed MS/MS spectra preprocessSpectra = function(data, binning = TRUE, binsize = 0.1, topn = TRUE, n = 100L) { ## Index of MS/MS spectra in data (LC/MSMS run) ix = data$ms2_index ## Extract masses and intensities for each relevant spectrum spec = lapply(data$mzxml[ix], function(x) cbind(x$spectrum$mass, x$spectrum$intensity)) ## If topn is TRUE, reduce each spectrum to top n intensities for each spectrum if (topn) { spec = lapply(spec, function(x) extractTopIntensities(x, n = n)) } ## Aggregate each spectrum with similar masses if(binning) { spec = lapply(spec, function(x) aggregateSpectrum(x, binsize = binsize)) } return(spec) } # calculateCosineSimilarity calculates Cosine similarity of two MS/MS spectra # # Input: ## x - matrix with two columns, first and second column containing ## masses and intensities of spectrum, respectively. ## y - matrix with two columns, first and second column containing ## masses and intensities of spectrum, respectively. # Output: ## real number between 0 and 1, cosine similarity (1 - cosine distance) calculateCosineSimilarity = function(x, y) { ## matching of peaks with same masses intersec = compare_vek(x, y) if (is.null(intersec)) { return(0L) } else { z1 = intersec[,2] / sqrt(sum(x[,2]^2)) z2 = intersec[,3] / sqrt(sum(y[,2]^2)) return(sum(z1 * z2)) } } # compare_vek finds peaks with same masses in two spectra # # Input: ## x - matrix with two columns, first and second column containing ## masses and intensities of spectrum, respectively. ## y - matrix with two columns, first and second column containing ## masses and intensities of spectrum, respectively. # Output: ## matrix with three columns, first column containing masses in both ## spectra, second and third column containing belonging intensities ## of spectrum x and y, respectively. compare_vek = function(x, y) { ## which masses in spectrum x occur also in spectrum y? x_in_y = x[, 1L] %in% y[, 1L] if (any(x_in_y)) { return(cbind(x[x_in_y, 1L], x[x_in_y, 2L], y[y[,1L] %in% x[, 1L], 2L])) } else { return(NULL) } } # extractTopIntensities extracts top n intensities per spectrum # # Input: ## spec - matrix with two columns, first and second column containing ## masses and intensities of spectrum, respectively. ## n - natural number, number of peaks that shall be kept # # Output: ## matrix with two columns, first and second column containing ## masses and intensities of reduced spectrum, respectively. extractTopIntensities = function(spec, n){ ## If a spectrum contains maximum n masses, keep it. if (length(spec[,1]) <= n) return(spec) ## Else, keep only values belonging to n highest intensities. int.sort = rev(order(spec[, 2L])) int.sort = int.sort[seq_len(n)] spec = spec[int.sort,] return(spec[order(spec[,1]), ]) } # aggregateSpectrum aggregates similar masses per spectrum (binning) # # Input: ## spec - matrix with two columns, first and second column containing ## masses and intensities of spectrum, respectively. ## binsize - Parameter for binning # # Output: ## matrix with two columns, first and second column containing ## masses and intensities of binned spectrum, respectively. aggregateSpectrum = function(spec, binsize = 0.2) { mass = floor(spec[, 1L] / binsize) * binsize + binsize / 2L int = spec[, 2L] if (anyDuplicated(mass)) { int = as.vector(tapply(int, mass, max)) mass = unique(mass) } return(matrix(c(mass, int), ncol = 2)) } ############################################################################### ## Different types of clustering algorithms ## CAST (Master thesis Laura Kerschke ######################################### ############################################################################### # Clustering Affinity Search Technique (CAST) # # Input: # S - Similarity vector (all elements must be between 0 and 1) # thres - Threshold, that seperates high / low affinities # # Output: # Vector, ith element represents cluster id of ith element cast = function(S, thres) { ## n: number of elements ## length(S) = n x (n - 1) / 2) ## <=> n^2 - n - 2*length(S) = 0 ## <=> n = 0.5 + sqrt(0.25 + 2 * length(S)) n = as.integer(0.5 + sqrt(0.25 + 2*length(S))) ## initiate "closed" clusters clustering = list() ## Vector of not assigned elements U = seq_len(n) ## Cluster Counter i = 0L ## While not assigned elements exist, do the following ... while (length(U) != 0) { i = i + 1L ## create "empty" cluster C.open = NULL ## Set all affinities to Zero a = rep(0, n) ## Set change-flag to TRUE changed = TRUE ## while any changes happen, repeat ADD and REMOVE while (changed) { changed = FALSE ## find element x of not assigned elements in U ## that can be added to current 'open' cluster x = addToCluster(U, thres, C.open, a) ## if such an element exists ... if (!is.null(x)) { ## .. add it to 'open' cluster C.open = sort(c(C.open, x)) ## .. and remove it from set of not assigned elements U = setdiff(U, x) ## ## Update affinities ## (1) Determine all relevant elements (either in U or in C.open) ind = c(U, C.open) ## (2) Extract affinities belonging to x and all these elements sim.x = getSimilarities(S, x, n) ## (3) Update affinities of all relevant elements a[ind] = a[ind] + sim.x[ind] ## CAST added one element. Thus something has changed. changed = TRUE } ## find element y in current 'open' cluster C.open that must be removed y = removeFromCluster(thres, C.open, a) ## if such an element exists ... if (!is.null(y)) { ## .. remove it from the 'open' cluster C.open = setdiff(C.open, y) ## .. and add it to set of not assigned elements U = sort(c(U, y)) ## Update all relevant affinities (analogous to steps in ADD) ind = c(U, C.open) sim.y = getSimilarities(S, y, n) a[ind] = a[ind] - sim.y[ind] changed = TRUE } } ## if no changes occur, close the current 'open' cluster and save it clustering[[i]] = C.open } labels = rep(NA, length(unlist(clustering))) for(i in 1:length(clustering)){ labels[clustering[[i]]] <- i } return(labels) } # addToCluster finds not assigned elements in U # that can be added to 'open' cluster C.open # Input: # U - Vector of not assigned elements # thres - threshold of affinities # C.open - Vector with indices of elements current belonging to 'open' cluster # a - Vector of affinities # # Output: # Either NULL (if no element is found) or index of elements that can be added addToCluster = function(U, thres, C.open, a) { ## if U is empty, CAST cannot choose elements if (length(U) == 0) return(NULL) ## if U contains at least one affine element if (max(a[U]) >= thres * length(C.open)) { ## choose those elements with maximum affinity u = U[a[U] == max(a[U])] ## if several elements have maximum affinity, choose one element at random if (length(u) > 1) u = sample(u, 1) return(as.integer(u)) } else { # if no affine element exists, none can be added return (NULL) } } # removeFromCluster finds element in current 'open' cluster C.open # that must be removed # # Input: # thres - threshold of affinities # C.open - Vector with indices of elements current belonging to 'open' cluster # a - Vector of affinities # # Output: # Either NULL (if no element can be removed) or index of element # that must be removed removeFromCluster = function(thres, C.open, a) { ## if cluster is empty, CAST cannot remove elements if (length(C.open) == 0) return(NULL) ## if C.open contains at least one low affine element ... if (min(a[C.open]) < thres * length(C.open)) { ## choose that element with minimum affinity u = C.open[a[C.open] == min(a[C.open])] ## if several elements have minimum affinity, choose one element at random if (length(u) > 1) u = sample(u, 1) return(as.integer(u)) } else { ## if no low affine element exists, none can be removed return (NULL) } } # getSimilarites extracts similarities between all elements and uth element # # Input: # S - Similarity vector (of initial lower triangular matrix) # u - integer specifying the element for which the similarities must be extracted # n - total number of elements in dataset # # Output: # numeric vector of length n containing all similarities between n elements and uth element getSimilarities = function(S, u, n) { ## If u == 1, the first (n - 1) elements of the similarity vector if (u == 1) return(c(1L, S[seq_len(n - 1)])) ## For all further calculations the indices are needed that specify the location ## where the last row of the similarity matrix is saved: (n - 1), (n - 1) + (n - 2), etc. last.row = cumsum(as.numeric((n - 1) : 1)) ## If u == n, last.row contains the wanted similarities if (u == n) return(c(S[last.row], 1L)) ## in all other cases the values of the uth row (part1) or the uth column (part2) ## of the triangular matrix must be extracted part1 = as.numeric(last.row[1:(u - 1)] - n + u) lr = last.row[u] part2 = as.numeric((lr - n + u + 1) : lr) return(c(S[part1], 1L, S[part2])) } ## End of CAST ################################################################ ############################################################################### ## DBSCAN (Master thesis Karin Schork) ######################################## ############################################################################### # Density Based Spatial Clustering of Applications with Noise (DBSCAN) # according to Pseudocode of Viswanath & Suresh Babu # # Input: # S - Similarity vector (all elements must be between 0 and 1) ## eps - positive real number ## minPts - natural number ## random - Shall the ordering of elements be randomized before clustering? ## (Boundary points can depend on this ordering) # Output: ## Vector, ith element represents cluster id of ith element DBSCAN = function(S, eps, minPts, random = FALSE) { n = length(S) N = calculateN(n) seen = logical(N) clust = numeric(N) cid = 0 if (random) { ind = sample(1:N) } else { ind = 1:N } for (i in ind) { if (seen[i] != TRUE) { seen[i] = TRUE neighbours = which(getMatrixEntry(S, i, 1:N, N = N) >= 1 - eps) card = length(neighbours) if (card < minPts) { clust[i] = 0 } else { cid = cid + 1 queue = neighbours while (length(queue) > 0) { y = queue[1] queue = queue[-1] seen[y] = TRUE neighbourhood = which(getMatrixEntry(S, y, 1:N, N = N) >= 1 - eps) card = length(neighbourhood) if (card >= minPts) { for (x in neighbourhood) { if (seen[x] == FALSE || clust[x] == 0) { if (seen[x] == FALSE) { queue = unique(c(queue, x)) } clust[x] = cid seen[x] = TRUE } } } } } } } labels <- clust clmax <- max(clust) labels[clust == 0] <- (clmax + 1):(clmax + sum(clust == 0)) return(labels) } # calculateN calculates from the length of the distance/similarity vector n # the number of elements N # # Input: ## n - length of distance/similarity vector # Output: ## Number of elements calculateN = function(n) { # n = ength of distance/similarity vector # N = Number of elements # n = (N choose 2) # => n = N * (N - 1) / 2 # => n = (N^2 - N) / 2 # => 0 = 1/2 N^2 - 1/2 N - n # => 0 = N^2 - N - 2*n # Solve by pq-formula with p = -1 and q = -2*n N = pq(p = -1, q = -2 * n) # number of elements must be greater than 0 N = N[N > 0] if (length(N) != 1) stop("Error: N cannot be calculated") return(N) } # pq is an implementation of pq-formula # # Input: ## p - real number ## q - real number # Output: ## Maximum two solutions of equation x^2 + px + q = 0. pq = function(p, q) { if ((p/2) ^ 2 - q > 0) { n_1 = -(p/2) - sqrt((p/2) ^ 2 - q) n_2 = -(p/2) + sqrt((p/2) ^ 2 - q) } else { n_1 = n_2 = NA } return(c(n_1, n_2)) } # getMatrixEntry gives the entries of distance/similarity vector # of element i and several elements in j # # Input: ## M - Distance/similarity vector ## i - one index ## j - several indices ## N - number of elements ## dist - Is M a distance vector? ## (Default FALSE: similarity vector) # Output: ## Vector of similarities or distances of element i to all elements in j getMatrixEntry = function(M, i, j, N = NULL, dist = FALSE) { # Indices smaller than i if (min(j) < i) { x1 = j[j < i] l = N * (x1 - 1) - x1 * (x1 - 1)/2 + i - x1 } else { l = NULL } # Indices greater than i if (max(j) > i) { x2 = j[j > i] u = N*(i - 1) - i*(i - 1) / 2 + x2 - i } else { u = NULL } # Index i if (i %in% j) { if (dist == TRUE) { m = 0 } else { m = 1 } } else { m = NULL } return(c(M[l], m, M[u])) } ## End of DBSCAN ############################################################## ############################################################################### ## hclust ##################################################################### ############################################################################### # hclust2 calculates hierarchical Complete Linkage Clustering. # # Input: ## S - Similarity vector ## h - positive real number # Output: ## Vector, ith element represents cluster id of ith element hclust2 = function(S, h) { n = length(S) N = calculateN(n) # Calculate number of neighbors for each element. nr.neigh = countNeighbours(S, N, h) # elements with more than one neighbor ind = which(nr.neigh != 1) rm(nr.neigh) gc() # Filter similarities of elements with more than one neighbor M = length(ind) m = choose(M, 2) d = numeric(m) for (k in 1:(M - 1)) { ind2 = getMatrixIndex(N = M, i = k, j = (k + 1):M) d[ind2] = getMatrixEntry(S, i = ind[k], j = ind[(k + 1):M], N = N, dist = FALSE) } rm(S); gc() d = 1 - d gc() # Transformation into dist object D = structure(d, Size = length(ind), Diag = FALSE, Upper = FALSE, class = "dist") rm(d); gc() # Call function hclust and cut dendrogram at level h. cl = hclust(D, method = "complete") clust_ind = cutree(cl, h = h) # Elements only with itself as neighbor would be singletons in hclust and get ID 0. clust = integer(N) clust[ind] = clust_ind labels <- clust clmax <- max(clust) labels[clust == 0] <- (clmax + 1):(clmax + sum(clust == 0)) return(labels) } # countNeighbours counts the number of neighbors of radius h for each element # # Input: ## S - similarity vector ## N - Number of elements ## h - maximum distance between two neighbors # Output: ## Vector of length N with number of neighbors of radius h for each element. ## Each element is his own neighbor. countNeighbours = function(S, N, h) { nr.neigh = integer(N) for (i in 1:(N - 1)) { # only look at similarities with i <= j neigh = which(getMatrixEntry(S, i, (i + 1):N, N = N, dist = FALSE) >= 1 - h) neigh = neigh + i # neigh contains neighbors of i with index greater i. # To the current number of neibhbors for i the length of neigh and # 1 (each element is his own neighbor) is added. nr.neigh[i] = nr.neigh[i] + length(neigh) + 1 # For all elements in neigh the number is increased by 1 # since i is a neighbor of them. nr.neigh[neigh] = nr.neigh[neigh] + 1 } nr.neigh[N] = nr.neigh[N] + 1 return(nr.neigh) } # getMatrixIndex outputs indices where the similarity between ith and jth element # stands in the similarity vector # # Input: ## N - number of elements ## i - one index ## j - several indices # Output: ## Vector of indices where the similarities between ith element and jth element stand. getMatrixIndex = function(N, i, j) { if (min(j) < i) { x1 = j[j < i] l = N * (x1 - 1) - x1 * (x1 - 1)/2 + i - x1 } else { l = NULL } if (max(j) > i) { x2 = j[j > i] u = N*(i - 1) - i*(i - 1) / 2 + x2 - i } else { u = NULL } return(c(l, u)) } ## End of hclust ############################################################## ############################################################################### ## igraph ##################################################################### ############################################################################### # igraph calculates clustering by each connected subgraph # # Input: ## S - Similarity vector ## csim - positive real number # Output: ## Vector, ith element represents cluster id of ith element igraph = function(S, csim = 0.7){ k = which(S > csim) n = as.integer(0.5 + sqrt(0.25 + 2*length(S))) if (length(k) == 0) { res = 1:n } else { fromto = sapply(k, function(x) k2ij(x, n)) nodes = paste0("A", 1:n) links = data.frame(from = nodes[fromto[1,]], to = nodes[fromto[2,]]) net = graph_from_data_frame(d = links, vertices = nodes, directed = F) cluster = igraph::clusters(net) res = cluster$membership } names(res) = NULL return(res) } # k2ij calculates row and column index in matrix given vector # # Input: ## k - ## n - length # Output: ## Vector of length 2, first row i and second column j. k2ij = function(k, n){ x = 1:(n-1) j = min(which(k <= x*(n - x/2 - 0.5))) if(j == 1) {i = k + 1} else { i = k - n*j + n + j^2/2 - j/2 + j} return(c(i, j)) } ## End of igraph ############################################################## ############################################################################### ## N-Cluster (Master thesis of Karin Schork ################################### ############################################################################### # neighbourClust calculates Neighbor Clustering # # Input: ## S - similarity vector ## c - positive real number # Output: ## Vector, ith element represents cluster id of ith element neighbourClust = function(S, c) { n = length(S) N = calculateN(n) # Calculate the number of neighbors for each element nr.neigh = countNeighbours(S, N, c) # Elements with only one neighbor are singletons (clust = 0) clust = rep(NA, N) clust[nr.neigh == 1] = 0 # elements with more than one neighbor # ID: elements that are in the race ID = which(nr.neigh != 1) clustID = 1 while (length(ID) > 0) { # element with most neighbors ID.max = ID[which.max(nr.neigh[ID])] # neighbors of that element ID.neigh = ID[which(getMatrixEntry(S, ID.max, ID, N = N, dist = FALSE) >= 1 - c)] # Assign all neighbors of that element to a Cluster clust[ID.neigh] = clustID # Reduce nr.neigh for all neighbors of ID.neigh in ID for (j in ID.neigh) { neigh = ID[which(getMatrixEntry(S, j, ID, N = N, dist = FALSE) >= 1 - c)] nr.neigh[neigh] = nr.neigh[neigh] - 1 } # ID: elements, not assigned to a cluster ID = which(is.na(clust)) clustID = clustID + 1 } labels = clust clmax = max(clust) labels[clust == 0] <- (clmax + 1):(clmax + sum(clust == 0)) return(labels) } ## End of N-Cluster ########################################################### ############################################################################### ## Warning! MS-Cluster and PRIDE Cluster only executable via ?command line? ## MS-Cluster ################################################################# ############################################################################### # mscluster is an R wrapper (BBmisc::system3) for MS-Cluster v2 algorithm, # an executable that can be run from the command line. # Frank et al. Spectral archives: extending spectral libraries to analyze both # identified and unidentified spectra. Nat. Methods 2011, 8, 587-591. # # Input: ## input - character vector (max length 2), file to MS/MS runs (mgf-files) ## similarity - similarity in [0,1], threshold in final round ## window - real number, --window , default X=2.0 Da. ## window width for clustering ## fragment.tolerance - real number, --fragment-tolerance , the tolerance in ## Da for fragment peaks (default X=0.34 Da.) ## mixture.prob - probability, --mixture-prob , the probability wrongfully ## adding a spectrum to cluster (default X=0.05). ## rounds - natural number, --num-rounds, number of rounds. ## path - path to MSCluster_bin. ## n1,n2,n3 - number of spectra of first/second/third MS/MS run # Output: ## Vector, ith element represents cluster id of ith element msclusternew = function(input, similarity, window, fragment.tolerance, mixture.prob, rounds, path = "~/MSCluster.20110327/MSCluster_bin", n1 = 0, n2 = 0, n3 = 0) { library(BBmisc) library(checkmate) assertFile(input) assertNumber(similarity, lower = 0, upper = 1) assertNumber(window) assertNumber(fragment.tolerance, lower = 0) assertNumber(mixture.prob, lower = 0, upper = 1) assertFile(path) assertNumber(rounds) # create input list input = normalizePath(input) input.list = tempfile("input", fileext = ".txt") writeLines(input, con = input.list) # create output directory output = tempfile("output") dir.create(output) # model dir model.dir = tempfile("models") dir.create(model.dir) args = c( sprintf("--list %s", shQuote(input.list)), sprintf("--similarity %f", similarity), sprintf("--out-dir %s", shQuote(output)), sprintf("--output-name %s", "out"), sprintf("--tmp-dir %s", shQuote(tempdir())), sprintf("--model-dir %s", shQuote(file.path(dirname(path), "Models"))), sprintf("--window %f", window), sprintf("--fragment-tolerance %f", fragment.tolerance), sprintf("--mixture-prob %f", mixture.prob), sprintf("--num-rounds %f", rounds) ) system3(normalizePath(path), args, wait = TRUE, stop.on.exit.code = TRUE) readBlock = function(lines) { list( header = strsplit(lines[1], "\t", fixed = TRUE)[[1L]], data = read.table(text = paste(lines, collapse = "\n"), sep = "\t", skip = 1) ) } res = list() fns = list.files(file.path(output, "clust"), pattern = "\\.clust$", full.names = TRUE) for (fn in fns) { lines = readLines(fn) grp = cumsum(!nzchar(lines)) keep = nzchar(lines) lines = unname(split(lines[keep], grp[keep])) res = c(res, lapply(lines, readBlock)) } if(length(input) == 1) { clust = lapply(res, function(x) x$data$V3)} if(length(input) == 2) { clust = lapply(res, function(x) c(0,n1)[(x$data$V2 + 1)] + x$data$V3)} if(length(input) == 3) { clust = lapply(res, function(x) c(0,n1,n1+n2)[(x$data$V2 + 1)] + x$data$V3)} labels = rep(NA, n1 + n2 + n3) for(i in 1:length(clust)){ labels[clust[[i]]] = i } return(labels) } ## End of MS-Cluster ########################################################## ############################################################################### ## PRIDE Cluster ############################################################## ############################################################################### # pridecluster is an R wrapper (BBmisc::system3) spectra-cluster-cli, # a stand-alone Java application of the algorithm. # Griss, et al. Recognizing millions of consistently unidentified spectra across # hundreds of shotgun proteomics datasets. Nat. Methods 2016, 13, 651-656. # # Input: ## input - character vector (max length 2), file to MS/MS run (mgf-file) ## fragment.tolerance - positive real number, -fragment_tolerance, fragment ion ## tolerance in m/z ro use for fragment peak matching ## precursor.tolerance - positive real number, -precursor_tolerance , precursor ## tolerance (clustering window size) in m/z used during matching ## rounds - natural number, -rounds , number of clustering rounds to use ## threshold.end - -threshold_end , (lowest) final clustering threshold ## threshold.start - -threshold_start , (highest) starting threshold ## path - path to spectra-cluster-cli-1.0.1.jar ## n1, n2, n3 - number of spectra of first/second/third MS/MS run # Output: ## list with entries clu and consensus, ### clu is a vector, ith element represents cluster id of ith element ### consensus is a list of consensus spectra. prideclusternew = function(input, fragment.tolerance, precursor.tolerance, rounds, threshold.end, threshold.start, path = "~/PRIDE/spectra-cluster-cli-1.0.1-bin/spectra-cluster-cli-1.0.1.jar", n1 = 0, n2 = 0, n3 = 0) { library(BBmisc) library(checkmate) name = unlist(strsplit(input, ".mgf")) name = strsplit(name, "/") name = unlist(lapply(name, function(x) x[length(name[[1]])])) assertFile(input) assertFile(path) assertNumber(fragment.tolerance, lower = 0) assertNumber(precursor.tolerance, lower = 0) assertNumber(rounds, lower = 1) assertNumber(threshold.end, lower = 0, upper = 1) assertNumber(threshold.start, lower = 0, upper = 1) path = normalizePath(path) input = normalizePath(input) # create output directory output = tempfile("output") dir.create(output) args = c( sprintf("-jar %s", path), sprintf("-output_path %s", shQuote(file.path(output, "result.clustering"))), sprintf("-fragment_tolerance %f", fragment.tolerance), sprintf("-precursor_tolerance %f", precursor.tolerance), sprintf("-rounds %s", rounds), sprintf("-threshold_end %f", threshold.end), sprintf("-threshold_start %f", threshold.start), sprintf(paste0(input, collapse = " ")) ) system3("java", args, wait = TRUE, stop.on.exit.code = TRUE) dat = readLines(file.path(output, "result.clustering")) clu = which(dat == "=Cluster=") n = length(clu) res = vector("list", n) res2 = vector("list", n) for(i in 1:(n - 1)){ clui = dat[clu[i]:(clu[i+1]-1)] speci = clui[which(grepl("SPEC", clui))] mzi = clui[which(grepl("consensus_mz=", clui))] mzi = as.numeric(unlist(strsplit(strsplit(mzi, "consensus_mz=")[[1]][2],","))) inti = clui[which(grepl("consensus_intens=", clui))] inti = as.numeric(unlist(strsplit(strsplit(inti, "consensus_intens=")[[1]][2],","))) res[[i]] = sapply(speci, function(x){ if(grepl(name[1], x)) res = as.numeric(strsplit(strsplit(x, "index=")[[1]][2],"#")[[1]][1]) if(length(name) == 2){ if(grepl(name[2], x)) res = n1 + as.numeric(strsplit(strsplit(x, "index=")[[1]][2],"#")[[1]][1]) } if(length(name) == 3){ if(grepl(name[2], x)) res = n1 + as.numeric(strsplit(strsplit(x, "index=")[[1]][2],"#")[[1]][1]) if(grepl(name[3], x)) res = n1 + n2 + as.numeric(strsplit(strsplit(x, "index=")[[1]][2],"#")[[1]][1]) } return(res) }, USE.NAMES = F) res2[[i]] = list(mass = mzi, int = inti) } # nth cluster clui = dat[clu[n]:length(dat)] speci = clui[which(grepl("SPEC", clui))] mzi = clui[which(grepl("consensus_mz=", clui))] mzi = as.numeric(unlist(strsplit(strsplit(mzi, "consensus_mz=")[[1]][2],","))) inti = clui[which(grepl("consensus_intens=", clui))] inti = as.numeric(unlist(strsplit(strsplit(inti, "consensus_intens=")[[1]][2],","))) res[[n]] = sapply(speci, function(x){ if(grepl(name[1], x)) res = as.numeric(strsplit(strsplit(x, "index=")[[1]][2],"#")[[1]][1]) if(length(name) == 2){ if(grepl(name[2], x)) res = n1 + as.numeric(strsplit(strsplit(x, "index=")[[1]][2],"#")[[1]][1]) } if(length(name) == 3){ if(grepl(name[2], x)) res = n1 + as.numeric(strsplit(strsplit(x, "index=")[[1]][2],"#")[[1]][1]) if(grepl(name[3], x)) res = n1 + n2 + as.numeric(strsplit(strsplit(x, "index=")[[1]][2],"#")[[1]][1]) } return(res) }, USE.NAMES = F) res2[[n]] = list(mass = mzi, int = inti) clu = rep(NA, n1 + n2 + n3) for(i in 1:length(res)){ clu[res[[i]]] = i } list(clu = clu, consensus = res2) } ## End of PRIDE Cluster ####################################################### ############################################################################### ############################################################################### ## Measures for evaluation of clusterings ##################################### ############################################################################### ## Adjusted Rand index (ARI) ################################################## ############################################################################### # # adjustedRandIndex calculates the Adjusted Rand Index # # Input: ## cluA - vector, ith element represents cluster id of ith element (clust. A) ## cluB - vector, ith element represents cluster id of ith element (clust. B) # Output: ## numeric, Adjusted Rand Index: Transformation of Rand Index, the fraction of ## pairs of clustered objects being classified both same or both different adjustedRandIndex = function(cluA, cluB) { tab = comembership_table(cluA, cluB) a = as.numeric(tab$n_11) b = as.numeric(tab$n_01) c = as.numeric(tab$n_10) d = choose(length(cluA), 2) - a - b - c x = 2 * (a * d - b * c) y = b ^ 2 + c ^ 2 + 2 * a * d + (a + d) * (c + b) ARI = x/y return(ARI) } ## End of Adjusted Rand index (ARI) ########################################### ############################################################################### ## Purity ##################################################################### ############################################################################### # # Purity calculates the Purity of ith cluster # Input ## clu - vector, ith element represents cluster id of ith element ## ann - vector, ith element represents peptide annotation of ith element ## j - cluster id # Output: ## numeric, Relative frequency of the most frequent annotation in ith cluster Purity = function(j, clu, ann){ tab = table(ann[which(clu == j)]) res = max(tab)/sum(tab) names(res) = paste0(names(which(tab == max(tab))), collapse = ", ") return(res) } ## End of Purity ############################################################## ############################################################################### ## Proportion of spectra remaining ############################################ ############################################################################### # # Spec_rem calculates the Proportion of spectra remaining # Input ## clu - vector, ith element represents cluster id of ith element # Output: ## numeric, Number of clusters relative to the number of spectra Spec_rem = function(clu){ res = length(table(clu))/length(clu) res } ## End of Proportion of spectra remaining ##################################### ############################################################################### ## Retainment of identified spectra ########################################## ############################################################################## # # Ret_id_spec calculates the Retainment of identified spectra # Input ## clu - vector, ith element represents cluster id of ith element ## ann - vector, ith element represents peptide annotation of ith element # Output: ## numeric, Number of different annotations regarding the most common ## annotation in each cluster relative to the total number of annotations Ret_id_spec = function(clu, ann){ clu = clu[which(!is.na(ann))] ann = ann[which(!is.na(ann))] ids = unique(clu) ann_com = vector("list", length(ids)) for(i in 1:length(ids)){ ann_com[[i]] = names(which.max(table(ann[which(clu == ids[i])]))) } res = length(unique(unlist(ann_com)))/length(unique(ann)) return(res) } ## End of Retainment of identified spectra #################################### ############################################################################### ## Proportion clustered spectra ############################################## ############################################################################## # Prop_clu_spec calculates the proportion of clustered spectra # Input ## clu - vector, ith element represents cluster id of ith element # Output: ## numeric, Proportion of spectra clustered with at least one other spectrum Prop_clu_spec = function(clu){ res = 1 - sum(table(clu) == 1)/length(clu) res } ## End of Proportion clustered spectra ######################################## ############################################################################### ## Proportion incorrectly clustered spectra ################################### ############################################################################### # Prop_inc_spec calculates the proportion of incorrectly clustered spectra # Input ## clu - vector, ith element represents cluster id of ith element ## ann - vector, ith element represents peptide annotation of ith element # Output: ## numeric, Proportion of spectra not identified as the most common annotation ## in the cluster Prop_inc_spec = function(clu, ann){ clu = clu[which(!is.na(ann))] ann = ann[which(!is.na(ann))] ids = unique(clu) inc_spec = numeric(length(ids)) for(i in 1:length(ids)){ tab = table(ann[which(clu == ids[i])]) inc_spec[i] = sum(tab) - max(tab) } res = sum(inc_spec)/length(ann) return(res) } ## End of Proportion incorrectly clustered spectra ############################# ################################################################################