#################################################################################
##                                                                             ##
##                      R-scripts for the manuscript                           ##
##                                                                             ##
##               Comparison and evaluation of clustering algorithms            ##
##                            for tandem mass spectra                          ##
##                                                                             ##
##                                                                             ##
##                                  by                                         ##
##                                                                             ##
##  Vera Rieder, Karin U. Schork, Laura Kerschke, Bernhard Blank-Landeshammer, ##
##  Albert Sickmann and Jörg Rahnenführer                                      ##
##                                                                             ##
#################################################################################

## load R packages

library(BBmisc)
library(checkmate)
library(clusteval)
library(igraph)
library(stringr)

## Calculate distance/similarity matrix

# calculateTotalSimilarity calculates the entire similarity vector
#
# Input:
## data    - dataset with MS/MS spectr (RData)
## binning - logical, Is binning to be performed? default TRUE 
## binsize - Parameter for binning, default 0.1
## topn    - logical, Reduce to top n intensities? default TRUE
## topn.n  - natural number, number of top intensities
# Output:
## similarity vector
## Warning! For larger datasets split calculation of cosine similarity
calculateTotalSimilarity = function(data, binning = TRUE, binsize = 0.1,
                                    topn = TRUE, topn.n = 100) {

  ## preproces spectra (binning, select top n, etc.)
  spec = preprocessSpectra(data = data, binning = binning, binsize = binsize, 
                           topn = topn, n = topn.n)
  n = length(spec)
  ## original index vector (of column)
  index1 = rep(1:(n - 1), (n - 1) : 1)
  ## original index vector (of rows)
  index2 = unlist(lapply(2 : n, function(k) k : n))
   ## calculate j-th subset of similarity vector
  s = vapply(1:(n * (n - 1)/2), function(k) calculateCosineSimilarity(spec[[index1[k]]], 
                                                           spec[[index2[k]]]), 
                                 double(1L))
  return(s)
}

# preprocessSpectra preprocesses spectra
#
# Input:
## data    - dataset with MS/MS spectra (RData)
## binning - logical, Is binning to be performed? default TRUE 
## binsize - Parameter for binning, default 0.1
## topn    - logical, Reduce to top n intensities? default TRUE
## n       - natural number, number of top intensities
# Output:
## list of preprocessed MS/MS spectra
preprocessSpectra = function(data, binning = TRUE, binsize = 0.1, 
                             topn = TRUE, n = 100L) {
  
  ## Index of MS/MS spectra in data (LC/MSMS run) 
  ix = data$ms2_index
  
  ## Extract masses and intensities for each relevant spectrum 
  spec = lapply(data$mzxml[ix], function(x) cbind(x$spectrum$mass, 
                                                  x$spectrum$intensity))
  
  ## If topn is TRUE, reduce each spectrum to top n intensities for each spectrum
  if (topn) {
    spec = lapply(spec, function(x) extractTopIntensities(x, n = n))
  }
  
  ## Aggregate each spectrum with similar masses 
  if(binning) {
    spec = lapply(spec, function(x) aggregateSpectrum(x, binsize = binsize))
  }
  
  return(spec)
}

# calculateCosineSimilarity calculates Cosine similarity of two MS/MS spectra
#
# Input:
## x - matrix with two columns, first and second column containing
##     masses and intensities of spectrum, respectively.
## y - matrix with two columns, first and second column containing
##     masses and intensities of spectrum, respectively.
# Output:
## real number between 0 and 1, cosine similarity (1 - cosine distance)
calculateCosineSimilarity = function(x, y) {
  ## matching of peaks with same masses
  intersec = compare_vek(x, y)
  
  if (is.null(intersec)) {
    return(0L)
  } else {
    z1 = intersec[,2] / sqrt(sum(x[,2]^2))
    z2 = intersec[,3] / sqrt(sum(y[,2]^2))
    return(sum(z1 * z2))
  }
}

# compare_vek finds peaks with same masses in two spectra
#
# Input:
## x - matrix with two columns, first and second column containing
##     masses and intensities of spectrum, respectively.
## y - matrix with two columns, first and second column containing
##     masses and intensities of spectrum, respectively.
# Output:
## matrix with three columns, first column containing masses in both
## spectra, second and third column containing belonging intensities
## of spectrum x and y, respectively.


compare_vek = function(x, y) {
  ## which masses in spectrum x occur also in spectrum y?
  x_in_y = x[, 1L] %in% y[, 1L]
  if (any(x_in_y)) {
    return(cbind(x[x_in_y, 1L], x[x_in_y, 2L], y[y[,1L] %in% x[, 1L], 2L]))
  } else {
    return(NULL)
  }
}

# extractTopIntensities extracts top n intensities per spectrum
# 
# Input:
## spec - matrix with two columns, first and second column containing
##        masses and intensities of spectrum, respectively.
## n    - natural number, number of peaks that shall be kept
#
# Output:
## matrix with two columns, first and second column containing
## masses and intensities of reduced spectrum, respectively.
extractTopIntensities = function(spec, n){
  
  ## If a spectrum contains maximum n masses, keep it.
  if (length(spec[,1]) <= n)
    return(spec)
  
  ## Else, keep only values belonging to n highest intensities.
  int.sort = rev(order(spec[, 2L]))
  int.sort = int.sort[seq_len(n)]
  
  spec = spec[int.sort,]
  return(spec[order(spec[,1]), ])
}


# aggregateSpectrum aggregates similar masses per spectrum (binning)
# 
# Input:
## spec    - matrix with two columns, first and second column containing
##           masses and intensities of spectrum, respectively.
## binsize - Parameter for binning
#
# Output:
## matrix with two columns, first and second column containing
## masses and intensities of binned spectrum, respectively. 
aggregateSpectrum = function(spec, binsize = 0.2) {
  
  mass = floor(spec[, 1L] / binsize) * binsize + binsize / 2L
  int = spec[, 2L]
  
  if (anyDuplicated(mass)) {
    int = as.vector(tapply(int, mass, max))
    mass = unique(mass)
  }
  
  return(matrix(c(mass, int), ncol = 2))
}

###############################################################################
## Different types of clustering algorithms

## CAST (Master thesis Laura Kerschke #########################################
###############################################################################

# Clustering Affinity Search Technique (CAST)
# 
# Input:
#   S     - Similarity vector (all elements must be between 0 and 1)
#   thres - Threshold, that seperates high / low affinities
#
# Output:
#   Vector, ith element represents cluster id of ith element 
cast = function(S, thres) {

  ## n: number of elements 
  ## length(S) = n x (n - 1) / 2)
  ## <=> n^2 - n - 2*length(S) = 0
  ## <=> n = 0.5 + sqrt(0.25 + 2 * length(S))
  n = as.integer(0.5 + sqrt(0.25 + 2*length(S)))
  
  ## initiate "closed" clusters
  clustering = list()
  ## Vector of not assigned elements
  U = seq_len(n)
  ## Cluster Counter
  i = 0L
  ## While not assigned elements exist, do the following ...
  while (length(U) != 0) {
    i = i + 1L
    ## create "empty" cluster
    C.open = NULL
    ## Set all affinities to Zero 
    a = rep(0, n)
    ## Set change-flag to TRUE
    changed = TRUE
    ## while any changes happen, repeat ADD and REMOVE
    while (changed) {
      changed = FALSE
      ## find element x of not assigned elements in U 
      ## that can be added to current 'open' cluster
      x = addToCluster(U, thres, C.open, a)
      ## if such an element exists ...
      if (!is.null(x)) {
        ## .. add it to 'open' cluster
        C.open = sort(c(C.open, x))
        ## .. and remove it from set of not assigned elements
        U = setdiff(U, x)
        ## 
        ## Update affinities
        ## (1) Determine all relevant elements (either in U or in C.open)
        ind = c(U, C.open)
        ## (2) Extract affinities belonging to x and all these elements
        sim.x = getSimilarities(S, x, n)
        ## (3) Update affinities of all relevant elements
        a[ind] = a[ind] + sim.x[ind]
        ## CAST added one element. Thus something has changed.
        changed = TRUE
      }
      
      ## find element y in current 'open' cluster C.open that must be removed
      y = removeFromCluster(thres, C.open, a)
      ## if such an element exists ... 
      if (!is.null(y)) {
        ## .. remove it from the 'open' cluster
        C.open = setdiff(C.open, y)
        ## .. and add it to set of not assigned elements
        U = sort(c(U, y))
        ## Update all relevant affinities (analogous to steps in ADD)
        ind = c(U, C.open)
        sim.y = getSimilarities(S, y, n)
        a[ind] = a[ind] - sim.y[ind]
        changed = TRUE
      }
    }
    ## if no changes occur, close the current 'open' cluster and save it
    clustering[[i]] = C.open
  }
  
  labels = rep(NA, length(unlist(clustering)))
  for(i in 1:length(clustering)){
    labels[clustering[[i]]] <- i
  }
  return(labels)
}


# addToCluster finds not assigned elements in U 
# that can be added to 'open' cluster C.open 
# Input:
#   U      - Vector of not assigned elements
#   thres  - threshold of affinities
#   C.open - Vector with indices of elements current belonging to 'open' cluster
#   a      - Vector of affinities
#
# Output:
#   Either NULL (if no element is found) or index of elements that can be added
addToCluster = function(U, thres, C.open, a) {
  ## if U is empty, CAST cannot choose elements
  if (length(U) == 0)
    return(NULL)
  ## if U contains at least one affine element
  if (max(a[U]) >= thres * length(C.open)) {
    ## choose those elements with maximum affinity 
    u = U[a[U] == max(a[U])]
    ## if several elements have maximum affinity, choose one element at random
    if (length(u) > 1)
      u = sample(u, 1)
    return(as.integer(u))
  } else {
    # if no affine element exists, none can be added
    return (NULL)
  }
}


# removeFromCluster finds element in current 'open' cluster C.open 
# that must be removed 
#
# Input:
#   thres  - threshold of affinities
#   C.open - Vector with indices of elements current belonging to 'open' cluster
#   a      - Vector of affinities
#
# Output:
#   Either NULL (if no element can be removed) or index of element 
#   that must be removed
removeFromCluster = function(thres, C.open, a) {
  ## if cluster is empty, CAST cannot remove elements
  if (length(C.open) == 0)
    return(NULL)
  ## if C.open contains at least one low affine element ... 
  if (min(a[C.open]) < thres * length(C.open)) {
    ## choose that element with minimum affinity 
    u = C.open[a[C.open] == min(a[C.open])]
    ## if several elements have minimum affinity, choose one element at random
    if (length(u) > 1)
      u = sample(u, 1)
    return(as.integer(u))
  } else {
    ## if no low affine element exists, none can be removed
    return (NULL)
  }
}


# getSimilarites extracts similarities between all elements and uth element
#
# Input:
#   S  - Similarity vector (of initial lower triangular matrix) 
#   u  - integer specifying the element for which the similarities must be extracted
#   n  - total number of elements in dataset
#
# Output:
#   numeric vector of length n containing all similarities between n elements and uth element
getSimilarities = function(S, u, n) {
  ## If u == 1, the first (n - 1) elements of the similarity vector
  if (u == 1)
    return(c(1L, S[seq_len(n - 1)]))
  ## For all further calculations the indices are needed that specify the location 
  ## where the last row of the similarity matrix is saved: (n - 1), (n - 1) + (n - 2), etc.
  last.row = cumsum(as.numeric((n - 1) : 1))
  ## If u == n, last.row contains the wanted similarities 
  if (u == n)
    return(c(S[last.row], 1L))
  ## in all other cases the values of the uth row (part1) or the uth column (part2) 
  ## of the triangular matrix must be extracted
  part1 = as.numeric(last.row[1:(u - 1)] - n + u)
  lr = last.row[u]
  part2 = as.numeric((lr - n + u + 1) : lr)
  return(c(S[part1], 1L, S[part2]))
}

## End of CAST ################################################################
###############################################################################

## DBSCAN (Master thesis Karin Schork) ########################################
###############################################################################

# Density Based Spatial Clustering of Applications with Noise (DBSCAN) 
# according to Pseudocode of Viswanath & Suresh Babu
#
# Input:
#   S     - Similarity vector (all elements must be between 0 and 1)
## eps    - positive real number
## minPts - natural number
## random - Shall the ordering of elements be randomized before clustering?
##          (Boundary points can depend on this ordering)
# Output:
## Vector, ith element represents cluster id of ith element 
DBSCAN = function(S, eps, minPts, random = FALSE) {
  n = length(S)
  N = calculateN(n)
  
  seen = logical(N)
  clust = numeric(N)
  cid = 0
  
  if (random) {
    ind = sample(1:N)
  } else {
    ind = 1:N
  }
  
  for (i in ind) {
    if (seen[i] != TRUE) {
      seen[i] = TRUE
      
      neighbours = which(getMatrixEntry(S, i, 1:N, N = N) >= 1 - eps)
      card = length(neighbours)
      
      if (card < minPts) {
        clust[i] = 0
      } else {
        cid = cid + 1
        
        queue = neighbours
        
        while (length(queue) > 0) {
          y = queue[1]
          queue = queue[-1]
          seen[y] = TRUE
          
          neighbourhood = which(getMatrixEntry(S, y, 1:N, N = N) >= 1 - eps)
          card = length(neighbourhood)
          
          if (card >= minPts) {
            for (x in neighbourhood) {
              if (seen[x] == FALSE || clust[x] == 0) {
                if (seen[x] == FALSE) {
                  queue = unique(c(queue, x))
                }
                clust[x] = cid
                seen[x] = TRUE
              }
            }
          }
          
          
        }
      }
    }
  }

  labels <- clust
  clmax <- max(clust)
  labels[clust == 0] <- (clmax + 1):(clmax + sum(clust == 0))
  return(labels)
}

# calculateN calculates from the length of the distance/similarity vector n 
# the number of elements N
#
# Input:
## n - length of distance/similarity vector
# Output:
## Number of elements
calculateN = function(n) {
  # n = ength of distance/similarity vector
  # N = Number of elements
  # n = (N choose 2)
  # => n = N * (N - 1) / 2
  # => n = (N^2 - N) / 2
  # => 0 = 1/2 N^2 - 1/2 N - n
  # => 0 = N^2 - N - 2*n
  # Solve by pq-formula with p = -1 and q = -2*n
  N = pq(p = -1, q = -2 * n)
  # number of elements must be greater than 0
  N = N[N > 0]
  if (length(N) != 1) stop("Error: N cannot be calculated")
  return(N)
}

# pq is an implementation of pq-formula
#
# Input:
## p - real number
## q - real number
# Output:
## Maximum two solutions of equation x^2 + px + q = 0. 
pq = function(p, q) {
  if ((p/2) ^ 2 - q > 0) {
    n_1 = -(p/2) - sqrt((p/2) ^ 2 - q)
    n_2 = -(p/2) + sqrt((p/2) ^ 2 - q)
  } else {
    n_1 = n_2 = NA
  }
  return(c(n_1, n_2))
}

# getMatrixEntry gives the entries of distance/similarity vector 
# of element i and several elements in j
# 
# Input:
## M    - Distance/similarity vector
## i    - one index
## j    - several indices
## N    - number of elements
## dist - Is M a distance vector?
##  (Default FALSE: similarity vector) 
# Output:
## Vector of similarities or distances of element i to all elements in j 
getMatrixEntry = function(M, i, j, N = NULL, dist = FALSE) {
  # Indices smaller than i
  if (min(j) < i) {
    x1 = j[j < i]
    l = N * (x1 - 1) - x1 * (x1 - 1)/2 + i - x1
  } else {
    l = NULL
  }
  
  # Indices greater than i
  if (max(j) > i) {
    x2 = j[j > i]
    u = N*(i - 1) - i*(i - 1) / 2 + x2 - i
  } else {
    u = NULL
  }
  
  # Index i
  if (i %in% j) {
    if (dist == TRUE) {
      m = 0
    } else {
      m = 1
    }
  } else {
    m = NULL
  }
  
  return(c(M[l], m, M[u]))
}

## End of DBSCAN ##############################################################
###############################################################################

## hclust #####################################################################
###############################################################################
# hclust2 calculates hierarchical Complete Linkage Clustering.
# 
# Input: 
## S - Similarity vector
## h - positive real number
# Output:
## Vector, ith element represents cluster id of ith element 
hclust2 = function(S, h) {
  
  n = length(S)
  N = calculateN(n)
  
  # Calculate number of neighbors for each element.
  nr.neigh = countNeighbours(S, N, h) 
  
  # elements with more than one neighbor
  ind = which(nr.neigh != 1)
  rm(nr.neigh)
  gc()
  
  # Filter similarities of elements with more than one neighbor
  M = length(ind)
  m = choose(M, 2)
  d = numeric(m)
  for (k in 1:(M - 1)) {
    ind2 = getMatrixIndex(N = M, i = k, j = (k + 1):M)
    d[ind2] = getMatrixEntry(S, i = ind[k], j = ind[(k + 1):M], N = N, dist = FALSE)
  }
  
  rm(S); gc()
  
  d = 1 - d
  
  gc()
  
  # Transformation into dist object
  D = structure(d, Size = length(ind), Diag = FALSE, Upper = FALSE, class = "dist")
  
  rm(d); gc()
  
  # Call function hclust and cut dendrogram at level h.
  cl = hclust(D, method = "complete")
  clust_ind = cutree(cl, h = h)
  
  # Elements only with itself as neighbor would be singletons in hclust and get ID 0.
  clust = integer(N)
  clust[ind] = clust_ind
  labels <- clust
  clmax <- max(clust)
  labels[clust == 0] <- (clmax + 1):(clmax + sum(clust == 0))
  return(labels)
  
}

# countNeighbours counts the number of neighbors of radius h for each element
#
# Input: 
## S - similarity vector
## N - Number of elements
## h - maximum distance between two neighbors
# Output:
## Vector of length N with number of neighbors of radius h for each element. 
## Each element is his own neighbor.
countNeighbours = function(S, N, h) {
  nr.neigh = integer(N)
  
  for (i in 1:(N - 1)) {
    # only look at similarities with i <= j 
    neigh = which(getMatrixEntry(S, i, (i + 1):N, N = N, dist = FALSE) >= 1 - h)
    neigh = neigh + i
    # neigh contains neighbors of i with index greater i.
    # To the current number of neibhbors for i the length of neigh and 
    # 1 (each element is his own neighbor) is added.
    nr.neigh[i] = nr.neigh[i] + length(neigh) + 1
    # For all elements in neigh the number is increased by 1 
    # since i is a neighbor of them.
    nr.neigh[neigh] = nr.neigh[neigh] + 1
  }
  nr.neigh[N] = nr.neigh[N] + 1
  return(nr.neigh)
}

# getMatrixIndex outputs indices where the similarity between ith and jth element 
# stands in the similarity vector
#
# Input: 
## N - number of elements
## i - one index
## j - several indices
# Output:
## Vector of indices where the similarities between ith element and jth element stand.
getMatrixIndex = function(N, i, j) {
  if (min(j) < i) {
    x1 = j[j < i]
    l = N * (x1 - 1) - x1 * (x1 - 1)/2 + i - x1
  } else {
    l = NULL
  }
  
  if (max(j) > i) {
    x2 = j[j > i]
    u = N*(i - 1) - i*(i - 1) / 2 + x2 - i
  } else {
    u = NULL
  }
  return(c(l, u))
}


## End of hclust ##############################################################
###############################################################################

## igraph #####################################################################
###############################################################################

# igraph calculates clustering by each connected subgraph
# 
# Input: 
## S - Similarity vector
## csim - positive real number
# Output:
## Vector, ith element represents cluster id of ith element 

igraph = function(S, csim = 0.7){
  
  k = which(S > csim)
  n = as.integer(0.5 + sqrt(0.25 + 2*length(S))) 
  
  if (length(k) == 0) {
    res = 1:n
  } else {
    fromto = sapply(k, function(x) k2ij(x, n))
    nodes = paste0("A", 1:n)
    links = data.frame(from = nodes[fromto[1,]], to = nodes[fromto[2,]])
    net = graph_from_data_frame(d = links, vertices = nodes, directed = F) 
    cluster = igraph::clusters(net)
    res = cluster$membership
  } 
  
  names(res) = NULL
  return(res)
}

# k2ij calculates row and column index in matrix given vector
# 
# Input: 
## k -
## n - length
# Output:
## Vector of length 2, first row i and second column j.

k2ij = function(k, n){
  x = 1:(n-1)
  j = min(which(k <= x*(n - x/2 - 0.5)))
  if(j == 1) {i = k + 1}
  else { i = k - n*j + n + j^2/2 - j/2 + j}
  return(c(i, j))
}


## End of igraph ##############################################################
###############################################################################

## N-Cluster (Master thesis of Karin Schork ###################################
###############################################################################

# neighbourClust calculates Neighbor Clustering
# 
# Input:
## S - similarity vector
## c - positive real number
# Output:
## Vector, ith element represents cluster id of ith element 
neighbourClust = function(S, c) {
  n = length(S)
  N = calculateN(n)
  
  # Calculate the number of neighbors for each element
  nr.neigh = countNeighbours(S, N, c)
  
  # Elements with only one neighbor are singletons (clust = 0)
  clust = rep(NA, N)
  clust[nr.neigh == 1] = 0
  
  # elements with more than one neighbor
  # ID: elements that are in the race
  ID = which(nr.neigh != 1)
  
  clustID = 1
  while (length(ID) > 0) {
    # element with most neighbors
    ID.max = ID[which.max(nr.neigh[ID])]
    # neighbors of that element
    ID.neigh = ID[which(getMatrixEntry(S, ID.max, ID, N = N, dist = FALSE) >= 1 - c)]
    # Assign all neighbors of that element to a Cluster 
    clust[ID.neigh] = clustID
    
    # Reduce nr.neigh for all neighbors of ID.neigh in ID
    for (j in ID.neigh) {
      neigh = ID[which(getMatrixEntry(S, j, ID, N = N, dist = FALSE) >= 1 - c)]
      nr.neigh[neigh] = nr.neigh[neigh] - 1
    }
    
    # ID: elements, not assigned to a cluster
    ID = which(is.na(clust))
    
    clustID = clustID + 1
  }
  
  labels = clust
  clmax = max(clust)
  labels[clust == 0] <- (clmax + 1):(clmax + sum(clust == 0))
  return(labels)
}

## End of N-Cluster ###########################################################
###############################################################################

## Warning! MS-Cluster and PRIDE Cluster only executable via ?command line?

## MS-Cluster #################################################################
###############################################################################

# mscluster is an R wrapper (BBmisc::system3) for MS-Cluster v2 algorithm,
# an executable that can be run from the command line.
# Frank et al. Spectral archives: extending spectral libraries to analyze both 
# identified and unidentified spectra. Nat. Methods 2011, 8, 587-591.
# 
# Input:
## input              - character vector (max length 2), file to MS/MS runs (mgf-files)
## similarity         - similarity in [0,1], threshold in final round
## window             - real number, --window <X>, default X=2.0 Da. 
##                      window width for clustering
## fragment.tolerance - real number, --fragment-tolerance <X>, the tolerance in 
##                      Da for fragment peaks (default X=0.34 Da.)
## mixture.prob       - probability, --mixture-prob <X>, the probability wrongfully 
##                      adding a spectrum to cluster (default X=0.05).
## rounds             - natural number, --num-rounds<X>,  number of rounds.
## path               - path to MSCluster_bin.
## n1,n2,n3           - number of spectra of first/second/third MS/MS run
# Output:
## Vector, ith element represents cluster id of ith element 

msclusternew = function(input, similarity, window, fragment.tolerance, mixture.prob, rounds, 
                        path = "~/MSCluster.20110327/MSCluster_bin",
                        n1 = 0, n2 = 0, n3 = 0) {
  
  library(BBmisc)
  library(checkmate)
  
  assertFile(input)
  assertNumber(similarity, lower = 0, upper = 1)
  assertNumber(window)
  assertNumber(fragment.tolerance, lower = 0)
  assertNumber(mixture.prob, lower = 0, upper = 1)
  assertFile(path)
  assertNumber(rounds)
  
  
  # create input list 
  input = normalizePath(input)
  input.list = tempfile("input", fileext = ".txt")
  writeLines(input, con = input.list)
  
  # create output directory
  output = tempfile("output")
  dir.create(output)
  
  # model dir
  model.dir = tempfile("models")
  dir.create(model.dir)
  
  args = c(
    sprintf("--list %s", shQuote(input.list)),
    sprintf("--similarity %f", similarity),
    sprintf("--out-dir %s", shQuote(output)),
    sprintf("--output-name %s", "out"),
    sprintf("--tmp-dir %s", shQuote(tempdir())),
    sprintf("--model-dir %s", shQuote(file.path(dirname(path), "Models"))),
    sprintf("--window %f", window),
    sprintf("--fragment-tolerance %f", fragment.tolerance),
    sprintf("--mixture-prob %f", mixture.prob),
    sprintf("--num-rounds %f", rounds)
  )
  
  system3(normalizePath(path), args, wait = TRUE, stop.on.exit.code = TRUE)
  
  readBlock = function(lines) {
    list(
      header = strsplit(lines[1], "\t", fixed = TRUE)[[1L]],
      data = read.table(text = paste(lines, collapse = "\n"), sep = "\t", 
                        skip = 1)
    )
  }
  
  res = list()
  fns = list.files(file.path(output, "clust"), pattern = "\\.clust$", 
                   full.names = TRUE)
  for (fn in fns) {
    lines = readLines(fn)
    grp = cumsum(!nzchar(lines))
    keep = nzchar(lines)
    lines = unname(split(lines[keep], grp[keep]))
    
    res = c(res, lapply(lines, readBlock))
  }
  
  if(length(input) == 1) {
    clust = lapply(res, function(x) x$data$V3)}
  if(length(input) == 2) {
    clust = lapply(res, function(x) c(0,n1)[(x$data$V2 + 1)] + x$data$V3)}
  if(length(input) == 3) {
    clust = lapply(res, function(x) c(0,n1,n1+n2)[(x$data$V2 + 1)] + x$data$V3)}
  
  labels = rep(NA, n1 + n2 + n3)
  
  for(i in 1:length(clust)){
    labels[clust[[i]]] = i
  }
  
  return(labels)
}

## End of MS-Cluster ##########################################################
###############################################################################

## PRIDE Cluster ##############################################################
###############################################################################

# pridecluster is an R wrapper (BBmisc::system3) spectra-cluster-cli,
# a stand-alone Java application of the algorithm.
# Griss, et al. Recognizing millions of consistently unidentified spectra across 
# hundreds of shotgun proteomics datasets. Nat. Methods 2016, 13, 651-656.
# 
# Input:
## input               - character vector (max length 2), file to MS/MS run (mgf-file)
## fragment.tolerance  - positive real number, -fragment_tolerance, fragment ion 
##                       tolerance in m/z ro use for fragment peak matching 
## precursor.tolerance - positive real number, -precursor_tolerance <arg>, precursor 
##                       tolerance (clustering window size) in m/z used during matching
## rounds              - natural number, -rounds <arg>, number of clustering rounds to use
## threshold.end       - -threshold_end <arg>, (lowest) final clustering threshold
## threshold.start     - -threshold_start <arg>, (highest) starting threshold
## path                - path to spectra-cluster-cli-1.0.1.jar
## n1, n2, n3          - number of spectra of first/second/third MS/MS run
# Output:
## list with entries clu and consensus, 
### clu is a vector, ith element represents cluster id of ith element
### consensus is a list of consensus spectra.

prideclusternew = function(input, fragment.tolerance, precursor.tolerance, rounds, 
                           threshold.end, threshold.start, 
                           path = "~/PRIDE/spectra-cluster-cli-1.0.1-bin/spectra-cluster-cli-1.0.1.jar",
                           n1 = 0, n2 = 0, n3 = 0) {
  library(BBmisc)
  library(checkmate)
  name = unlist(strsplit(input, ".mgf"))
  name = strsplit(name, "/")
  name = unlist(lapply(name, function(x) x[length(name[[1]])]))
  assertFile(input) 
  assertFile(path)
  assertNumber(fragment.tolerance, lower = 0)
  assertNumber(precursor.tolerance, lower = 0)
  assertNumber(rounds, lower = 1)
  assertNumber(threshold.end, lower = 0, upper = 1)
  assertNumber(threshold.start, lower = 0, upper = 1)
  
  
  path = normalizePath(path)
  input = normalizePath(input)
  
  # create output directory
  output = tempfile("output")
  dir.create(output)
  
  args = c(
    sprintf("-jar %s", path),
    sprintf("-output_path %s", shQuote(file.path(output, "result.clustering"))),
    sprintf("-fragment_tolerance %f", fragment.tolerance),
    sprintf("-precursor_tolerance %f", precursor.tolerance),
    sprintf("-rounds %s", rounds),
    sprintf("-threshold_end %f", threshold.end),
    sprintf("-threshold_start %f", threshold.start),
    sprintf(paste0(input, collapse = " "))
  )
  
  
  system3("java", args, wait = TRUE, stop.on.exit.code = TRUE)
  
  dat = readLines(file.path(output, "result.clustering"))
  
  clu = which(dat == "=Cluster=")
  n = length(clu)
  
  res = vector("list", n)
  res2 = vector("list", n)
  for(i in 1:(n - 1)){
    clui = dat[clu[i]:(clu[i+1]-1)] 
    speci = clui[which(grepl("SPEC", clui))]
    mzi = clui[which(grepl("consensus_mz=", clui))]
    mzi = as.numeric(unlist(strsplit(strsplit(mzi, "consensus_mz=")[[1]][2],","))) 
    inti = clui[which(grepl("consensus_intens=", clui))]
    inti = as.numeric(unlist(strsplit(strsplit(inti, "consensus_intens=")[[1]][2],","))) 
    res[[i]] = sapply(speci, function(x){
      if(grepl(name[1], x)) res = as.numeric(strsplit(strsplit(x, "index=")[[1]][2],"#")[[1]][1])
      if(length(name) == 2){
        if(grepl(name[2], x)) res = n1 + as.numeric(strsplit(strsplit(x, "index=")[[1]][2],"#")[[1]][1])
      }
      if(length(name) == 3){
        if(grepl(name[2], x)) res = n1 + as.numeric(strsplit(strsplit(x, "index=")[[1]][2],"#")[[1]][1])
        if(grepl(name[3], x)) res = n1 + n2 + as.numeric(strsplit(strsplit(x, "index=")[[1]][2],"#")[[1]][1])
      }
      return(res)
    }, USE.NAMES = F)
    
    res2[[i]] = list(mass = mzi, int = inti)
  }
  # nth cluster
  
  clui = dat[clu[n]:length(dat)] 
  speci = clui[which(grepl("SPEC", clui))]
  mzi = clui[which(grepl("consensus_mz=", clui))]
  mzi = as.numeric(unlist(strsplit(strsplit(mzi, "consensus_mz=")[[1]][2],","))) 
  inti = clui[which(grepl("consensus_intens=", clui))]
  inti = as.numeric(unlist(strsplit(strsplit(inti, "consensus_intens=")[[1]][2],","))) 
  res[[n]] = sapply(speci, function(x){
    if(grepl(name[1], x)) res = as.numeric(strsplit(strsplit(x, "index=")[[1]][2],"#")[[1]][1]) 
    if(length(name) == 2){
      if(grepl(name[2], x)) res = n1 + as.numeric(strsplit(strsplit(x, "index=")[[1]][2],"#")[[1]][1])
    }
    if(length(name) == 3){
      if(grepl(name[2], x)) res = n1 + as.numeric(strsplit(strsplit(x, "index=")[[1]][2],"#")[[1]][1])
      if(grepl(name[3], x)) res = n1 + n2 + as.numeric(strsplit(strsplit(x, "index=")[[1]][2],"#")[[1]][1])
    }
    return(res)
  }, USE.NAMES = F)
  res2[[n]] = list(mass = mzi, int = inti)
  
  clu = rep(NA, n1 + n2 + n3)
  for(i in 1:length(res)){
    clu[res[[i]]] = i
  }
  
  list(clu = clu, consensus = res2)
  
}


## End of PRIDE Cluster #######################################################
###############################################################################


###############################################################################
## Measures for evaluation of clusterings #####################################
###############################################################################

## Adjusted Rand index (ARI) ##################################################
###############################################################################
#
# adjustedRandIndex calculates the Adjusted Rand Index
# 
# Input:
## cluA - vector, ith element represents cluster id of ith element (clust. A)
## cluB - vector, ith element represents cluster id of ith element (clust. B)
# Output:
## numeric, Adjusted Rand Index: Transformation of Rand Index, the fraction of 
## pairs of clustered objects being classified both same or both different

adjustedRandIndex = function(cluA, cluB) {
  tab = comembership_table(cluA, cluB)
  a = as.numeric(tab$n_11)
  b = as.numeric(tab$n_01)
  c = as.numeric(tab$n_10)
  d = choose(length(cluA), 2) - a - b - c
  
  x = 2 * (a * d - b * c)
  y = b ^ 2 + c ^ 2 + 2 * a * d + (a + d) * (c + b)
  ARI = x/y
  return(ARI)
}

## End of Adjusted Rand index (ARI) ###########################################
###############################################################################

## Purity #####################################################################
###############################################################################
#
# Purity calculates the Purity of ith cluster
# Input
## clu - vector, ith element represents cluster id of ith element 
## ann - vector, ith element represents peptide annotation of ith element
## j   - cluster id
# Output:
## numeric, Relative frequency of the most frequent annotation in ith cluster

Purity = function(j, clu, ann){
  
  tab = table(ann[which(clu == j)])
  res = max(tab)/sum(tab)
  names(res) = paste0(names(which(tab == max(tab))), collapse = ", ")
  
  return(res)
}

## End of Purity ##############################################################
###############################################################################

## Proportion of spectra remaining ############################################
###############################################################################
#
# Spec_rem calculates the Proportion of spectra remaining
# Input
## clu - vector, ith element represents cluster id of ith element 
# Output:
## numeric, Number of clusters relative to the number of spectra

Spec_rem = function(clu){
  res = length(table(clu))/length(clu)
  res
}

## End of Proportion of spectra remaining #####################################
###############################################################################

## Retainment of identified spectra ##########################################
##############################################################################
#
# Ret_id_spec calculates the Retainment of identified spectra
# Input
## clu - vector, ith element represents cluster id of ith element 
## ann - vector, ith element represents peptide annotation of ith element
# Output:
## numeric, Number of different annotations regarding the most common 
## annotation in each cluster relative to the total number of annotations

Ret_id_spec = function(clu, ann){
  clu = clu[which(!is.na(ann))]
  ann = ann[which(!is.na(ann))]
  ids = unique(clu)
  ann_com = vector("list", length(ids))
  for(i in 1:length(ids)){
    ann_com[[i]] = names(which.max(table(ann[which(clu == ids[i])])))
  }
  
  res = length(unique(unlist(ann_com)))/length(unique(ann))
  return(res)
}

## End of Retainment of identified spectra ####################################
###############################################################################


## Proportion clustered spectra ##############################################
##############################################################################
# Prop_clu_spec calculates the proportion of clustered spectra
# Input
## clu - vector, ith element represents cluster id of ith element 
# Output:
## numeric, Proportion of spectra clustered with at least one other spectrum

Prop_clu_spec = function(clu){
  res = 1 - sum(table(clu) == 1)/length(clu)
  res
}

## End of Proportion clustered spectra ########################################
###############################################################################

## Proportion incorrectly clustered spectra ###################################
###############################################################################
# Prop_inc_spec calculates the proportion of incorrectly clustered spectra
# Input
## clu - vector, ith element represents cluster id of ith element 
## ann - vector, ith element represents peptide annotation of ith element
# Output:
## numeric, Proportion of spectra not identified as the most common annotation 
## in the cluster

Prop_inc_spec = function(clu, ann){
  clu = clu[which(!is.na(ann))]
  ann = ann[which(!is.na(ann))]
  ids = unique(clu)
  inc_spec = numeric(length(ids))
  for(i in 1:length(ids)){
    tab = table(ann[which(clu == ids[i])])
    inc_spec[i] = sum(tab) - max(tab)
  }
  res = sum(inc_spec)/length(ann)
  return(res)
}

## End of Proportion incorrectly clustered spectra #############################
################################################################################