diff --git a/.project b/.project deleted file mode 100644 index a807d01..0000000 --- a/.project +++ /dev/null @@ -1,19 +0,0 @@ - - - ccaPP - - - - - - de.walware.statet.r.builders.RSupport - - - - - - de.walware.statet.base.StatetNature - de.walware.statet.r.RNature - de.walware.statet.r.RPkgNature - - diff --git a/DESCRIPTION b/DESCRIPTION index 91fbad3..c1fccdf 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: ccaPP Type: Package Title: (Robust) Canonical Correlation Analysis via Projection Pursuit -Version: 0.3.3 -Date: 2019-12-09 +Version: 0.4.0 +Date: 2021-10-14 Depends: R (>= 3.2.0), parallel, @@ -26,4 +26,4 @@ Author: Andreas Alfons [aut, cre], David Simcha [ctb] Maintainer: Andreas Alfons Encoding: UTF-8 -RoxygenNote: 6.1.1 +RoxygenNote: 7.1.2 diff --git a/NAMESPACE b/NAMESPACE index cec708f..596a150 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,8 +1,10 @@ # Generated by roxygen2: do not edit by hand S3method(print,cca) +S3method(print,cvFolds) S3method(print,maxCor) S3method(print,permTest) +S3method(print,sMaxCor) export(CCAgrid) export(CCAproj) export(ccaGrid) @@ -12,11 +14,13 @@ export(corM) export(corPearson) export(corQuadrant) export(corSpearman) +export(cvFolds) export(fastMAD) export(fastMedian) export(maxCorGrid) export(maxCorProj) export(permTest) +export(sMaxCorGrid) import(parallel) import(pcaPP) import(robustbase) diff --git a/NEWS b/NEWS index e524dc1..9c741b7 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,9 @@ +Changes in ccaPP version 0.4.0 + + + New function sMaxCorGrid() for sparse maximum correlation estimators. + + + Changes in ccaPP version 0.3.3 + C++ functions are now properly registered. 
diff --git a/R/cca.R b/R/cca.R index c791338..395e3fd 100644 --- a/R/cca.R +++ b/R/cca.R @@ -1,7 +1,7 @@ -# ------------------------------------ +# -------------------------------------- # Author: Andreas Alfons -# Erasmus University Rotterdam -# ------------------------------------ +# Erasmus Universiteit Rotterdam +# -------------------------------------- #' (Robust) CCA via alternating series of grid searches #' @@ -154,7 +154,33 @@ ccaGrid <- function(x, y, k = 1, cca } -## wrapper function for more compatibility with package pcaPP + +## @rdname ccaGrid +## @export + +#sccaGrid <- function(x, y, k = 1, lambda = 0, +# method = c("spearman", "kendall", "quadrant", "M", "pearson"), +# control = list(...), nIterations = 10, nAlternate = 10, nGrid = 25, +# select = NULL, tol = 1e-06, fallback = FALSE, seed = NULL, ...) { +# ## initializations +# matchedCall <- match.call() +# ## define list of control arguments for algorithm +# lambda <- rep(as.numeric(lambda), length.out=2) +# lambda[is.na(lambda)] <- formals()$lambda +# nIterations <- as.integer(nIterations) +# nAlternate <- as.integer(nAlternate) +# nGrid <- as.integer(nGrid) +# tol <- as.numeric(tol) +# ppControl <- list(lambda=lambda, nIterations=nIterations, +# nAlternate=nAlternate, nGrid=nGrid, select=select, tol=tol) +# ## call workhorse function +# cca <- ccaPP(x, y, k, method=method, corControl=control, algorithm="sparse", +# ppControl=ppControl, fallback=fallback, seed=seed) +# cca$call <- matchedCall +# cca +#} + + #' @rdname ccaGrid #' @export @@ -175,6 +201,24 @@ CCAgrid <- function(x, y, k = 1, } +## @rdname ccaGrid +## @export + +#sCCAgrid <- function(x, y, k = 1, lambda = 0, +# method = c("spearman", "kendall", "quadrant", "M", "pearson"), +# maxiter = 10, maxalter = 10, splitcircle = 25, select=NULL, +# zero.tol = 1e-06, fallback = FALSE, seed = NULL, ...) 
{ +# ## initializations +# matchedCall <- match.call() +# ## call ccaGrid() +# cca <- sccaGrid(x, y, k=k, lambda=lambda, method=method, nIterations=maxiter, +# nAlternate=maxalter, nGrid=splitcircle, select=select, tol=zero.tol, +# fallback=fallback, seed=seed, ...) +# cca$call <- matchedCall +# cca +#} + + #' (Robust) CCA via projections through the data points #' #' Perform canoncial correlation analysis via projection pursuit based on @@ -280,7 +324,7 @@ ccaProj <- function(x, y, k = 1, cca } -## wrapper function for more compatibility with package pcaPP + #' @rdname ccaProj #' @export diff --git a/R/cor.R b/R/cor.R index f7f3fdb..261dfa3 100644 --- a/R/cor.R +++ b/R/cor.R @@ -1,7 +1,7 @@ -# ---------------------- +# -------------------------------------- # Author: Andreas Alfons -# KU Leuven -# ---------------------- +# Erasmus Universiteit Rotterdam +# -------------------------------------- #' Fast implementations of (robust) correlation estimators #' diff --git a/R/cv.R b/R/cv.R new file mode 100644 index 0000000..353a5af --- /dev/null +++ b/R/cv.R @@ -0,0 +1,169 @@ +# -------------------------------------- +# Author: Andreas Alfons +# Erasmus Universiteit Rotterdam +# -------------------------------------- + +## perform cross-validation +cvMaxCor <- function(call, x, y, folds, corFun = corSpearman, corArgs = list(), + envir = parent.frame(), cl = NULL) { + # initializations + R <- folds$R + useParallel <- !is.null(cl) + # obtain list of predictions for all replications + if(R == 1) { + s <- getIndices(folds) + if(useParallel) xy <- pcvXY(s, call=call, x=x, y=y, envir=envir, cl=cl) + else xy <- cvXY(s, call=call, x=x, y=y, envir=envir) + corXY(xy, corFun=corFun, corArgs=corArgs) + } else { + cvFun <- function(r) { + s <- getIndices(folds, r) + xy <- cvXY(s, call=call, x=x, y=y, envir=envir) + corXY(xy, corFun=corFun, corArgs=corArgs) + } + if(useParallel) { + cv <- parLapply(cl, seq_len(R), cvFun) + cv <- do.call(cbind, cv) + } + else cv <- sapply(seq_len(R), 
cvFun) + apply(cv, 1, mean) + } +} + +# compute the weighting vectors for the training data and obtain linear +# combinations of the test data +rsXY <- function(i, call, x, y, envir) { + # plug training data into function call + call$x <- x[-i, , drop=FALSE] + call$y <- y[-i, , drop=FALSE] + # evaluate function call in supplied environment to make sure + # that other arguments are evaluated correctly + fit <- eval(call, envir) + # obtain linear combinations for test data + cbind(x=x[i, , drop=FALSE] %*% fit$a, y=y[i, , drop=FALSE] %*% fit$b) +} + +# compute linear combinations for each left-out block in one replication of +# cross validation +cvXY <- function(folds, call, x, y, envir) { + tmp <- lapply(folds, rsXY, call=call, x=x, y=y, envir=envir) + # instead of collecting the results from the folds in the original order + # of the observations, they are simply stacked on top of each other + do.call(rbind, tmp) +} + +# compute linear combinations for each left-out block in one replication of +# cross validation via parallel computing +pcvXY <- function(folds, call, x, y, envir, cl) { + # parLapply() already has an argument 'x', so don't use the argument names + # for the data or it will throw an error + tmp <- parLapply(cl, folds, rsXY, call, x, y, envir=envir) + # instead of collecting the results from the folds in the original order + # of the observations, they are simply stacked on top of each other + do.call(rbind, tmp) +} + +# compute the correlation between the linear combinations obtained via +# cross-validation +corXY <- function(xy, corFun = corSpearman, corArgs = list()) { + # number of linear combinations of each data set + p <- ncol(xy) / 2 + # split the data into the linear combinations of x and y, respectively, and + # compute the correlation between each pair of linear combinations + if(p == 1) doCall(corFun, xy[, 1], xy[, 2], args=corArgs) + else { + sapply(seq_len(p), function(j) { + doCall(corFun, xy[, j], xy[, p+j], args=corArgs) + }) + } +} + 
+ +#' Cross-validation folds +#' +#' Split observations or groups of observations into \eqn{K} folds to be used +#' for (repeated) \eqn{K}-fold cross-validation. \eqn{K} should thereby be +#' chosen such that all folds are of approximately equal size. +#' +#' @aliases print.cvFolds +#' +#' @param n an integer giving the number of observations to be split into +#' folds. This is ignored if \code{grouping} is supplied in order to split +#' groups of observations into folds. +#' @param K an integer giving the number of folds into which the observations +#' should be split (the default is five). Setting \code{K} equal to the number +#' of observations or groups yields leave-one-out cross-validation. +#' @param R an integer giving the number of replications for repeated +#' \eqn{K}-fold cross-validation. This is ignored for leave-one-out +#' cross-validation and other non-random splits of the data. +#' @param type a character string specifying the type of folds to be +#' generated. Possible values are \code{"random"} (the default), +#' \code{"consecutive"} or \code{"interleaved"}. +#' @param grouping a factor specifying groups of observations. If supplied, +#' the data are split according to the groups rather than individual +#' observations such that all observations within a group belong to the same +#' fold. +#' +#' @return An object of class \code{"cvFolds"} with the following components: +#' \item{n}{an integer giving the number of observations or groups.} +#' \item{K}{an integer giving the number of folds.} +#' \item{R}{an integer giving the number of replications.} +#' \item{subsets}{an integer matrix in which each column contains a permutation +#' of the indices of the observations or groups.} +#' \item{which}{an integer vector giving the fold for each permuted observation +#' or group.} +#' \item{grouping}{a list giving the indices of the observations belonging to +#' each group. 
This is only returned if a grouping factor has been supplied.} +#' +#' @author Andreas Alfons +#' +#' @seealso \code{\link{sMaxCorGrid}} +#' +#' @examples +#' set.seed(1234) # set seed for reproducibility +#' cvFolds(20, K = 5) +#' cvFolds(20, K = 5, R = 10) +#' +#' @keywords utilities +#' +#' @export + +cvFolds <- function(n, K = 5, R = 1, + type = c("random", "consecutive", "interleaved"), + grouping = NULL) { + # check arguments + n <- if(is.null(grouping)) round(rep(n, length.out=1)) else nlevels(grouping) + if(!isTRUE(n > 0)) stop("'n' must be positive") + if(!isTRUE(K <= n)) stop(sprintf("'K' must be smaller or equal to %d", n)) + if(K == n) type <- "leave-one-out" + else type <- match.arg(type) + # obtain CV folds + if(type == "random") { + # random K-fold splits with R replications + subsets <- replicate(R, sample.int(n)) + } else { + # leave-one-out CV or non-random splits, replication not meaningful + R <- 1 + subsets <- as.matrix(seq_len(n)) + } + which <- as.factor(rep(seq_len(K), length.out=n)) + if(type == "consecutive") which <- rep.int(seq_len(K), tabulate(which)) + # construct and return object + folds <- list(n=n, K=K, R=R, subsets=subsets, which=which) + if(!is.null(grouping)) folds$grouping <- split(seq_along(grouping), grouping) + class(folds) <- "cvFolds" + folds +} + +# retrieve list of indices for r-th replication +getIndices <- function(x, r = 1) { + # split permuted items according to the folds + subsets <- split(x$subsets[, r], x$which) + # in case of grouped data, the list contains the group indices in each CV + # fold, so the indices of the respective observations need to be extracted + if(!is.null(grouping <- x$grouping)) + subsets <- lapply(subsets, function(s) unlist(grouping[s], use.names=FALSE)) + # return list of indices for CV folds + names(subsets) <- NULL + subsets +} diff --git a/R/fastMAD.R b/R/fastMAD.R index c4fadb8..2a2fac5 100644 --- a/R/fastMAD.R +++ b/R/fastMAD.R @@ -1,7 +1,7 @@ -# ---------------------- +# 
-------------------------------------- # Author: Andreas Alfons -# KU Leuven -# ---------------------- +# Erasmus Universiteit Rotterdam +# -------------------------------------- #' Fast implementation of the median absolute deviation #' diff --git a/R/fastMedian.R b/R/fastMedian.R index 52f8a75..f3d29f0 100644 --- a/R/fastMedian.R +++ b/R/fastMedian.R @@ -1,7 +1,7 @@ -# ---------------------- +# -------------------------------------- # Author: Andreas Alfons -# KU Leuven -# ---------------------- +# Erasmus Universiteit Rotterdam +# -------------------------------------- #' Fast implementation of the median #' diff --git a/R/maxCor.R b/R/maxCor.R index 29c57b3..4f2ba3b 100644 --- a/R/maxCor.R +++ b/R/maxCor.R @@ -1,7 +1,7 @@ -# ------------------------------------ +# -------------------------------------- # Author: Andreas Alfons -# Erasmus University Rotterdam -# ------------------------------------ +# Erasmus Universiteit Rotterdam +# -------------------------------------- #' (Robust) maximum correlation via alternating series of grid searches #' @@ -37,9 +37,19 @@ #' absolute correlation with the respective other data set to ensure symmetry #' of the algorithm. #' +#' \code{sMaxCorGrid} enforces sparsity by adding \eqn{L_{1}}{L1} penalties on +#' the weighting vectors to the objective function (i.e., the correlation of +#' the resulting projections). The optimal combination of penalty parameters is +#' thereby determined via cross-validation over a grid of values. To increase +#' the computational performance of this cross-validation procedure, parallel +#' computing is implemented via package \pkg{parallel}. +#' #' @aliases print.maxCor #' #' @param x,y each can be a numeric vector, matrix or data frame. +#' @param lambdaX,lambdaY numeric vectors of non-negative values giving the +#' penalty parameters for the weighting vectors of \code{x} and \code{y}, +#' respectively. +#' @param method a character string specifying the correlation functional to #' maximize. 
Possible values are \code{"spearman"} for the Spearman #' correlation, \code{"kendall"} for the Kendall correlation, \code{"quadrant"} @@ -74,6 +84,19 @@ #' zero (e.g., dummy variables) are standardized via mean and standard #' deviation. Note that if the Pearson correlation is maximized, #' standardization is always done via mean and standard deviation. +#' @param K,R,type,grouping additional arguments for generating +#' cross-validation folds (see \code{\link{cvFolds}}). +#' @param folds an object of class \code{"cvFolds"} (as returned by +#' \code{\link{cvFolds}}) defining the folds of the data for cross-validation. +#' If supplied, this is preferred over the arguments for generating +#' cross-validation folds. +#' @param nCores a positive integer giving the number of processor cores to be +#' used for parallel computing in cross-validation (the default is 1 for no +#' parallelization). If this is set to \code{NA}, all available processor +#' cores are used. +#' @param cl a \pkg{parallel} cluster for parallel computing to be used in +#' cross-validation, as generated by \code{\link[parallel]{makeCluster}}. If +#' supplied, this is preferred over \code{nCores}. #' @param seed optional initial seed for the random number generator (see #' \code{\link{.Random.seed}}). 
This is only used if \code{select} specifies #' the numbers of variables of each data set to be randomly selected for @@ -93,6 +116,16 @@ #' standardization of \code{x}.} #' \item{scaleY}{a numeric vector giving the scale estimates used in #' standardization of \code{y}.} +#' \item{lambdaX}{a numeric giving the optimal penalty parameter for the +#' weighting vector of \code{x} (only \code{sMaxCorGrid}).} +#' \item{lambdaY}{a numeric giving the optimal penalty parameter for the +#' weighting vector of \code{y} (only \code{sMaxCorGrid}).} +#' \item{objective}{a numeric giving the value of the objective function +#' for the optimal combination of penalty parameters (only \code{sMaxCorGrid}).} +#' \item{cv}{a numeric matrix containing the (average) results from +#' cross-validation for each combination of penalty parameters (only +#' \code{sMaxCorGrid} and if a grid of values for the penalty parameters has +#' been supplied).} #' \item{call}{the matched function call.} #' #' @author Andreas Alfons @@ -106,8 +139,8 @@ #' Statistics}, \bold{45}(1), 71--79. #' #' A. Alfons, C. Croux and P. Filzmoser (2016) Robust maximum association -#' estimators. \emph{Journal of the American Statistical Association}. DOI -#' 10.1080/01621459.2016.1148609. In press. +#' estimators. \emph{Journal of the American Statistical Association}. +#' \bold{112}(517), 436--445. #' #' @examples #' data("diabetes") @@ -152,6 +185,41 @@ maxCorGrid <- function(x, y, } +#' @rdname maxCorGrid +#' @export + +sMaxCorGrid <- function(x, y, lambdaX = 0, lambdaY = 0, + method = c("spearman", "kendall", "quadrant", "M", "pearson"), + control = list(...), nIterations = 10, nAlternate = 10, + nGrid = 25, select = NULL, tol = 1e-06, + standardize = TRUE, fallback = FALSE, K = 5, R = 1, + type = c("random", "consecutive", "interleaved"), + grouping = NULL, folds = NULL, nCores = 1, cl = NULL, + seed = NULL, ...) 
{ + ## initializations + matchedCall <- match.call() + ## define list of control arguments for algorithm + lambdaX <- as.numeric(lambdaX) + lambdaX[is.na(lambdaX)] <- formals()$lambdaX + lambdaY <- as.numeric(lambdaY) + lambdaY[is.na(lambdaY)] <- formals()$lambdaY + nIterations <- as.integer(nIterations) + nAlternate <- as.integer(nAlternate) + nGrid <- as.integer(nGrid) + tol <- as.numeric(tol) + ppControl <- list(lambdaX=lambdaX, lambdaY=lambdaY, nIterations=nIterations, + nAlternate=nAlternate, nGrid=nGrid, select=select, tol=tol) + ## call workhorse function + maxCor <- sMaxCorPP(x, y, method=method, corControl=control, + algorithm="grid", ppControl=ppControl, + standardize=standardize, fallback=fallback, K=K, + R=R, type=type, grouping=grouping, folds=folds, + nCores=nCores, cl=cl, seed=seed) + maxCor$call <- matchedCall + maxCor +} + + #' (Robust) maximum correlation via projections through the data points #' #' Compute the maximum correlation between two data sets via projection pursuit @@ -246,7 +314,7 @@ maxCorProj <- function(x, y, } -## workhorse function +## workhorse function for maximum correlation maxCorPP <- function(x, y, ...) { ## call workhorse function for canonical correlation analysis maxCor <- ccaPP(x, y, forceConsistency=FALSE, ...) @@ -257,3 +325,103 @@ maxCorPP <- function(x, y, ...) { class(maxCor) <- "maxCor" maxCor } + + +## workhorse function for sparse maximum correlation +#' @import parallel +sMaxCorPP <- function(x, y, + method = c("spearman", "kendall", "quadrant", "M", "pearson"), + corControl, algorithm = "grid", ppControl, + standardize = TRUE, fallback = FALSE, folds = NULL, + nCores = 1, cl = NULL, seed = NULL, ...) 
{ + ## initializations + x <- as.matrix(x) + y <- as.matrix(y) + n <- nrow(x) + if(nrow(y) != n) stop("'x' and 'y' must have the same number of observations") + p <- ncol(x) + q <- ncol(y) + ## prepare the data and call C++ function + if(n == 0 || p == 0 || q == 0) { + # zero dimension + maxCor <- list(cor=NA, a=numeric(), b=numeric(), lambdaX=NA, + lambdaY=NA, objective=NA) + } else { + # get list of control arguments + method <- match.arg(method) + corControl <- getCorControl(method, corControl, forceConsistency=FALSE) + standardize <- isTRUE(standardize) + fallback <- isTRUE(fallback) + if(!is.null(seed)) set.seed(seed) + ppControl <- getPPControl(algorithm, ppControl, p=p, q=q) + # check grid of values for the penalty parameters + lambdaX <- ppControl$lambdaX + lambdaY <- ppControl$lambdaY + ppControl$lambdaX <- NULL + ppControl$lambdaY <- NULL + lambdaX <- if(p == 1) 0 else sort(unique(lambdaX)) + lambdaY <- if(q == 1) 0 else sort(unique(lambdaY)) + lambda <- as.matrix(expand.grid(lambdaX=lambdaX, lambdaY=lambdaY)) + ppControl$lambda <- lambda + # check whether cross-validation over grid of tuning parameters is necessary + useCV <- nrow(lambda) > 1 + if(useCV) { + if(is.null(folds)) folds <- cvFolds(n, ...) 
+ # set up parallel computing if requested + haveCl <- inherits(cl, "cluster") + if(haveCl) haveNCores <- FALSE + else { + if(is.na(nCores)) nCores <- detectCores() # use all available cores + if(!is.numeric(nCores) || is.infinite(nCores) || nCores < 1) { + nCores <- 1 # use default value + warning("invalid value of 'nCores'; using default value") + } else nCores <- as.integer(nCores) + R <- folds$R + nCores <- if(R == 1) min(nCores, folds$K) else min(nCores, R) + haveNCores <- nCores > 1 + } + # check whether parallel computing should be used + useParallel <- haveNCores || haveCl + # set up multicore or snow cluster if not supplied + if(haveNCores) { + if(.Platform$OS.type == "windows") { + cl <- makePSOCKcluster(rep.int("localhost", nCores)) + } else cl <- makeForkCluster(nCores) + on.exit(stopCluster(cl)) + } + # perform cross-validation over grid of tuning parameters + call <- call("sMaxCorPPFit", method=method, corControl=corControl, + algorithm=algorithm, ppControl=ppControl, + standardize=standardize, fallback=fallback) + call[[1]] <- sMaxCorPPFit # necessary to work with parallel computing + corFun <- switch(method, spearman=corSpearman, kendall=corKendall, + quadrant=corQuadrant, M=corM, pearson=corPearson) + cv <- cvMaxCor(call, x, y, folds=folds, corFun=corFun, + corArgs=corControl, cl=cl) + # determine the optimal tuning parameters + ppControl$lambda <- lambda[which.max(cv), , drop=FALSE] + } + # compute the final solution + maxCor <- sMaxCorPPFit(x, y, method=method, corControl=corControl, + algorithm=algorithm, ppControl=ppControl, + standardize=standardize, fallback=fallback) + maxCor$a <- drop(maxCor$a) + maxCor$b <- drop(maxCor$b) + if(useCV) maxCor$cv <- cbind(lambda, CV=cv) + } + ## assign class and return results + class(maxCor) <- c("sMaxCor", "maxCor") + maxCor +} + +## simple wrapper for calling the C++ function +sMaxCorPPFit <- function(x, y, + method = c("spearman", "kendall", "quadrant", "M", "pearson"), + corControl, algorithm = "grid", 
ppControl, + standardize = TRUE, fallback = FALSE) { + # call C++ function + maxCor <- .Call("R_sMaxCorPP", R_x=x, R_y=y, R_method=method, + R_corControl=corControl, R_algorithm=algorithm, + R_ppControl=ppControl, R_standardize=standardize, + R_fallback=fallback, PACKAGE="ccaPP") +} diff --git a/R/permTest.R b/R/permTest.R index 0ebd57b..f9a0ac8 100644 --- a/R/permTest.R +++ b/R/permTest.R @@ -1,7 +1,7 @@ -# ---------------------- +# -------------------------------------- # Author: Andreas Alfons -# KU Leuven -# ---------------------- +# Erasmus Universiteit Rotterdam +# -------------------------------------- #' (Robust) permutation test for no association #' diff --git a/R/print.R b/R/print.R index 840ea76..0e558e8 100644 --- a/R/print.R +++ b/R/print.R @@ -1,7 +1,7 @@ -# ---------------------- +# -------------------------------------- # Author: Andreas Alfons -# KU Leuven -# ---------------------- +# Erasmus Universiteit Rotterdam +# -------------------------------------- #' @export print.cca <- function(x, ...) { @@ -17,6 +17,29 @@ print.cca <- function(x, ...) { invisible(x) } +#' @export +print.cvFolds <- function(x, ...) { + # print general information + cvText <- if(x$n == x$K) "Leave-one-out CV" else sprintf("%d-fold CV", x$K) + if(x$R > 1) cvText <- paste(cvText, "with", x$R, "replications") + cat(paste("\n", cvText, ":", sep="")) + # print information on folds (add space between folds and subsets) + subsets <- x$subsets + if(x$R == 1) { + cn <- if("grouping" %in% names(x)) "Group index" else "Index" + nblanks <- 2 + } else { + cn <- as.character(seq_len(x$R)) + nblanks <- 3 + } + nblanks <- max(nchar(as.character(subsets[, 1]))-nchar(cn[1]), 0) + nblanks + cn[1] <- paste(c(rep.int(" ", nblanks), cn[1]), collapse="") + dimnames(subsets) <- list(Fold=x$which, cn) + print(subsets, ...) + # return object invisibly + invisible(x) +} + #' @export print.maxCor <- function(x, ...) { # print function call @@ -44,3 +67,19 @@ print.permTest <- function(x, ...) 
{ # return object invisibly invisible(x) } + +#' @export +print.sMaxCor <- function(x, ...) { + print.maxCor(x, ...) + # print optimal tuning parameters + if(!is.null(cv <- x$cv)) { + haveGridX <- any(cv[, "lambdaX"] != 0) + haveGridY <- any(cv[, "lambdaY"] != 0) + if(haveGridX && haveGridY) cat("\nOptimal tuning parameters:\n") + else cat("\nOptimal tuning parameter:\n") + if(haveGridX) cat(sprintf("lambdaX: %f\n", x$lambdaX)) + if(haveGridY) cat(sprintf("lambdaY: %f\n", x$lambdaY)) + } + # return object invisibly + invisible(x) +} diff --git a/R/utils.R b/R/utils.R index b300e20..a65f277 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,7 +1,7 @@ -# ---------------------- +# -------------------------------------- # Author: Andreas Alfons -# KU Leuven -# ---------------------- +# Erasmus Universiteit Rotterdam +# -------------------------------------- ### transform canonical vectors back to original scale #backtransform <- function(A, scale) { @@ -18,6 +18,17 @@ checkIndices <- function(indices, max) { indices[which(indices > 0 & indices <= max)] } +## call a function by either +# 1) simply evaluating a supplied function for the basic arguments if there are +# no additional arguments in list format +# 2) evaluating a supplied function with 'do.call' if there are additional +# arguments in list format +doCall <- function(fun, ..., args = list()) { + if(length(args) == 0) { + fun(...) 
+ } else do.call(fun, c(list(...), args)) +} + ## call C++ to compute ranks of observations in a vector (for testing) fastRank <- function(x) { x <- as.numeric(x) @@ -61,6 +72,49 @@ getCorControl <- function(method, control, forceConsistency = TRUE) { out } +## get list of control arguments for projection pursuit algorithm +getPPControl <- function(algorithm, control, p, q, seed = NULL) { + # additional checks for grid search algorithm + if(algorithm == "grid") { + # check subset of variables to be used for determining the order of + # the variables from the respective other data set + select <- control$select + control$select <- NULL + if(!is.null(select)) { + if(is.list(select)) { + # make sure select is a list with two index vectors and + # drop invalid indices from each vector + select <- rep(select, length.out=2) + select <- mapply(function(indices, max) { + indices <- as.integer(indices) + indices[which(indices > 0 & indices <= max)] - 1 + }, select, c(p, q)) + valid <- sapply(select, length) > 0 + # add the two index vectors to control object + if(all(valid)) { + control$selectX <- select[[1]] + control$selectY <- select[[2]] + } else select <- NULL + } else { + # check number of indices to sample + select <- rep(as.integer(select), length.out=2) + valid <- !is.na(select) & select > 0 & select < c(p, q) + if(all(valid)) { + # generate index vectors and add them to control object + if(!is.null(seed)) set.seed(seed) + control$selectX <- sample.int(p, select[1]) - 1 + control$selectY <- sample.int(q, select[2]) - 1 + } else select <- NULL + } + } + if(is.null(select)) { + control$selectX <- control$selectY <- integer() + } + } + # return list of control arguments + control +} + ## L1 median (for testing) l1Median <- function(x) { # initializations diff --git a/inst/CITATION b/inst/CITATION index aca49ca..13e23e2 100644 --- a/inst/CITATION +++ b/inst/CITATION @@ -3,7 +3,7 @@ citHeader("To cite package 'ccaPP' in publications use:") citEntry(entry = "Article", 
title = "Robust maximum association between data sets: The {R} package {ccaPP}", author = personList(as.person("Andreas Alfons"), - as.person("Christophe Croux"), + as.person("Christophe Croux"), as.person("Peter Filzmoser")), journal = "Austrian Journal of Statistics", year = "2016", @@ -20,16 +20,17 @@ citEntry(entry = "Article", citEntry(entry = "Article", title = "Robust maximum association estimators", author = personList(as.person("Andreas Alfons"), - as.person("Christophe Croux"), + as.person("Christophe Croux"), as.person("Peter Filzmoser")), journal = "Journal of the American Statistical Association", - year = "2016", - note = "In press", - doi = "10.1080/01621459.2016.1148609", + year = "2017", + volume = "112", + number = "517", + pages = "436--445", textVersion = - paste("Andreas Alfons, Christophe Croux, Peter Filzmoser (2016).", + paste("Andreas Alfons, Christophe Croux, Peter Filzmoser (2017).", "Robust maximum association estimators.", - "Journal of the American Statistical Association.", - "DOI 10.1080/01621459.2016.1148609. In press.") + "Journal of the American Statistical Association,", + "112(517), 436-445.") ) diff --git a/man/ccaGrid.Rd b/man/ccaGrid.Rd index 74b53fa..2e08759 100644 --- a/man/ccaGrid.Rd +++ b/man/ccaGrid.Rd @@ -6,15 +6,38 @@ \alias{CCAgrid} \title{(Robust) CCA via alternating series of grid searches} \usage{ -ccaGrid(x, y, k = 1, method = c("spearman", "kendall", "quadrant", "M", - "pearson"), control = list(...), nIterations = 10, nAlternate = 10, - nGrid = 25, select = NULL, tol = 1e-06, standardize = TRUE, - fallback = FALSE, seed = NULL, ...) - -CCAgrid(x, y, k = 1, method = c("spearman", "kendall", "quadrant", "M", - "pearson"), maxiter = 10, maxalter = 10, splitcircle = 25, - select = NULL, zero.tol = 1e-06, standardize = TRUE, - fallback = FALSE, seed = NULL, ...) 
+ccaGrid( + x, + y, + k = 1, + method = c("spearman", "kendall", "quadrant", "M", "pearson"), + control = list(...), + nIterations = 10, + nAlternate = 10, + nGrid = 25, + select = NULL, + tol = 1e-06, + standardize = TRUE, + fallback = FALSE, + seed = NULL, + ... +) + +CCAgrid( + x, + y, + k = 1, + method = c("spearman", "kendall", "quadrant", "M", "pearson"), + maxiter = 10, + maxalter = 10, + splitcircle = 25, + select = NULL, + zero.tol = 1e-06, + standardize = TRUE, + fallback = FALSE, + seed = NULL, + ... +) } \arguments{ \item{x, y}{each can be a numeric vector, matrix or data frame.} diff --git a/man/ccaPP-package.Rd b/man/ccaPP-package.Rd index ba3e53c..4dc30f2 100644 --- a/man/ccaPP-package.Rd +++ b/man/ccaPP-package.Rd @@ -20,12 +20,12 @@ The DESCRIPTION file: Maintainer: \packageMaintainer{ccaPP} } \references{ -A. Alfons, C. Croux and P. Filzmoser (2016) Robust maximum association between -data sets: The \R Package \pkg{ccaPP}. \emph{Austrian Journal of Statistics}, +A. Alfons, C. Croux and P. Filzmoser (2016) Robust maximum association between +data sets: The \R Package \pkg{ccaPP}. \emph{Austrian Journal of Statistics}, \bold{45}(1), 71--79. -A. Alfons, C. Croux and P. Filzmoser (2016) Robust maximum association -estimators. \emph{Journal of the American Statistical Association}. DOI -10.1080/01621459.2016.1148609. In press. +A. Alfons, C. Croux and P. Filzmoser (2017) Robust maximum association +estimators. \emph{Journal of the American Statistical Association}. +\bold{112}(517), 436--445. } \keyword{package} diff --git a/man/ccaProj.Rd b/man/ccaProj.Rd index 175119a..4df9768 100644 --- a/man/ccaProj.Rd +++ b/man/ccaProj.Rd @@ -5,13 +5,28 @@ \alias{CCAproj} \title{(Robust) CCA via projections through the data points} \usage{ -ccaProj(x, y, k = 1, method = c("spearman", "kendall", "quadrant", "M", - "pearson"), control = list(...), standardize = TRUE, - useL1Median = TRUE, fallback = FALSE, ...) 
+ccaProj( + x, + y, + k = 1, + method = c("spearman", "kendall", "quadrant", "M", "pearson"), + control = list(...), + standardize = TRUE, + useL1Median = TRUE, + fallback = FALSE, + ... +) -CCAproj(x, y, k = 1, method = c("spearman", "kendall", "quadrant", "M", - "pearson"), standardize = TRUE, useL1Median = TRUE, - fallback = FALSE, ...) +CCAproj( + x, + y, + k = 1, + method = c("spearman", "kendall", "quadrant", "M", "pearson"), + standardize = TRUE, + useL1Median = TRUE, + fallback = FALSE, + ... +) } \arguments{ \item{x, y}{each can be a numeric vector, matrix or data frame.} diff --git a/man/corFunctions.Rd b/man/corFunctions.Rd index 90c7191..0d118cf 100644 --- a/man/corFunctions.Rd +++ b/man/corFunctions.Rd @@ -17,8 +17,13 @@ corKendall(x, y, consistent = FALSE) corQuadrant(x, y, consistent = FALSE) -corM(x, y, prob = 0.9, initial = c("quadrant", "spearman", "kendall", - "pearson"), tol = 1e-06) +corM( + x, + y, + prob = 0.9, + initial = c("quadrant", "spearman", "kendall", "pearson"), + tol = 1e-06 +) } \arguments{ \item{x, y}{numeric vectors.} diff --git a/man/cvFolds.Rd b/man/cvFolds.Rd new file mode 100644 index 0000000..35fc925 --- /dev/null +++ b/man/cvFolds.Rd @@ -0,0 +1,67 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cv.R +\name{cvFolds} +\alias{cvFolds} +\alias{print.cvFolds} +\title{Cross-validation folds} +\usage{ +cvFolds( + n, + K = 5, + R = 1, + type = c("random", "consecutive", "interleaved"), + grouping = NULL +) +} +\arguments{ +\item{n}{an integer giving the number of observations to be split into +folds. This is ignored if \code{grouping} is supplied in order to split +groups of observations into folds.} + +\item{K}{an integer giving the number of folds into which the observations +should be split (the default is five). 
Setting \code{K} equal to the number +of observations or groups yields leave-one-out cross-validation.} + +\item{R}{an integer giving the number of replications for repeated +\eqn{K}-fold cross-validation. This is ignored for leave-one-out +cross-validation and other non-random splits of the data.} + +\item{type}{a character string specifying the type of folds to be +generated. Possible values are \code{"random"} (the default), +\code{"consecutive"} or \code{"interleaved"}.} + +\item{grouping}{a factor specifying groups of observations. If supplied, +the data are split according to the groups rather than individual +observations such that all observations within a group belong to the same +fold.} +} +\value{ +An object of class \code{"cvFolds"} with the following components: +\item{n}{an integer giving the number of observations or groups.} +\item{K}{an integer giving the number of folds.} +\item{R}{an integer giving the number of replications.} +\item{subsets}{an integer matrix in which each column contains a permutation +of the indices of the observations or groups.} +\item{which}{an integer vector giving the fold for each permuted observation +or group.} +\item{grouping}{a list giving the indices of the observations belonging to +each group. This is only returned if a grouping factor has been supplied.} +} +\description{ +Split observations or groups of observations into \eqn{K} folds to be used +for (repeated) \eqn{K}-fold cross-validation. \eqn{K} should thereby be +chosen such that all folds are of approximately equal size. 
+} +\examples{ +set.seed(1234) # set seed for reproducibility +cvFolds(20, K = 5) +cvFolds(20, K = 5, R = 10) + +} +\seealso{ +\code{\link{sMaxCorGrid}} +} +\author{ +Andreas Alfons +} +\keyword{utilities} diff --git a/man/maxCorGrid.Rd b/man/maxCorGrid.Rd index 8933c84..5c4b93d 100644 --- a/man/maxCorGrid.Rd +++ b/man/maxCorGrid.Rd @@ -3,12 +3,49 @@ \name{maxCorGrid} \alias{maxCorGrid} \alias{print.maxCor} +\alias{sMaxCorGrid} \title{(Robust) maximum correlation via alternating series of grid searches} \usage{ -maxCorGrid(x, y, method = c("spearman", "kendall", "quadrant", "M", - "pearson"), control = list(...), nIterations = 10, nAlternate = 10, - nGrid = 25, select = NULL, tol = 1e-06, standardize = TRUE, - fallback = FALSE, seed = NULL, ...) +maxCorGrid( + x, + y, + method = c("spearman", "kendall", "quadrant", "M", "pearson"), + control = list(...), + nIterations = 10, + nAlternate = 10, + nGrid = 25, + select = NULL, + tol = 1e-06, + standardize = TRUE, + fallback = FALSE, + seed = NULL, + ... +) + +sMaxCorGrid( + x, + y, + lambdaX = 0, + lambdaY = 0, + method = c("spearman", "kendall", "quadrant", "M", "pearson"), + control = list(...), + nIterations = 10, + nAlternate = 10, + nGrid = 25, + select = NULL, + tol = 1e-06, + standardize = TRUE, + fallback = FALSE, + K = 5, + R = 1, + type = c("random", "consecutive", "interleaved"), + grouping = NULL, + folds = NULL, + nCores = 1, + cl = NULL, + seed = NULL, + ... 
+) } \arguments{ \item{x, y}{each can be a numeric vector, matrix or data frame.} @@ -63,6 +100,27 @@ determining the order of the variables of the respective other data set.} \item{\dots}{additional arguments to be passed to the specified correlation functional.} + +\item{lambdaX, lambdaY}{numeric vectors of non-negative values giving the +penalty parameters for the weighting vectors of \code{x} and \code{y}, +respectively.} + +\item{K, R, type, grouping}{additional arguments for generating +cross-validation folds (see \code{\link{cvFolds}}).} + +\item{folds}{an object of class \code{"cvFolds"} (as returned by +\code{\link{cvFolds}}) defining the folds of the data for cross-validation. +If supplied, this is preferred over the arguments for generating +cross-validation folds.} + +\item{nCores}{a positive integer giving the number of processor cores to be +used for parallel computing in cross-validation (the default is 1 for no +parallelization). If this is set to \code{NA}, all available processor +cores are used.} + +\item{cl}{a \pkg{parallel} cluster for parallel computing to be used in +cross-validation, as generated by \code{\link[parallel]{makeCluster}}. 
If +supplied, this is preferred over \code{nCores}.} } \value{ An object of class \code{"maxCor"} with the following components: @@ -77,6 +135,16 @@ standardization of \code{y}.} standardization of \code{x}.} \item{scaleY}{a numeric vector giving the scale estimates used in standardization of \code{y}.} +\item{lambdaX}{a numeric giving the optimal penalty parameter for the +weighting vector of \code{x} (only \code{sMaxCorGrid}).} +\item{lambdaY}{a numeric giving the optimal penalty parameter for the +weighting vector of \code{y} (only \code{sMaxCorGrid}).} +\item{objective}{a numeric giving the value of the objective function +for the optimal combination of penalty parameters (only \code{sMaxCorGrid}).} +\item{cv}{a numeric matrix containing the (average) results from +cross-validation for each combination of penalty parameters (only +\code{sMaxCorGrid} and if a grid of values for the penalty parameters has +been supplied).} \item{call}{the matched function call.} } \description{ @@ -112,6 +180,13 @@ directly. Note that also the data sets are ordered according to the maximum average absolute correlation with the respective other data set to ensure symmetry of the algorithm. + +\code{sMaxCorGrid} enforces sparsity by adding \eqn{L_{1}}{L1} penalties on +the weighting vectors to the objective function (i.e., the correlation of +the resulting projections). The optimal combination of penalty parameters is +thereby determined via cross-validation over a grid of values. To increase +the computational performance of this cross-validation procedure, parallel +computing is implemented via package \pkg{parallel}. } \examples{ data("diabetes") @@ -132,8 +207,8 @@ between data sets: The \R Package \pkg{ccaPP}. \emph{Austrian Journal of Statistics}, \bold{45}(1), 71--79. A. Alfons, C. Croux and P. Filzmoser (2016) Robust maximum association -estimators. 
\emph{Journal of the American Statistical Association}. +\bold{112}(517), 436--445. } \seealso{ \code{\link{maxCorProj}}, \code{\link{ccaGrid}}, diff --git a/man/maxCorProj.Rd b/man/maxCorProj.Rd index d3de5c1..f6ae6ce 100644 --- a/man/maxCorProj.Rd +++ b/man/maxCorProj.Rd @@ -4,9 +4,16 @@ \alias{maxCorProj} \title{(Robust) maximum correlation via projections through the data points} \usage{ -maxCorProj(x, y, method = c("spearman", "kendall", "quadrant", "M", - "pearson"), control = list(...), standardize = TRUE, - useL1Median = TRUE, fallback = FALSE, ...) +maxCorProj( + x, + y, + method = c("spearman", "kendall", "quadrant", "M", "pearson"), + control = list(...), + standardize = TRUE, + useL1Median = TRUE, + fallback = FALSE, + ... +) } \arguments{ \item{x, y}{each can be a numeric vector, matrix or data frame.} diff --git a/man/permTest.Rd b/man/permTest.Rd index 3b7e771..c5bbb38 100644 --- a/man/permTest.Rd +++ b/man/permTest.Rd @@ -4,8 +4,17 @@ \alias{permTest} \title{(Robust) permutation test for no association} \usage{ -permTest(x, y, R = 1000, fun = maxCorGrid, permutations = NULL, - nCores = 1, cl = NULL, seed = NULL, ...) +permTest( + x, + y, + R = 1000, + fun = maxCorGrid, + permutations = NULL, + nCores = 1, + cl = NULL, + seed = NULL, + ... 
+) } \arguments{ \item{x, y}{each can be a numeric vector, matrix or data frame.} diff --git a/src/cca.cpp b/src/cca.cpp index 1658da5..31d5b68 100644 --- a/src/cca.cpp +++ b/src/cca.cpp @@ -1,6 +1,6 @@ /* * Author: Andreas Alfons - * Erasmus University Rotterdam + * Erasmus Universiteit Rotterdam */ #include @@ -173,7 +173,7 @@ double CorMControl::cor(const vec& x, const vec& y) { // for testing purposes template -double fastCor(const vec& x, const vec& y, CorControl control) { +double fastCor(const vec& x, const vec& y, CorControl& control) { return control.cor(x, y); } @@ -241,22 +241,24 @@ class GridControl { GridControl(List&); // get the order in which to update the elements of a weighting vector template - void findOrder(const mat&, const vec&, CorControl, uvec&, double&, vec&); + void findOrder(const mat&, const vec&, CorControl&, uvec&, double&, vec&); // get the order in which to update the elements of the weighting vectors template - void findOrder(const mat&, const mat&, CorControl, uvec&, uvec&, double&, + void findOrder(const mat&, const mat&, CorControl&, uvec&, uvec&, double&, vec&, vec&, bool&); // get equispaced grid of angles in a plane vec getGrid(const uword&); + // compute updated weighting vector in grid search + vec getVector(const vec&, const double&, const uword&); // grid search to update one weighting vector template - void gridSearch(const mat&, const uvec&, const vec&, CorControl, vec&, + void gridSearch(const mat&, const uvec&, const vec&, CorControl&, vec&, double&, vec&); // set counter of consecutive iterations without improvement void setCounter(uword&, const double&, const double&); // maximum correlation between multivariate data sets x and y template - double maxCor(const mat&, const mat&, CorControl, vec&, vec&); + double maxCor(const mat&, const mat&, CorControl&, vec&, vec&); }; // constructors @@ -295,7 +297,7 @@ inline GridControl::GridControl(List& control) { // maxCor ....... 
maximum correlation to get initial value // a ............ weighting vector to get initial value template -void GridControl::findOrder(const mat& x, const vec& y, CorControl corControl, +void GridControl::findOrder(const mat& x, const vec& y, CorControl& corControl, uvec& orderX, double& maxCor, vec& a) { // compute columnwise absolute correlations of x with y const uword p = x.n_cols; @@ -325,7 +327,7 @@ void GridControl::findOrder(const mat& x, const vec& y, CorControl corControl, // startWithX ... logical to be computed that indicates whether to start with // the first data set in the alternate grid searches template -void GridControl::findOrder(const mat& x, const mat& y, CorControl corControl, +void GridControl::findOrder(const mat& x, const mat& y, CorControl& corControl, uvec& orderX, uvec& orderY, double& maxCor, vec& a, vec& b, bool& startWithX) { const uword p = x.n_cols, q = y.n_cols; @@ -418,32 +420,39 @@ vec GridControl::getGrid(const uword& i) { return grid; } +// compute updated weighting vector in grid search +vec GridControl::getVector(const vec& a, const double& angle, const uword& j) { + // fast computation of cos(angle) * a + sin(angle) * ej + vec b = cos(angle) * a; + b(j) += sin(angle); + return b; +} + // grid search to update one weighting vector // x ............ data matrix for which to update the weighting vector +// orderX ......... order in which to browse through the variables // y ............ linear combination of the other data matrix according to the // other weighting vector, which is kept fixed // corControl ... control object to compute correlation -// grid ........ grid points to be used for grid search +// grid ......... grid points to be used for grid search // maxCor ....... maximum correlation to be updated // a ............ 
weighting vector to be updated template void GridControl::gridSearch(const mat& x, const uvec& orderX, const vec& y, - CorControl corControl, vec& grid, double& maxCor, vec& a) { + CorControl& corControl, vec& grid, double& maxCor, vec& a) { // initializations const uword p = x.n_cols, nGrid = grid.n_elem; // perform grid searches for each canonical basis vector for(uword j = 0; j < p; j++) { - // define current canonical basis vector according to order of columns - vec ej = zeros(p); - ej(orderX(j)) = 1; + // current coordinate to be updated (in the order of columns) + uword orderJ = orderX(j); // perform grid search for the current canonical basis vector vec corY(nGrid); for(uword k = 0; k < nGrid; k++) { - double angle = grid(k); - vec currentA = cos(angle) * a + sin(angle) * ej; + vec currentA = getVector(a, grid(k), orderJ); corY(k) = abs(corControl.cor(x * currentA, y)); } - // find grid point that maximizes correlation functional and keep + // find grid point that maximizes the correlation functional and keep // maximum correlation of current grid search uword whichMax; double currentMaxCor = corY.max(whichMax); @@ -452,8 +461,7 @@ void GridControl::gridSearch(const mat& x, const uvec& orderX, const vec& y, // of the current grid search may be smaller than the previous maximum if(currentMaxCor > maxCor) { maxCor = currentMaxCor; - double optAngle = grid(whichMax); - a = cos(optAngle) * a + sin(optAngle) * ej; + a = getVector(a, grid(whichMax), orderJ); } } } @@ -479,7 +487,7 @@ void GridControl::setCounter(uword& counter, const double& current, // a ............. first weighting vector to be updated // b ............. 
second weighting vector to be updated template -double GridControl::maxCor(const mat& x, const mat& y, CorControl corControl, +double GridControl::maxCor(const mat& x, const mat& y, CorControl& corControl, vec& a, vec& b) { // initializations uword p = x.n_cols, q = y.n_cols; @@ -490,7 +498,12 @@ double GridControl::maxCor(const mat& x, const mat& y, CorControl corControl, // both data sets are univariate a.ones(p); b.ones(q); vec xx = x.unsafe_col(0), yy = y.unsafe_col(0); // reuse memory - maxCor = abs(corControl.cor(xx, yy)); // compute correlation + maxCor = corControl.cor(xx, yy); // compute correlation + // check sign of correlation + if(maxCor < 0) { + maxCor = -maxCor; + b = -b; + } } else { double previousMaxCor = R_NegInf; if((p > 1) && (q == 1)) { @@ -580,17 +593,17 @@ double GridControl::maxCor(const mat& x, const mat& y, CorControl corControl, } else { return NA_REAL; // should never happen } - } - // ensure that norm of weighting vectors is 1 - a = a / norm(a, 2); - b = b / norm(b, 2); - // check direction - double r = corControl.cor(x * a, y * b); - if(r < 0) { - if((p > 1) && (q == 1)) { - a = -a; - } else { - b = -b; + // ensure that norm of weighting vectors is 1 + a = a / norm(a, 2); + b = b / norm(b, 2); + // check direction + double r = corControl.cor(x * a, y * b); + if(r < 0) { + if((p > 1) && (q == 1)) { + a = -a; + } else { + b = -b; + } } } // return maximum correlation @@ -598,6 +611,320 @@ double GridControl::maxCor(const mat& x, const mat& y, CorControl corControl, } +// ------------------------------------------------ +// control class for sparse alternate grid searches +// ------------------------------------------------ + +class SparseGridControl: public GridControl { +public: + vec lambda; // grid of values for penalty parameters + // constructors + SparseGridControl(); + SparseGridControl(List&); + // compute updated weighting vector in grid search + vec getVector(const vec&, const double&, const uword&); + // grid search to update 
weighting vector for multivariate x and univariate y + template + void gridSearch(const mat&, const uvec&, const double&, const vec&, + CorControl&, vec&, vec&, double&); + // grid search to update one weighting vector for multivariate x and y + template + void gridSearch(const mat&, const uvec&, const double&, const vec&, + const double&, CorControl&, vec&, double&, vec&, double&, double&); + // workhorse function for maximum correlation between multivariate x and + // univariate y + template + void maxCorFit(const mat&, const uvec&, const double&, const vec&, + CorControl&, double&, vec&, double&); + // workhorse function for maximum correlation between multivariate x and y + template + void maxCorFit(const mat&, const uvec&, const double&, const mat&, + const uvec&, const double&, CorControl&, double&, vec&, vec&, + double&, double&, double&); + // maximum correlation between multivariate data sets x and y + template + vec maxCor(const mat&, const mat&, CorControl&, mat&, mat&, vec&); +}; + +// constructors +inline SparseGridControl::SparseGridControl() { + lambda = zeros(1, 2); +} +inline SparseGridControl::SparseGridControl(List& control) +: GridControl(control) { + SEXP R_lambda = control["lambda"]; + lambda = as(R_lambda); +} + +// compute updated weighting vector in grid search +vec SparseGridControl::getVector(const vec& a, + const double& angle, const uword& j) { + // fast computation of b / ||b|| with b = cos(angle) * a + sin(angle) * ej + double tanAngle = tan(angle); + double denominator = sqrt(1 + 2*tanAngle*a(j) + tanAngle*tanAngle); + vec b = a / denominator; + b(j) += tanAngle / denominator; + return b; +} + +// sparse grid search to update weighting vector for multivariate x and +// univariate y +// x .............. data matrix for which to update the weighting vector +// orderX ......... order in which to browse through the variables +// lambda ......... penalty parameter for the weighting vector to be updated +// y .............. 
other data vector +// corControl ..... control object to compute correlation +// grid ........... grid points to be used for grid search +// a .............. weighting vector to be updated +// maxObjective ... maximum value of the objective function to be updated +template +void SparseGridControl::gridSearch(const mat& x, const uvec& orderX, + const double& lambda, const vec& y, CorControl& corControl, + vec& grid, vec& a, double& maxObjective) { + // initializations + const uword p = x.n_cols, nGrid = grid.n_elem; + // perform grid searches for each canonical basis vector + for(uword j = 0; j < p; j++) { + // current coordinate to be updated (in the order of columns) + uword orderJ = orderX(j); + // perform grid search for the current canonical basis vector + vec objective(nGrid); + for(uword k = 0; k < nGrid; k++) { + vec currentA = getVector(a, grid(k), orderJ); + double currentCorY = abs(corControl.cor(x * currentA, y)); + objective(k) = currentCorY - lambda * norm(currentA, 1); + } + // find grid point that maximizes the penalized correlation functional + // and keep maximum of current grid search + uword whichMax; + double currentMaxObjective = objective.max(whichMax); + // update maximum and weighting vector + // if 0 degree angle is not part of the grid, the penalized maximum + // correlation of the current grid search may be smaller than the + // previous maximum + if(currentMaxObjective > maxObjective) { + maxObjective = currentMaxObjective; + a = getVector(a, grid(whichMax), orderJ); + } + } +} + +// sparse grid search to update one weighting vector for multivariate x and y +// x .............. data matrix for which to update the weighting vector +// orderX ......... order in which to browse through the variables +// lambda ......... penalty parameter for the weighting vector to be updated +// y .............. linear combination of the other data matrix according to +// the other weighting vector, which is kept fixed +// penaltyY ....... 
penalty for the weighting vector to be kept fixed +// corControl ..... control object to compute correlation +// grid ........... grid points to be used for grid search +// maxCor ......... maximum correlation to be updated +// a .............. weighting vector to be updated +// penaltyX ....... penalty for the weighting vector to be updated +// maxObjective ... maximum value of the objective function to be updated +template +void SparseGridControl::gridSearch(const mat& x, const uvec& orderX, + const double& lambda, const vec& y, const double& penaltyY, + CorControl& corControl, vec& grid, double& maxCor, vec& a, + double& penaltyX, double& maxObjective) { + // initializations + const uword p = x.n_cols, nGrid = grid.n_elem; + // perform grid searches for each canonical basis vector + for(uword j = 0; j < p; j++) { + // current coordinate to be updated (in the order of columns) + uword orderJ = orderX(j); + // perform grid search for the current canonical basis vector + vec corY(nGrid), objective(nGrid), penalties(nGrid); + for(uword k = 0; k < nGrid; k++) { + vec currentA = getVector(a, grid(k), orderJ); + corY(k) = abs(corControl.cor(x * currentA, y)); + penalties(k) = lambda * norm(currentA, 1); + objective(k) = corY(k) - penalties(k) - penaltyY; + } + // find grid point that maximizes the penalized correlation functional + // and keep maximum of current grid search + uword whichMax; + double currentMaxObjective = objective.max(whichMax); + // update maximum and weighting vector + // if 0 degree angle is not part of the grid, the penalized maximum + // correlation of the current grid search may be smaller than the + // previous maximum + if(currentMaxObjective > maxObjective) { + maxCor = corY(whichMax); + a = getVector(a, grid(whichMax), orderJ); + penaltyX = penalties(whichMax); + maxObjective = currentMaxObjective; + } + } +} + +// workhorse function for maximum correlation between multivariate x and +// univariate y +template +void 
SparseGridControl::maxCorFit(const mat& x, const uvec& orderX, + const double& lambda, const vec& y, CorControl& corControl, + double& maxCor, vec& a, double& maxObjective) { + // initializations + double previousMaxObjective = R_NegInf; + // stop if there are two consecutive iterations without improvement + uword i = 0, convCounter = 0; + while((i < nIterations) && (convCounter < 2)) { + previousMaxObjective = maxObjective; + vec grid = getGrid(i+1); // define vector of grid points + gridSearch(x, orderX, lambda, y, corControl, grid, a, maxObjective); + i++; + setCounter(convCounter, maxObjective, previousMaxObjective); + } + // check direction + maxCor = corControl.cor(x * a, y); + if(maxCor < 0) { + maxCor = -maxCor; + a = -a; + } +} + +// workhorse function for maximum correlation between multivariate x and y +template +void SparseGridControl::maxCorFit(const mat& x, const uvec& orderX, + const double& lambdaX, const mat& y, const uvec& orderY, + const double& lambdaY, CorControl& corControl, double& maxCor, vec& a, + vec& b, double& penaltyX, double& penaltyY, double& maxObjective) { + // initializations + double previousMaxObjective = R_NegInf; + // stop if there are two consecutive iterations without improvement + uword i = 0, convCounter = 0; + while((i < nIterations) && (convCounter < 2)) { + previousMaxObjective = maxObjective; + vec grid = getGrid(i+1); // define vector of grid points + uword j = 0; + double altMaxObjective = R_NegInf; + while((j < nAlternate) && ((maxObjective - altMaxObjective) > tol)) { + altMaxObjective = maxObjective; + // maximize correlation functional over a keeping b fixed + vec yb = y * b; // linear combination of columns of y + gridSearch(x, orderX, lambdaX, yb, penaltyY, corControl, + grid, maxCor, a, penaltyX, maxObjective); + // maximize correlation functional over b keeping a fixed + vec xa = x * a; // linear combination of columns of x + gridSearch(y, orderY, lambdaY, xa, penaltyX, corControl, + grid, maxCor, b, penaltyY, 
maxObjective); + j++; + } + i++; + setCounter(convCounter, maxObjective, previousMaxObjective); + } + // check direction + maxCor = corControl.cor(x * a, y * b); + if(maxCor < 0) { + maxCor = -maxCor; + b = -b; + } +} + +// maximum correlation between multivariate data sets x and y based on +// sparse alternate grid searches +// x ............. first data matrix +// y ............. second data matrix +// corControl .... control object to compute correlation +// a ............. first weighting vector to be updated +// b ............. second weighting vector to be updated +template +vec SparseGridControl::maxCor(const mat& x, const mat& y, + CorControl& corControl, mat& A, mat& B, vec& objective) { + // initializations + uword p = x.n_cols, q = y.n_cols; + vec r; // initialize vector of maximum correlations + // perform alternate grid searches if both data sets are multivariate + // if one data set is univariate, alternate grid searches are not necessary + if((p == 1) && (q == 1)) { + // both data sets are univariate + r.set_size(1); objective.set_size(1); + A.ones(p, 1); B.ones(q, 1); + vec xx = x.unsafe_col(0), yy = y.unsafe_col(0); // reuse memory + // compute correlation + r(0) = corControl.cor(xx, yy); + // check sign of correlation + if(r(0) < 0) { + r(0) = -r(0); + B(0, 0) = -B(0, 0); + } + objective(0) = r(0); + } else { + double initialMaxCor, maxCor, maxObjective; + uword nLambda = lambda.n_rows; + r.set_size(nLambda); objective.set_size(nLambda); + if((p > 1) && (q == 1)) { + // x is multivariate, y is univariate + A.set_size(p, nLambda); B.ones(q, nLambda); + vec yy = y.unsafe_col(0); // reuse memory + // find order of x variables + uvec orderX(p); + vec initialA = zeros(p); + findOrder(x, yy, corControl, orderX, initialMaxCor, initialA); + // compute maximum correlation for each penalty parameter + for(uword k = 0; k < nLambda; k++) { + maxCor = initialMaxCor; + vec a = A.unsafe_col(k); a = initialA; + maxObjective = maxCor - lambda(k,0); // L1 norm is 1 
+ maxCorFit(x, orderX, lambda(k,0), yy, corControl, + maxCor, a, maxObjective); + r(k) = maxCor; + objective(k) = maxObjective; + } + } else if((p == 1) && (q > 1)) { + // x is univariate, y is multivariate + A.ones(p, nLambda); B.set_size(q, nLambda); + vec xx = x.unsafe_col(0); // reuse memory + // find order of y variables + uvec orderY(q); + vec initialB = zeros(q); + findOrder(y, xx, corControl, orderY, initialMaxCor, initialB); + // compute maximum correlation for each penalty parameter + for(uword k = 0; k < nLambda; k++) { + maxCor = initialMaxCor; + vec b = B.unsafe_col(k); b = initialB; + maxObjective = maxCor - lambda(k,1); // L1 norm is 1 + maxCorFit(y, orderY, lambda(k,1), xx, corControl, + maxCor, b, maxObjective); + r(k) = maxCor; + objective(k) = maxObjective; + } + } else if((p > 1) && (q > 1)) { + // both data sets are multivariate + A.set_size(p, nLambda), B.set_size(q, nLambda); + // find order of x and y variables + uvec orderX(p), orderY(q); + vec initialA = zeros(p), initialB = zeros(q); + bool startWithX; + findOrder(x, y, corControl, orderX, orderY, initialMaxCor, + initialA, initialB, startWithX); + // compute maximum correlation for each combination of penalty parameters + for(uword k = 0; k < nLambda; k++) { + maxCor = initialMaxCor; + vec a = A.unsafe_col(k), b = B.unsafe_col(k); + a = initialA; b = initialB; + double penaltyX = lambda(k,0), penaltyY = lambda(k,1); // L1 norms are 1 + maxObjective = maxCor - penaltyX - penaltyY; + // perform alternate grid searches + if(startWithX) { + // start with grid search for x + maxCorFit(x, orderX, lambda(k,0), y, orderY, lambda(k,1), corControl, + maxCor, a, b, penaltyX, penaltyY, maxObjective); + } else { + // start with grid search for y + maxCorFit(y, orderY, lambda(k,1), x, orderX, lambda(k,0), corControl, + maxCor, b, a, penaltyY, penaltyX, maxObjective); + } + r(k) = maxCor; + objective(k) = maxObjective; + } + } + } + // return maximum correlation + return r; +} + + // 
------------------------------------------------- // control class for projections through data points // ------------------------------------------------- @@ -612,7 +939,7 @@ class ProjControl { mat getDirections(const mat&); // maximum correlation between multivariate data sets x and y template - double maxCor(const mat&, const mat&, CorControl, vec&, vec&); + double maxCor(const mat&, const mat&, CorControl&, vec&, vec&); }; // constructors @@ -654,7 +981,7 @@ mat ProjControl::getDirections(const mat& x) { // a ............ first weighting vector to be updated // b ............ second weighting vector to be updated template -double ProjControl::maxCor(const mat& x, const mat& y, CorControl corControl, +double ProjControl::maxCor(const mat& x, const mat& y, CorControl& corControl, vec& a, vec& b) { // initializations uword n = x.n_rows, p = x.n_cols, q = y.n_cols; @@ -822,6 +1149,7 @@ mat householder(const vec& a) { return P; } + // canonical correlation analysis via projection pursuit // x ............ first data matrix // y ............ second data matrix @@ -1018,3 +1346,108 @@ SEXP R_ccaPP(SEXP R_x, SEXP R_y, SEXP R_k, SEXP R_method, SEXP R_corControl, Named("scaleY") = scaleY ); } + + +// sparse maximum correlation via projection pursuit +// x ............ first data matrix +// y ............ second data matrix +// corControl ... control object to compute correlation +// ppControl .... control object for sparse algorithm +// standard ..... should the data be standardized? +// robust ....... should robust standardization be used? +// fallback ..... should the fallback mode for robust standardization be used? +// A ............ matrix of weighting vectors for first matrix to be updated +// B ............ 
matrix of weighting vectors for second matrix to be updated +template +vec sMaxCorPP(const mat& x, const mat& y, CorControl& corControl, + PPControl& ppControl, const bool& standard, const bool& robust, + const bool& fallback, mat& A, mat& B, vec& centerX, vec& centerY, + vec& scaleX, vec& scaleY, vec& objective) { + // initializations + vec r; + // standardize the data if requested + mat xs, ys; + if(standard) { + xs = standardize(x, robust, fallback, centerX, scaleX); + ys = standardize(y, robust, fallback, centerY, scaleY); + // compute maximum correlations with standardized data + r = ppControl.maxCor(xs, ys, corControl, A, B, objective); + // transform weighting vectors back to original scale + for(uword k = 0; k < A.n_cols; k++) { + vec a = A.unsafe_col(k); backtransform(a, scaleX); + } + for(uword k = 0; k < B.n_cols; k++) { + vec b = B.unsafe_col(k); backtransform(b, scaleY); + } + } else { + uword p = x.n_cols, q = y.n_cols; + centerX = zeros(p); centerY = zeros(q); + scaleX = ones(p); scaleY = ones(q); + // compute maximum correlations with original data + r = ppControl.maxCor(x, y, corControl, A, B, objective); + } + // return vector of maximum correlations + return r; +} + +// R interface +SEXP R_sMaxCorPP(SEXP R_x, SEXP R_y, SEXP R_method, SEXP R_corControl, + SEXP R_algorithm, SEXP R_ppControl, SEXP R_standardize, SEXP R_fallback) { + // initializations + NumericMatrix Rcpp_x(R_x), Rcpp_y(R_y); // convert data to Rcpp types + mat x(Rcpp_x.begin(), Rcpp_x.nrow(), Rcpp_x.ncol(), false); // convert data + mat y(Rcpp_y.begin(), Rcpp_y.nrow(), Rcpp_y.ncol(), false); // to arma types + string method = as(R_method); // convert character string + List Rcpp_corControl(R_corControl); // list of control parameters + string algorithm = as(R_algorithm); // convert character string + List Rcpp_ppControl(R_ppControl); // list of control parameters + bool standard = as(R_standardize); // convert to boolean + bool fallback = as(R_fallback); // convert to boolean + // 
initialize results + vec r, centerX, centerY, scaleX, scaleY, lambdaX, lambdaY, objective; + mat A, B; + if(algorithm == "grid") { + // define control object for alternate grid searches + SparseGridControl ppControl(Rcpp_ppControl); + // define control object for the correlations and call the arma version + if(method == "spearman") { + CorSpearmanControl corControl(Rcpp_corControl); + r = sMaxCorPP(x, y, corControl, ppControl, standard, true, fallback, + A, B, centerX, centerY, scaleX, scaleY, objective); + } else if(method == "kendall") { + CorKendallControl corControl(Rcpp_corControl); + r = sMaxCorPP(x, y, corControl, ppControl, standard, true, fallback, + A, B, centerX, centerY, scaleX, scaleY, objective); + } else if(method == "quadrant") { + CorQuadrantControl corControl(Rcpp_corControl); + r = sMaxCorPP(x, y, corControl, ppControl, standard, true, fallback, + A, B, centerX, centerY, scaleX, scaleY, objective); + } else if(method == "M") { + CorMControl corControl(Rcpp_corControl); + r = sMaxCorPP(x, y, corControl, ppControl, standard, true, fallback, + A, B, centerX, centerY, scaleX, scaleY, objective); + } else if(method == "pearson") { + CorPearsonControl corControl; + r = sMaxCorPP(x, y, corControl, ppControl, standard, false, false, + A, B, centerX, centerY, scaleX, scaleY, objective); + } else { + error("method not available"); + } + lambdaX = ppControl.lambda.col(0); lambdaY = ppControl.lambda.col(1); + } else { + error("algorithm not available"); + } + // wrap and return result + return List::create( + Named("cor") = wrap(r.begin(), r.end()), + Named("a") = A, + Named("b") = B, + Named("centerX") = centerX, + Named("centerY") = centerY, + Named("scaleX") = scaleX, + Named("scaleY") = scaleY, + Named("lambdaX") = wrap(lambdaX.begin(), lambdaX.end()), + Named("lambdaY") = wrap(lambdaY.begin(), lambdaY.end()), + Named("objective") = wrap(objective.begin(), objective.end()) + ); +} diff --git a/src/cca.h b/src/cca.h index fd8f99a..06a4c4a 100644 --- 
a/src/cca.h
+++ b/src/cca.h
@@ -1,6 +1,6 @@
 /*
  * Author: Andreas Alfons
- * Erasmus University Rotterdam
+ * Erasmus Universiteit Rotterdam
  */

 #ifndef _ccaPP_CCA_H
@@ -17,5 +17,7 @@ using namespace Rcpp;
 RcppExport SEXP R_fastCor(SEXP R_x, SEXP R_y, SEXP R_method, SEXP R_control); // for testing
 RcppExport SEXP R_ccaPP(SEXP R_x, SEXP R_y, SEXP R_k, SEXP R_method, SEXP R_corControl,
         SEXP R_algorithm, SEXP R_ppControl, SEXP R_standardize, SEXP R_fallback);
+RcppExport SEXP R_sMaxCorPP(SEXP R_x, SEXP R_y, SEXP R_method, SEXP R_corControl,
+        SEXP R_algorithm, SEXP R_ppControl, SEXP R_standardize, SEXP R_fallback);

 #endif
diff --git a/src/ccaPP_init.c b/src/ccaPP_init.c
index f364117..9ec30e5 100644
--- a/src/ccaPP_init.c
+++ b/src/ccaPP_init.c
@@ -10,10 +10,12 @@
 extern SEXP R_corM(SEXP, SEXP, SEXP, SEXP, SEXP);
 extern SEXP R_corPearson(SEXP, SEXP);
 extern SEXP R_corQuadrant(SEXP, SEXP, SEXP);
 extern SEXP R_corSpearman(SEXP, SEXP, SEXP);
+extern SEXP R_fastCor(SEXP, SEXP, SEXP, SEXP);
 extern SEXP R_fastMAD(SEXP, SEXP);
 extern SEXP R_fastMedian(SEXP);
 extern SEXP R_l1Median(SEXP);
 extern SEXP R_rank(SEXP);
+extern SEXP R_sMaxCorPP(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);

 static const R_CallMethodDef CallEntries[] = {
     {"R_ccaPP", (DL_FUNC) &R_ccaPP, 9},
@@ -22,10 +24,12 @@ static const R_CallMethodDef CallEntries[] = {
     {"R_corPearson", (DL_FUNC) &R_corPearson, 2},
     {"R_corQuadrant", (DL_FUNC) &R_corQuadrant, 3},
     {"R_corSpearman", (DL_FUNC) &R_corSpearman, 3},
+    {"R_fastCor", (DL_FUNC) &R_fastCor, 4},
     {"R_fastMAD", (DL_FUNC) &R_fastMAD, 2},
     {"R_fastMedian", (DL_FUNC) &R_fastMedian, 1},
     {"R_l1Median", (DL_FUNC) &R_l1Median, 1},
     {"R_rank", (DL_FUNC) &R_rank, 1},
+    {"R_sMaxCorPP", (DL_FUNC) &R_sMaxCorPP, 8},
     {NULL, NULL, 0}
 };
diff --git a/src/cor.cpp b/src/cor.cpp
index 27eff5d..c144901 100644
--- a/src/cor.cpp
+++ b/src/cor.cpp
@@ -1,6 +1,6 @@
 /*
  * Author: Andreas Alfons
- * KU Leuven
+ * Erasmus Universiteit Rotterdam
  */

 #include
diff --git a/src/cor.h b/src/cor.h
index 8d646fc..22179a4 100644
--- a/src/cor.h
+++ b/src/cor.h
@@ -1,6 +1,6 @@
 /*
  * Author: Andreas Alfons
- * KU Leuven
+ * Erasmus Universiteit Rotterdam
  */

 #ifndef _ccaPP_COR_H
diff --git a/src/fastCorKendall.cpp b/src/fastCorKendall.cpp
index 3be9dc4..537f9aa 100755
--- a/src/fastCorKendall.cpp
+++ b/src/fastCorKendall.cpp
@@ -36,7 +36,7 @@
 /*
  * Modifications: Andreas Alfons
- * KU Leuven
+ * Erasmus Universiteit Rotterdam
  */

diff --git a/src/fastCorKendall.h b/src/fastCorKendall.h
index 1dd033a..405dd9a 100644
--- a/src/fastCorKendall.h
+++ b/src/fastCorKendall.h
@@ -1,6 +1,6 @@
 /*
  * Author: Andreas Alfons
- * KU Leuven
+ * Erasmus Universiteit Rotterdam
  */

 #ifndef _ccaPP_FASTCORKENDALL_H
diff --git a/src/utils.cpp b/src/utils.cpp
index 83470b6..0255f8f 100644
--- a/src/utils.cpp
+++ b/src/utils.cpp
@@ -1,6 +1,6 @@
 /*
  * Author: Andreas Alfons
- * Erasmus University Rotterdam
+ * Erasmus Universiteit Rotterdam
  */

 #include
diff --git a/src/utils.h b/src/utils.h
index 2186d60..53c84e8 100644
--- a/src/utils.h
+++ b/src/utils.h
@@ -1,6 +1,6 @@
 /*
  * Author: Andreas Alfons
- * Erasmus University Rotterdam
+ * Erasmus Universiteit Rotterdam
  */

 #ifndef _ccaPP_UTILS_H