Commit 8b06fd74 by Tom Reynkens

### Always use "predictor" instead of "variable"

parent b70c7154
 ... ... @@ -16,7 +16,7 @@ # pen.cov: List with penalty type per predictor # n.par.cov: List with number of parameters to estimate per predictor # pen.mat: List with (weighted) penalty matrix per predictor # pen.mat.transform: List with inverse of (weighted) penalty matrix for ordinal variables (Fused Lasso penalty) # pen.mat.transform: List with inverse of (weighted) penalty matrix for ordinal predictors (Fused Lasso penalty) .max.lambda <- function(X, y, weights, start, offset, family, pen.cov, n.par.cov, pen.mat, pen.mat.transform) { # Number of covariates ... ... @@ -67,7 +67,7 @@ lambda.max[j] <- .min.maxnorm(A = t(pen.mat[[j]]), b = beta.tilde.split[[j]]) } else { warning(paste0("'lambda.max' cannot be determined for variable '", names(pen.cov)[j], "'.")) warning(paste0("'lambda.max' cannot be determined for predictor '", names(pen.cov)[j], "'.")) } } } ... ...
 ... ... @@ -24,7 +24,7 @@ # lambda2: List with lambda2 multiplied with penalty weights (for Group Lasso penalty) per predictor. The penalty parameter for the L_2-penalty in Group (Generalized) Fused Lasso or Group Graph-Guided Fused Lasso is lambda*lambda2 # pen.mat: List with (weighted) penalty matrix per predictor # pen.mat.aux: List with eigenvector matrix and eigenvalue vector of (weighted) penalty matrix per predictor # pen.mat.transform: List with inverse of (weighted) penalty matrix for ordinal variables (Fused Lasso penalty) # pen.mat.transform: List with inverse of (weighted) penalty matrix for ordinal predictors (Fused Lasso penalty) # valdata: Validation data when selecting lambda out-of-sample # control: List of parameters used in the fitting process .select.lambda <- function(X, y, weights, start, offset, family, pen.cov, n.par.cov, group.cov, refcat.cov, ... ... @@ -91,7 +91,7 @@ cv.fold.allocation[tmp[, 1L]] <- tmp[, 2L] } # Indices of variables with Lasso or Group Lasso penalty # Indices of predictors with Lasso or Group Lasso penalty ind.stand <- which(pen.cov %in% c("lasso", "grouplasso")) ################################## ... ... @@ -207,7 +207,7 @@ " Please use a different (larger) value for 'k' in the control object.")) } # Standardize X.train matrix for variables with Lasso or Group Lasso penalty (if standardize = TRUE) # Standardize X.train matrix for predictors with Lasso or Group Lasso penalty (if standardize = TRUE) list.stand <- .X.stand(X = X.train, standardize = attr(X, "standardize"), ind.stand = ind.stand, n.par.cov = n.par.cov, weights = weights.train) # Possibly standardized X.train matrix ... ... @@ -282,7 +282,7 @@ cl.lambda <- makeCluster(min(control$ncores, n.folds), type = ifelse(.Platform$OS.type == "unix", "FORK", "PSOCK"), outfile = "") # Export variables to use in cluster # Export predictors to use in cluster clusterExport(cl.lambda, varlist = c("n.folds", "y", "X", "family", "weights", "start", "offset", "n.par.cov", "pen.cov", "ind.stand", "group.cov", "refcat.cov", "lambda.vector", "lambda1", "lambda2", ... ... @@ -314,11 +314,11 @@ X.sds <- attr(X, "X.sds") for (j in ind.stand) { # Loop over variables with Lasso or Group Lasso penalty # Loop over predictors with Lasso or Group Lasso penalty # Index of first coefficient corresponding to this variable # Index of first coefficient corresponding to this predictor ind.start <- ifelse(j == 1, 1L, sum(unlist(n.par.cov[1:(j-1L)])) + 1L) # Index of last coefficient corresponding to this variable # Index of last coefficient corresponding to this predictor ind.end <- sum(unlist(n.par.cov[1:j])) # Indices ind.s.e <- ind.start:ind.end ... ...
 ... ... @@ -31,7 +31,7 @@ ### # Vector of norms for Group Lasso norms.group <- group.cov # Compute norms per variable (for Group Lasso penalty) # Compute norms per predictor (for Group Lasso penalty) for (j in which(pen.cov == "grouplasso")) { norms.group[j] <- sqrt(sum(beta.tilde.split[[j]]^2)) } ... ... @@ -42,10 +42,10 @@ if (length(groups.unique.nz) > 0) { for (j in 1:length(groups.unique.nz)) { # Indices of variables in the group # Indices of predictors in the group ind.group <- which(group.cov == groups.unique.nz[j]) # Combine norms in a group by summing the squared norms per variable # For every variable in the group, the vector element is now equal to the norm of the group! # Combine norms in a group by summing the squared norms per predictor # For every predictor in the group, the vector element is now equal to the norm of the group! norms.group[ind.group] <- sqrt(sum(norms.group[ind.group]^2)) } } ... ... @@ -66,7 +66,7 @@ } else if (pen.cov[[j]] == "grouplasso") { # Note that the norm of all coefficients in the group is used. # For group 0 (i.e. no group), only the norm of the coefficients for variable j is used. # For group 0 (i.e. no group), only the norm of the coefficients for predictor j is used. return(.PO.GroupLasso(beta.tilde = beta.tilde.split[[j]], slambda = diag(pen.mat.cov[[j]]) * lambda * step, norm = norms.group[j])) ... ... @@ -101,7 +101,7 @@ # Create clusters, use forking on Unix platforms cl <- makeCluster(po.ncores, type = ifelse(.Platform$OS.type == "unix", "FORK", "PSOCK")) # Export variables to use in cluster # Export predictors to use in cluster clusterExport(cl, varlist = c("beta.tilde.split", "lambda", "lambda1", "lambda2", "step", "pen.mat.cov", "pen.mat.cov.aux", "beta.old.split", "norms.group"), envir = environment()) ... ...  ... ... @@ -34,11 +34,11 @@ } else if (pen.cov[[j]] == "grouplasso") { if (group.cov[[j]] == 0) { # Only for variables in group 0 (i.e. no group) # Only for predictors in group 0 (i.e. no group) pen <- lambda * sqrt(sum((pen.mat.cov[[j]] %*% beta)^2)) } else { # Variables not in group 0 are treated later # Predictors not in group 0 are treated later pen <- 0 } ... ... @@ -65,12 +65,12 @@ # Loop over groups (except group 0) for (j in 1:length(groups.unique.nz)) { # Indices of variables in the group # Indices of predictors in the group ind.group <- which(group.cov == groups.unique.nz[j]) swn <- 0 for (l in ind.group) { # Compute contribution of variable to squared weighted norm (swn) and add to swn # Compute contribution of predictor to squared weighted norm (swn) and add to swn swn <- swn + sum((pen.mat.cov[[l]] %*% beta.split[[l]])^2) } # Multiply weighted norm for group with lambda and add to total penalty ... ...  ... ... @@ -5,27 +5,27 @@ ############################################### # Penalty matrix for Lasso penalized variable # Penalty matrix for Lasso penalized predictor # # n.par: Number of parameters for the variable # n.par: Number of parameters for the predictor .pen.mat.lasso <- function(n.par) { return(diag(n.par)) } # Penalty matrix for Group Lasso penalized variable # Penalty matrix for Group Lasso penalized predictor # # n.par: Number of parameters for the variable # n.par: Number of parameters for the predictor .pen.mat.grouplasso <- function(n.par) { return(diag(n.par)) } # Penalty matrix for Fused Lasso penalized variable # Penalty matrix for Fused Lasso penalized predictor # # n.par: Number of parameters for the variable # n.par: Number of parameters for the predictor # refcat: Numeric indicating which of the levels is the reference category, default is first one # when no reference category is present it is 0 .pen.mat.flasso <- function(n.par, refcat = 1) { ... ... @@ -43,9 +43,9 @@ } # Penalty matrix for Generalized Fused Lasso penalized variable # Penalty matrix for Generalized Fused Lasso penalized predictor # # n.par: Number of parameters for the variable # n.par: Number of parameters for the predictor # refcat: Logical which indicates if a reference category is present .pen.mat.gflasso <- function(n.par, refcat = TRUE) { ... ... @@ -68,8 +68,8 @@ # Penalty matrix for 2D Fused Lasso penalty # # n.par.row: Number of parameters for the column variable # n.par.col: Number of parameters for the row variable # n.par.row: Number of parameters for the column predictor # n.par.col: Number of parameters for the row predictor .pen.mat.2dflasso <- function(n.par.row, n.par.col) { total.columns <- n.par.row * n.par.col ... ... @@ -110,10 +110,10 @@ } # Compute penalty matrix for Graph-Guided Fused Lasso penalized variable based on adjacency matrix # Compute penalty matrix for Graph-Guided Fused Lasso penalized predictor based on adjacency matrix # # adj.matrix: Adjacency matrix of the graph # lev.names: Names of the levels of the variable # lev.names: Names of the levels of the predictor # refcat: Numeric indicating which of the levels is the reference category, default is first one # when no reference category is present it is 0 .pen.mat.ggflasso <- function(adj.matrix, lev.names = NULL, refcat = 1) { ... ...  ... ... @@ -59,7 +59,7 @@ # Multiple elements of pen.weights have wrong length } else if (length(ind) > 1) { # Give variable names if not empty and number otherwise # Give predictor names if not empty and number otherwise if (is.null(names(ind))) { tmp <- paste(as.character(ind), collapse=", ") } else { ... ... @@ -86,7 +86,7 @@ # Multiple elements of pen.weights have wrong names } else if (length(ind) > 1) { # Give variable names if not empty and number otherwise # Give predictor names if not empty and number otherwise if (is.null(names(ind))) { tmp <- paste(as.character(ind), collapse=", ") } else { ... ... @@ -115,7 +115,7 @@ # offset: Vector containing the offset for the model # family: Family object # standardize: Logical indicating if the columns of X corresponding to Lasso or Group Lasso penalties # ind.stand: Indices of variables with a Lasso or a Group Lasso penalty # ind.stand: Indices of predictors with a Lasso or a Group Lasso penalty # refcat: Indicator for reference category for Fused Lasso, Generalized Fused Lasso and Graph-Guided Fused Lasso # n: Sample size # lambda1: List with lambda1 per predictor. The penalty parameter for the L_1-penalty in Sparse (Generalized) Fused Lasso or Sparse Graph-Guided Fused Lasso is lambda*lambda1 ... ... @@ -139,7 +139,7 @@ pen.weights <- n.par.cov # Zero weight for non-penalized variables, hence no need to compute weights # Zero weight for non-penalized predictors, hence no need to compute weights # if only penalties of these types if (!all(pen.cov == "none")) { ... ... @@ -224,11 +224,11 @@ if (pen.cov[[j]] == "grouplasso") { if (group.cov[[j]] == 0) { # Only for variables in group 0 (i.e. no group) # Only for predictors in group 0 (i.e. no group) pen.weights[[j]] <- as.numeric(1 / sqrt(sum((pen.mat[[j]] %*% beta.weights.split[[j]])^2))) } else { # Variables not in group 0 are treated later # Predictors not in group 0 are treated later pen.weights[[j]] <- 0 } ... ... @@ -256,12 +256,12 @@ # Loop over groups (except group 0) for (j in 1:length(groups.unique.nz)) { # Indices of variables in the group # Indices of predictors in the group ind.group <- which(group.cov == groups.unique.nz[j]) swn <- 0 for (l in ind.group) { # Compute contribution of variable to squared weighted norm (swn) and add to swn # Compute contribution of predictor to squared weighted norm (swn) and add to swn swn <- swn + sum((pen.mat[[l]] %*% beta.weights.split[[l]])^2) } ... ... @@ -276,7 +276,7 @@ } else { # No need for standardization since zero weight for non-penalized variables # No need for standardization since zero weight for non-penalized predictors pen.weights.stand <- FALSE } ... ...  ... ... @@ -69,7 +69,7 @@ if (is.factor(data.gam[, cov.name])) { # Categorical variable # Categorical predictor if (any(is.na(suppressWarnings(as.numeric(levels(data.gam[, cov.name])))))) { # Non-numeric factor levels ... ... @@ -81,7 +81,7 @@ } } else { # Lasso or Group Lasso with continuous variable # Lasso or Group Lasso with continuous predictor pen.cov.gam <- paste0(pen.cov.gam, ".numeric") } } ... ... @@ -98,7 +98,7 @@ stop("GAM penalty weights do not work when a non-numeric factor has a Fused Lasso penalty and lambda1 > 0 or lambda2 > 0. Please use GLM penalty weights instead.") } # Add variables to formula # Add predictors to formula if (pen.cov.gam %in% c("none.numeric", "none.factor.nonnumeric", "lasso.numeric", "lasso.factor.nonnumeric", "grouplasso.numeric", "grouplasso.factor.nonnumeric", ... ... @@ -137,9 +137,9 @@ # Remove ".binned" if present cov.names <- gsub(".binned*", "", cov.names) # Check if cov.names exist as variable names # Check if cov.names exist as predictor names if (any(!cov.names %in% colnames(data.gam))) { stop("Binned factors for interaction should have the original variable name + '.binned' as variable names, e.g. 'age.binned.") stop("Binned factors for interaction should have the original predictor name + '.binned' as predictor names, e.g. 'age.binned.") } # TODO: add this to help file ... ... @@ -191,11 +191,11 @@ # Index of reference class, only used if refcat=TRUE ref.ind <- which(data.gam[, cov.name] == levels(data.gam[, cov.name])[1])[1] # Check if cov.names exist as variable names # Check if cov.names exist as predictor names if (!all(cov.names %in% colnames(data.gam))) { warning(paste0("Latitude and longitude values for 'ggflasso' should have the original variable name", " + '.lat' and '.lon' as variable names, e.g. 'zip.lat' and 'zip.lon. Since they are not given, ", "a non-smooth effect is used for variable ", deparse(substitute(cov.name)), ".")) warning(paste0("Latitude and longitude values for 'ggflasso' should have the original predictor name", " + '.lat' and '.lon' as predictor names, e.g. 'zip.lat' and 'zip.lon. Since they are not given, ", "a non-smooth effect is used for predictor ", deparse(substitute(cov.name)), ".")) if (!refcat) { stop("GAM penalty weights do not work for 'ggflasso' and lambda1 > 0 or lambda2 > 0 when no latitude and longitude are given. Please use GLM penalty weights instead.") ... ... @@ -268,7 +268,7 @@ pen.cov.gam[[j]] <- attr(mf_nodrop[, j], "penalty") } # Add variables to formula # Add predictors to formula if (pen.cov.gam[[j]] %in% c("none", "lasso", "grouplasso")) { if (is.factor(data.gam[, cov.name])) { ... ... @@ -277,13 +277,13 @@ if (any(is.na(suppressWarnings(as.numeric(levels(data.gam[, cov.name])))))) { # No numeric levels # Predict GAM model for current variable # Predict GAM model for current predictor pg <- predict.gam(gam.fit.pw, newdata = data.gam, type = "terms", terms = cov.name) } else { # Numeric levels # Predict GAM model for current variable # Predict GAM model for current predictor pg <- predict.gam(gam.fit.pw, newdata = data.gam, type = "terms", terms = paste0("s(", cov.name, ".numer)")) } ... ... @@ -333,13 +333,13 @@ if (any(is.na(suppressWarnings(as.numeric(levels(data.gam[, cov.name])))))) { # No numeric levels # Predict GAM model for current variable # Predict GAM model for current predictor pg <- predict.gam(gam.fit.pw, newdata = data.gam, type = "terms", terms = cov.name) } else { # Numeric levels # Predict GAM model for current variable # Predict GAM model for current predictor pg <- predict.gam(gam.fit.pw, newdata = data.gam, type = "terms", terms = paste0("s(", cov.name, ".numer)")) } ... ... @@ -357,7 +357,7 @@ } else if (pen.cov.gam[[j]] == "gflasso") { # Predict GAM model for current variable # Predict GAM model for current predictor pg <- predict.gam(gam.fit.pw, newdata = data.gam, type = "terms", terms = cov.name) # Get coefficient per level (except reference level) ... ... @@ -380,14 +380,14 @@ # Remove ".binned" if present cov.names <- gsub(".binned*", "", cov.names_orig) # Predict GAM model for current variable # Predict GAM model for current predictor pg <- predict.gam(gam.fit.pw, newdata = data.gam, type = "terms", terms = paste0("s(", cov.names[1], ".numer,", cov.names[2], ".numer):dummy.", cov.names[1], ".", cov.names[2])) # Levels of first (binned) variable # Levels of first (binned) predictor lev.1 <- levels(data.gam[, cov.names_orig[1]]) # Levels of second (binned) variable # Levels of second (binned) predictor lev.2 <- levels(data.gam[, cov.names_orig[2]]) ind2 <- 1L ... ... @@ -404,15 +404,15 @@ } else if (pen.cov.gam[[j]] == "ggflasso") { # Check if exist as variable names # Check if exist as predictor names if (!all(c(paste0(cov.name, ".lat"), paste0(cov.name, ".lon")) %in% colnames(data.gam))) { # Predict GAM model for current variable # Predict GAM model for current predictor pg <- predict.gam(gam.fit.pw, newdata = data.gam, type = "terms", terms = cov.name) } else { # Predict GAM model for current variable # Predict GAM model for current predictor pg <- predict.gam(gam.fit.pw, newdata = data.gam, type = "terms", terms = paste0("s(", cov.name, ".lat,", cov.name, ".lon)")) ... ...  ... ... @@ -189,7 +189,7 @@ stop("'n.par.cov' and 'pen.cov' should be lists of equal length.") } # Index of variables with 2D fused Lasso penalty # Index of predictors with 2D fused Lasso penalty ind2d <- which(pen.cov == "2dflasso") if (length(ind2d) > 0) { ... ... @@ -222,7 +222,7 @@ n.par.cov.2dflasso <- NULL } # Index of variables without 2D fused Lasso penalty # Index of predictors without 2D fused Lasso penalty ind.non.2d <- which(pen.cov != "2dflasso") if (length(ind.non.2d) > 0) { ... ... @@ -425,10 +425,10 @@ adj.matrix <- list() } # Indices of variables with ggflasso penalty # Indices of predictors with ggflasso penalty ind.adj <- which(pen.cov == "ggflasso") # Transform to list if adjacency matrix is not given as a list and only one variable with ggflasso penalty # Transform to list if adjacency matrix is not given as a list and only one predictor with ggflasso penalty if (length(ind.adj) == 1 & !is.list(adj.matrix)) { adj.matrix <- list(adj.matrix) names(adj.matrix)[1] <- names(pen.cov)[ind.adj] ... ... @@ -466,13 +466,13 @@ stop("A numeric matrix or element of class Matrix was expected in element ", i, " of 'adj.matrix'.") } # Check if list element of adj.matrix has correct number of rows (number of levels of the corresponding variable) # Check if list element of adj.matrix has correct number of rows (number of levels of the corresponding predictor) if ((n.par.cov[[ind.adj[i]]] + refcat) != nrow(adj.matrix[[i]])) { stop("Element ", i, " of 'adj.matrix' needs to be a square matrix with ", (n.par.cov[[ind.adj[i]]] + refcat), " rows (the number of levels of the corresponding variable).") " rows (the number of levels of the corresponding predictor).") } # Level names (without reference level if refcat=TRUE) of variable # Level names (without reference level if refcat=TRUE) of predictor lev.names <- gsub(names(pen.cov)[ind.adj[i]], "", unlist(col.names.split[ind.adj[i]])) # Row names of adjacency matrix spat.names <- rownames(adj.matrix[[i]]) ... ... @@ -483,7 +483,7 @@ } if (!isTRUE(all.equal(as.character(lev.names), spat.names))) { stop("The rownames of element ", i, " of 'adj.matrix' are not the same as the level names of the corresponding variable.", stop("The rownames of element ", i, " of 'adj.matrix' are not the same as the level names of the corresponding predictor.", " Note that the order of the names is also important.") } } ... ...  ... ... @@ -35,15 +35,15 @@ #' \item{3}{Low step size (i.e. below 1e-14).} #' }} #' \item{final.stepsize}{Final step size used in the algorithm.} #' \item{n.par.cov}{List with number of parameters to estimate per covariate (predictor).} #' \item{pen.cov}{List with penalty type per covariate (predictor).} #' \item{group.cov}{List with group of each covariate (predictor) for Group Lasso where 0 means no group.} #' \item{refcat.cov}{List with number of the reference category in the original order of the levels of each covariate (predictor) where 0 indicates no reference category.} #' \item{n.par.cov}{List with number of parameters to estimate per predictor (covariate).} #' \item{pen.cov}{List with penalty type per predictor (covariate).} #' \item{group.cov}{List with group of each predictor (covariate) for Group Lasso where 0 means no group.} #' \item{refcat.cov}{List with number of the reference category in the original order of the levels of each predictor (covariate) where 0 indicates no reference category.} #' \item{control}{The used control list, see \code{\link{glmsmurf.control}}.} #' Optionally, following elements are also included: #' \item{X}{The model matrix, only returned when the argument \code{x.return} in \code{\link{glmsmurf}} or \code{\link{glmsmurf.fit}} is \code{TRUE}.} #' \item{y}{The response vector, only returned when the argument \code{y.return} in \code{\link{glmsmurf}} or \code{\link{glmsmurf.fit}} is \code{TRUE}.} #' \item{pen.weights}{List with the vector of penalty weights per covariate (predictor), only returned when the argument \code{pen.weights.return} in \code{\link{glmsmurf}} or \code{\link{glmsmurf.fit}} is \code{TRUE}.} #' \item{pen.weights}{List with the vector of penalty weights per predictor (covariate), only returned when the argument \code{pen.weights.return} in \code{\link{glmsmurf}} or \code{\link{glmsmurf.fit}} is \code{TRUE}.} #' When the model is re-estimated, i.e. \code{reest = TRUE} in \code{\link{glmsmurf.control}}, #' the following components are also present: #' \item{glm.reest}{Output from the call to \code{\link[speedglm]{speedglm}} or \code{\link[stats]{glm}} to fit the re-estimated model.} ... ...  ... ... @@ -51,7 +51,7 @@ #' \item \code{"gam"} (ad. GAM penalty weights), #' \item \code{"gam.stand"} (stand. ad. GAM penalty weights); #' } #' or a list with the penalty weight vector per covariate (predictor). This list should have length equal to the number of predictors and predictor names as element names. #' or a list with the penalty weight vector per predictor. This list should have length equal to the number of predictors and predictor names as element names. #' @param adj.matrix A named list containing the adjacency matrices (a.k.a. neighbor matrices) for each of the predictors with a Graph-Guided Fused Lasso penalty. #' The list elements should have the names of the corresponding predictors. If only one predictor has a Graph-Guided Fused Lasso penalty, #' it is also possible to only give the adjacency matrix itself (not in a list). ... ... @@ -59,7 +59,7 @@ #' The returned coefficients are always on the original (i.e. non-standardized) scale. #' @param x.return Logical indicating if the used model matrix should be returned in the output object, default is \code{FALSE}. #' @param y.return Logical indicating if the used response vector should be returned in the output object, default is \code{TRUE}. #' @param pen.weights.return Logical indicating if the list of the used penalty weight vector per covariate (predictor) should be returned in the output object, default is \code{FALSE}. #' @param pen.weights.return Logical indicating if the list of the used penalty weight vector per predictor should be returned in the output object, default is \code{FALSE}. #' @param control A list of parameters used in the fitting process. This is passed to \code{\link{glmsmurf.control}}. #' @return An object of class \code{"glmsmurf"} is returned. See \code{\link{glmsmurf-class}} for more details about this class and its generic functions. #' ... ... @@ -212,7 +212,7 @@ glmsmurf <- function(formula, family, data, weights, start, offset, lambda, lamb # Check for unused factor levels if (is.factor(mf_nodrop[, j])) { if (!isTRUE(all.equal(levels(mf_nodrop[, j]), levels(mf[, j])))) { stop(paste0("No unused levels can be present in the variable '", cov.names[j + (inter - 1)],"'.", stop(paste0("No unused levels can be present in the predictor '", cov.names[j + (inter - 1)],"'.", " Please remove the unused factor levels.")) } } ... ... @@ -230,14 +230,14 @@ glmsmurf <- function(formula, family, data, weights, start, offset, lambda, lamb } } else if (pen.cov[[j + (inter - 1)]] %in% c("lasso", "grouplasso")) { # No reference category for Lasso and Group Lasso penalized variables, # No reference category for Lasso and Group Lasso penalized predictors, # this is imposed through contrast.args # Only use contrasts if a factor if (is.factor(mf[, j])) { name <- colnames(mf)[j] # Set contrasts to FALSE for variable => no reference category # Note that the list element has the same name as the variable # Set contrasts to FALSE for predictor => no reference category # Note that the list element has the same name as the predictor contrasts[[name]] <- contrasts(mf[, j], contrasts = FALSE) } ... ... @@ -250,8 +250,8 @@ glmsmurf <- function(formula, family, data, weights, start, offset, lambda, lamb # No reference category, this is imposed through contrast.args name <- colnames(mf)[j] # Set contrasts to FALSE for variable => no reference category # Note that the list element has the same name as the variable # Set contrasts to FALSE for predictor => no reference category # Note that the list element has the same name as the predictor contrasts[[name]] <- contrasts(mf[, j], contrasts = FALSE) # Number of parameters to estimate ... ... @@ -272,7 +272,7 @@ glmsmurf <- function(formula, family, data, weights, start, offset, lambda, lamb # Check if 1D effects (or non-binned 1D effects) are also included error2d <- paste0(" should also be included in the model ", "since this variable (or a binned version) is present in an interaction. ", "since this predictor (or a binned version) is present in an interaction. ", "Please mind that 1D effects need to be included in the formula before 2D effects!") ... ... @@ -289,16 +289,16 @@ glmsmurf <- function(formula, family, data, weights, start, offset, lambda, lamb # Levels of pred2 l2 <- levels(data[, attr(mf_nodrop[, j], "cov.names")[[2]]]) # Check that reference category is not changed for both 1D variables (or their non-binned versions) # Check that reference category is not changed for both 1D predictors (or their non-binned versions) n1 <- gsub(".binned*", "", attr(mf_nodrop[, j], "cov.names")[[1]]) if (refcat.cov[[which(cov.names == n1)]] > 1) { stop(paste0("The reference category for the variable '", n1, stop(paste0("The reference category for the predictor '", n1, " 'cannot be changed as it (or its binned version) is included in a 2D effect.")) } n2 <- gsub(".binned*", "", attr(mf_nodrop[, j], "cov.names")[[2]]) if (refcat.cov[[which(cov.names == n2)]] > 1) { stop(paste0("The reference category for the variable '", n2, stop(paste0("The reference category for the predictor '", n2, " 'cannot be changed as it (or its binned version) is included in a 2D effect.")) } ... ... @@ -339,7 +339,7 @@ glmsmurf <- function(formula, family, data, weights, start, offset, lambda, lamb names(pen.cov) <- names(n.par.cov) <- names(group.cov) <- names(refcat.cov) <- cov.names if (anyDuplicated(cov.names) > 0) { stop("A variable can only be included once in the formula (except in a 2D Fused Lasso).") stop("A predictor can only be included once in the formula (except in a 2D Fused Lasso).") } ######### ... ... @@ -418,7 +418,7 @@ glmsmurf <- function(formula, family, data, weights, start, offset, lambda, lamb if (inter_bool) { # Combine two variable names # Combine two predictor names col.names[j] <- gsub(",\\s*", ".", col.names[j]) # Add "inter." col.names[j] <- paste0("inter.", col.names[j]) ... ...  ... ... @@ -12,15 +12,15 @@ #' @param X Only for \code{glmsmurf.fit}: the design matrix including ones for the intercept. A \code{n} by \code{(p+1)} matrix which can #' be of numeric matrix class (\code{\link[methods:StructureClasses]{matrix-class}}) or of class Matrix (\code{\link[Matrix]{Matrix-class}}) including sparse matrix class (\code{\link[Matrix]{dgCMatrix-class}}). #' @param y Only for \code{glmsmurf.fit}: the response vector, a numeric vector of size \code{n}. #' @param pen.cov Only for \code{glmsmurf.fit}: list with penalty type per covariate (predictor). A named list of strings with covariate (predictor) names as element names. #' @param pen.cov Only for \code{glmsmurf.fit}: list with penalty type per predictor (covariate). A named list of strings with predictor names as element names. #' Possible types: \code{"none"} (no penalty, e.g. for intercept), \code{"lasso"}, \code{"grouplasso"}, #' \code{"flasso"} (Fused Lasso), \code{"gflasso"} (Generalized Fused Lasso), #' \code{"2dflasso"} (2D Fused Lasso) or \code{"ggflasso"} (Graph-Guided Fused Lasso). #' @param n.par.cov Only for \code{glmsmurf.fit}: list with number of parameters to estimate per covariate (predictor). A named list of strictly positive integers with covariate (predictor) names as element names. #' @param group.cov Only for \code{glmsmurf.fit}: list with group of each covariate (predictor) which is only used for the Group Lasso penalty. #' A named list of positive integers with covariate names as element names where 0 means no group. #' @param refcat.cov Only for \code{glmsmurf.fit}: list with number of the reference category in the original order of the levels of each covariate (predictor). #' When the covariate (predictor) is not a factor or no reference category is present, it is equal to 0. This number will only be taken into account for a Fused Lasso, Generalized Fused Lasso or Graph-Guided Fused Lasso penalty #' @param n.par.cov Only for \code{glmsmurf.fit}: list with number of parameters to estimate per predictor (covariate). A named list of strictly positive integers with predictor names as element names. #' @param group.cov Only for \code{glmsmurf.fit}: list with group of each predictor (covariate) which is only used for the Group Lasso penalty. #' A named list of positive integers with predictor names as element names where 0 means no group. #' @param refcat.cov Only for \code{glmsmurf.fit}: list with number of the reference category in the original order of the levels of each predictor (covariate). #' When the predictor is not a factor or no reference category is present, it is equal to 0. This number will only be taken into account for a Fused Lasso, Generalized Fused Lasso or Graph-Guided Fused Lasso penalty #' when a reference category is present. glmsmurf.fit <- function(X, y, weights, start, offset, family, pen.cov, n.par.cov, group.cov, refcat.cov, lambda, lambda1 = 0, lambda2 = 0, pen.weights, adj.matrix, standardize = TRUE, control = list(), ... ... @@ -273,15 +273,15 @@ glmsmurf.fit <- function(X, y, weights, start, offset, family, pen.cov, n.par.co ###################################### # Standardize # Indices of variables with Lasso or Group Lasso penalty # Indices of predictors with Lasso or Group Lasso penalty ind.stand <- which(pen.cov %in% c("lasso", "grouplasso")) # Standardize X matrix for variables with Lasso or Group Lasso penalty (if standardize = TRUE) # Standardize X matrix for predictors with Lasso or Group Lasso penalty (if standardize = TRUE) list.stand <- .X.stand(X = X, standardize = standardize, ind.stand = ind.stand, n.par.cov = n.par.cov, weights = weights) # Possibly standardized X matrix X <- list.stand$X # Set attribute indicating if the variables for Lasso and Group Lasso are standardized # Set attribute indicating if the predictors for Lasso and Group Lasso are standardized attr(X, "standardize") <- standardize # Add X.means and X.sds as attributes of X attr(X, "X.means") <- list.stand$X.means ... ... @@ -305,7 +305,7 @@ glmsmurf.fit <- function(X, y, weights, start, offset, family, pen.cov, n.par.co # No change needed if reference category is not the first level since order does not matter! gflasso = .pen.mat.gflasso(n.par.cov[[j]], refcat = refcat), "2dflasso" = if(length(n.par.cov.2dflasso[[j]]) > 1) { # Second and third element are number of levels from 1D variables minus 1 # Second and third element are number of levels from 1D predictors minus 1 .pen.mat.2dflasso(n.par.cov.2dflasso[[j]][2], n.par.cov.2dflasso[[j]][3]) } else { .pen.mat.2dflasso(sqrt(n.par.cov.2dflasso[[j]]), sqrt(n.par.cov.2dflasso[[j]])) ... ... @@ -357,7 +357,7 @@ glmsmurf.fit <- function(X, y, weights, start, offset, family, pen.cov, n.par.co for (j in 1:n.cov) { pen.mat[[j]] <- pen.mat[[j]] * pen.weights[[j]] # Compute inverse of pen.mat for ordinal variables # Compute inverse of pen.mat for ordinal predictors if (pen.cov[[j]] == "flasso") { pen.mat.transform[[j]] <- t(upper.tri(pen.mat[[j]], diag=TRUE) * (0 * pen.mat[[j]] + 1) / pen.weights[[j]]) ... ...  ... ... @@ -155,12 +155,12 @@ } ###### # Indices of variables with penalty different from 2dflasso # Indices of predictors with penalty different from 2dflasso ind <- which(pen.cov %in% c("lasso", "grouplasso", "flasso", "gflasso", "ggflasso")) if (length(ind) > 0) { # Split beta.new vector based on variables # Split beta.new vector based on predictors beta.new.split <- split(beta.new, rep(1:n.cov, n.par.cov)) nms <- names(beta.new) ... ... @@ -175,7 +175,7 @@ if (!(pen.cov[[j]] %in% c("flasso", "gflasso", "ggflasso") & (lambda * lambda1.orig == 0 & lambda * lambda2.orig == 0))) { # If all coefficients for the variable are non-zero, and there is more than one coefficient, # If all coefficients for the predictor are non-zero, and there is more than one coefficient, # problems arise due to multicollinearity. This is solved by setting the first coefficient # (and all other coefficients which are equal to this one) to 0 to make the first (or the chosen) category the reference category ind.ref <- ifelse(refcat.cov[[j]] == 0, 1L, refcat.cov[[j]]) ... ... @@ -188,7 +188,7 @@ } if (warn > 0) { warning("Reference classes are introduced for some variable(s) in the re-estimated model.") warning("Reference classes are introduced for some predictor(s) in the re-estimated model.") } # Transform back to beta vector and use original names # Use new vector such that output for beta.new is not altered! ... ... @@ -229,7 +229,7 @@ # Re-estimated coefficients per group beta.reest <- glm.reest$coefficients beta.reest.long <- rep(0, length(beta.new2)) # Make new coefficient vector with re-estimated coefficients per variable # Make new coefficient vector with re-estimated coefficients per predictor for (i in 1:length(beta.reduced)) { par <- beta.reduced[i] beta.reest.long[which(beta.new2 == par)] <- beta.reest[i] ... ... @@ -245,7 +245,7 @@ ########### # Transform X and beta and beta.reest.long to original scale # Logical indicating if the variables for Lasso and Group Lasso are standardized # Logical indicating if the predictors for Lasso and Group Lasso are standardized if (!is.null(attr(X, "standardize"))) { standardize <- attr(X, "standardize") ... ... @@ -254,7 +254,7 @@ standardize <- FALSE } # Indices of variables with Lasso or Group Lasso penalty # Indices of predictors with Lasso or Group Lasso penalty ind.stand <- which(pen.cov %in% c("lasso", "grouplasso")) if (standardize & length(ind.stand) > 0) { ... ... @@ -267,11 +267,11 @@ X.sds <- attr(X, "X.sds") for (j in ind.stand) { # Loop over variables with Lasso or Group Lasso penalty # Loop over predictors with Lasso or Group Lasso penalty # Index of first coefficient corresponding to this variable # Index of first coefficient corresponding to this predictor ind.start <- ifelse(j == 1, 1L, sum(unlist(n.par.cov[1:(j-1L)])) + 1L)