add subensemble()

parent a089bd98
Pipeline #21199271 passed with stages
in 19 minutes and 59 seconds
Package: claut
Type: Package
Title: Functions from the University of Toronto Climate Lab
Version: 0.1.3
Date: 2018-03-19
Version: 0.1.4
Date: 2018-04-17
Authors@R: c(person(given = c("Conor", "I."), family = "Anderson",
role = c("aut","cre"), email = "conor.anderson@utoronto.ca"),
person(given = c("William", "A."), family = "Gough", role = "ths",
......@@ -19,6 +19,7 @@ Imports:
foreach,
gdata,
iterators,
lpSolve,
parallel,
readr,
stats,
......
......@@ -4,6 +4,7 @@ export(deltaDTD)
export(generate_wkt_csv)
export(missing_data_lab)
export(parse_ASCII_grid)
export(subensemble)
export(trimData)
importFrom(doParallel,registerDoParallel)
importFrom(dplyr,bind_rows)
......@@ -17,6 +18,7 @@ importFrom(foreach,"%dopar%")
importFrom(foreach,foreach)
importFrom(gdata,unmatrix)
importFrom(iterators,icount)
importFrom(lpSolve,lp)
importFrom(parallel,detectCores)
importFrom(parallel,makeCluster)
importFrom(parallel,stopCluster)
......
#' Find optimal sub-ensembles
#'
#' @param k integer; the sub-ensemble size(s) to test
#' @param datain numeric; a vector or data.frame containing the GCM anomalies for each baseline series
#' @param target numeric; a vector of the baseline averaegs to test against
#' @param stdev numeric; the standard deviation of the annual average baseline value to test against
#' @param model_names Boolean; whether the datain includes a column of model names
#' @param single_result Boolean; whether to return one optimal sub-ensemble size
#' @param parallel Boolean; whether to perform operations in parallel
#' @param no_cores integer; the numer of CPU cores to use for parallel processing
#'
#' @return a data.frame with one row for each sub-ensemble size
#'
#' @importFrom foreach %do% %dopar% foreach
#' @importFrom lpSolve lp
#'
#' @export
#'
subensemble = function(k, datain, target, stdev = NULL, model_names = TRUE, single_result = FALSE, parallel = FALSE, no_cores = parallel::detectCores() - 1) {
# If the data is passed as a vector, we coerce it to a data.frame
if (!inherits(datain, "data.frame")) {
datain <- as.data.frame(datain)
model_names <- FALSE
}
# If `k` is not provided, try all levels
if (missing(k)) {
k <- 1:nrow(datain)
}
# If `k` exceeds the number of models, throw an error
if (max(k) > nrow(datain)) {
stop("You have requested more models than are present in the data set.")
}
# Drop the model names if the column was included
dat <- if (model_names) datain[2:ncol(datain)] else datain
# Throw an error if we have more variables than we have targets
if (ncol(dat) != length(target)) {
stop("The number of columns does not match the number of targets!")
}
n = nrow(dat)
if (parallel) registerDoParallel(no_cores)
`%infix%` <- ifelse(parallel, `%dopar%`, `%do%`)
result <- foreach(lvl_k = k, .combine = rbind) %infix% {
objective.in = c(rep(0, n), lvl_k)
for (i in 1:ncol(dat)) {
if (i == 1) {
const.mat <- rbind(c(dat[[i]],-1), c(dat[[i]],+1))
} else {
const.mat <- rbind(const.mat, c(dat[[i]],-1), c(dat[[i]],+1))
}
}
const.mat <- rbind(const.mat, c(rep(1,n),0))
const.dir <- c(rep(c("<",">"), ncol(dat)),"=")
const.rhs <- NULL
for (i in 1:ncol(dat)) {
const.rhs <- c(const.rhs, target[i]*lvl_k, target[i]*lvl_k)
}
const.rhs <- c(const.rhs, lvl_k)
v <- lp("min", objective.in, const.mat, const.dir, const.rhs, binary.vec = 1:n)
a <- v$solution[1:n]
residuals <- GFs <- NULL
for (i in 1:ncol(dat)) {
residual <- sum(dat[i]*a)/sum(a)-target[i]
residuals <- c(residuals, residual)
names(residuals)[i] <- paste("Res.", names(dat[i]))
if(!is.null(stdev)) {
GF <- abs(residual)/stdev[i]
GFs <- c(GFs, GF)
names(GFs)[i] <- paste("GF", names(dat[i]))
}
}
if(ncol(dat) >1 && !is.null(stdev)) {
avgGF <- mean(GFs)
names(avgGF)[1] <- paste("Avg. GF")
} else {
avgGF <- NULL
}
row <- as.data.frame(t(c(k = lvl_k, residuals, GFs, avgGF, a = a)))
row
}
if (single_result) {
if ('Avg. GF' %in% colnames(result)) {
result[which(result$`Avg. GF` == min(result$`Avg. GF`)),]
} else {
result[which(result[3] == min(result[3])),]
}
} else {
result
}
}
\ No newline at end of file
......@@ -26,6 +26,7 @@ The following functions were used in studies that are currently under review. If
- [`deltaDTD()`](https://gitlab.com/ConorIA/claut/blob/master/R/deltaDTD.R): Calculate a number of measures of diurnal temperature variability.
- [`missing_value_lab()`](https://gitlab.com/ConorIA/claut/blob/master/R/missing_value_lab.R): A function to artificially introduce missing values into monthly climate series, replace those missing values with linear interpolation and cubic splines, and then calculate the error in the calculated mean.
- [`subensembles()`](https://gitlab.com/ConorIA/claut/blob/master/R/missing_value_lab.R): Takes a table of GCM anomalies generated by the [_Conjuntool_](https://gitlab.com/ConorIA/conjuntool) and provides optimal sub-ensembles based on the ability of a sub-ensemble of size $k$ to reproduce an observed baseline.
## Misc functions
......@@ -38,7 +39,7 @@ There are some other helper functions in this package that are here in hopes tha
## Other resources
Some of the functions that _were_ contained in the package have been moved to my [Conjuntool project](https://gitlab.com/ConorIA/conjuntool).
Some of the functions that _were_ contained in the package have been moved to my [_Conjuntool_ project](https://gitlab.com/ConorIA/conjuntool).
For other resources from the climate lab and beyond, please see our [CL@UT Resources List](https://gitlab.com/ConorIA/claut-resources).
## Contributing to the `claut` package
......
......@@ -44,7 +44,13 @@ code changes, but the results that they produce will always be the
A function to artificially introduce missing values into monthly
climate series, replace those missing values with linear
interpolation and cubic splines, and then calculate the error in the
calculated mean.
calculated
mean.
- [`subensembles()`](https://gitlab.com/ConorIA/claut/blob/master/R/missing_value_lab.R):
Takes a table of GCM anomalies generated by the
[*Conjuntool*](https://gitlab.com/ConorIA/conjuntool) and provides
optimal sub-ensembles based on the ability of a sub-ensemble of size
\(k\) to reproduce an observed baseline.
## Misc functions
......@@ -69,9 +75,10 @@ hopes that they prove useful to someone someday. These
## Other resources
Some of the functions that *were* contained in the package have been
moved to my [Conjuntool project](https://gitlab.com/ConorIA/conjuntool).
For other resources from the climate lab and beyond, please see our
[CL@UT Resources List](https://gitlab.com/ConorIA/claut-resources).
moved to my [*Conjuntool*
project](https://gitlab.com/ConorIA/conjuntool). For other resources
from the climate lab and beyond, please see our [CL@UT Resources
List](https://gitlab.com/ConorIA/claut-resources).
## Contributing to the `claut` package
......
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/subensemble.R
\name{subensemble}
\alias{subensemble}
\title{Find optimal sub-ensembles}
\usage{
subensemble(k, datain, target, stdev = NULL, model_names = TRUE,
single_result = FALSE, parallel = FALSE,
no_cores = parallel::detectCores() - 1)
}
\arguments{
\item{k}{integer; the sub-ensemble size(s) to test}
\item{datain}{numeric; a vector or data.frame containing the GCM anomalies for each baseline series}
\item{target}{numeric; a vector of the baseline averaegs to test against}
\item{stdev}{numeric; the standard deviation of the annual average baseline value to test against}
\item{model_names}{Boolean; whether the datain includes a column of model names}
\item{single_result}{Boolean; whether to return one optimal sub-ensemble size}
\item{parallel}{Boolean; whether to perform operations in parallel}
\item{no_cores}{integer; the numer of CPU cores to use for parallel processing}
}
\value{
a data.frame with one row for each sub-ensemble size
}
\description{
Find optimal sub-ensembles
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment