Clean-up

parent cc37beee
Pipeline #19161196 passed with stages
in 18 minutes and 14 seconds
DESCRIPTION
 Package: claut
 Type: Package
 Title: Functions from the University of Toronto Climate Lab
-Version: 0.1.2
+Version: 0.1.3
-Date: 2017-03-24
+Date: 2018-03-19
 Authors@R: c(person(given = c("Conor", "I."), family = "Anderson",
-    role = c("aut","cre"), email = "conor.anderson@mail.utoronto.ca"),
+    role = c("aut","cre"), email = "conor.anderson@utoronto.ca"),
     person(given = c("William", "A."), family = "Gough", role = "ths",
     email = "gough@utsc.utoronto.ca"))
-Maintainer: Conor I. Anderson <conor.anderson@mail.utoronto.ca>
+Maintainer: Conor I. Anderson <conor.anderson@utoronto.ca>
 Description: Collection of functions developed at the University of
     Toronto Climate lab, often stylized as "Climate Lab at University of Toronto,
     CL@UT".
 Depends:
-    R (>= 3.1.0),
-    PCICt
+    R (>= 3.1.0)
 Imports:
+    doParallel,
     dplyr,
+    foreach,
     gdata,
-    magrittr,
-    ncdf4,
-    ncdf4.helpers,
+    iterators,
+    parallel,
     readr,
     stats,
-    storr,
     tibble,
     utils,
     zoo
...
NAMESPACE
 # Generated by roxygen2: do not edit by hand
-export(calc_anoms)
-export(dataEliminator)
-export(dataEliminatorMassive)
-export(dataEliminatorThorough)
 export(deltaDTD)
-export(ensemble_means)
-export(gcm_anomalies)
-export(gcm_anomalies_par)
-export(gcm_anomalies_par_cl)
-export(gcm_anomalies_snow)
 export(generate_wkt_csv)
-export(get_gcm_files)
 export(missing_data_lab)
 export(parse_ASCII_grid)
 export(trimData)
 importFrom(doParallel,registerDoParallel)
-importFrom(dplyr,arrange)
 importFrom(dplyr,bind_rows)
-importFrom(dplyr,count)
-importFrom(dplyr,filter)
 importFrom(dplyr,group_by)
 importFrom(dplyr,group_indices)
-importFrom(dplyr,left_join)
 importFrom(dplyr,mutate)
 importFrom(dplyr,select)
 importFrom(dplyr,summarize)
-importFrom(dplyr,summarize_all)
 importFrom(foreach,"%do%")
 importFrom(foreach,"%dopar%")
 importFrom(foreach,foreach)
 importFrom(gdata,unmatrix)
 importFrom(iterators,icount)
-importFrom(lubridate,dmy)
-importFrom(magrittr,"%>%")
-importFrom(ncdf4,nc_close)
-importFrom(ncdf4,nc_open)
-importFrom(ncdf4,ncvar_get)
-importFrom(ncdf4.helpers,get.split.filename.cmip5)
-importFrom(ncdf4.helpers,nc.get.time.series)
-importFrom(ncdf4.helpers,nc.get.var.subset.by.axes)
 importFrom(parallel,detectCores)
 importFrom(parallel,makeCluster)
 importFrom(parallel,stopCluster)
@@ -47,12 +24,7 @@
 importFrom(readr,read_table)
 importFrom(readr,write_csv)
 importFrom(stats,aggregate)
 importFrom(stats,sd)
-importFrom(storr,storr_rds)
-importFrom(tibble,add_column)
-importFrom(tibble,as_tibble)
-importFrom(tibble,has_name)
 importFrom(tibble,tibble)
-importFrom(tidyr,unnest)
 importFrom(utils,count.fields)
 importFrom(utils,read.csv)
 importFrom(utils,setTxtProgressBar)
...
#' Calculate the anomaly values for a table produced by gcm_anomalies()
#'
#' @param datin a \code{tbl_df} produced by gcm_anomalies().
#'
#' @importFrom dplyr filter
#' @importFrom tibble has_name
#' @export
calc_anoms <- function(datin) {
  dat_proj <- filter(datin, Scenario != "historical")
  dat_hist <- filter(datin, Scenario == "historical")
  if (has_name(datin, "Ensembles")) {
    stop("It seems you have already averaged the ensembles. calc_anoms() must be run before ensemble_means().")
  }
  for (i in seq_len(nrow(dat_hist))) {
    # Find the projection rows that share this historical run's model, variable, and ensemble
    rows <- which(dat_proj$Model == dat_hist$Model[i] &
                    dat_proj$Variable == dat_hist$Variable[i] &
                    dat_proj$Ensemble == dat_hist$Ensemble[i])
    # Column 5 holds the baseline mean: copy it over from the historical run
    dat_proj[rows, 5] <- unlist(dat_hist[i, 5])
    # Subtract the baseline from each projection-period column
    for (col in 6:ncol(dat_proj)) {
      for (row in rows) {
        dat_proj[row, col] <- dat_proj[row, col] - dat_hist[i, 5]
      }
    }
  }
  # Drop any projections for which we have no matching baseline
  dat_proj[!is.na(dat_proj[[5]]), ]
}
#' Eliminate daily values from a monthly data set
#'
#' @param month data.frame; a data.frame with a single year-month of data with no missing values
#' @param csv character; path to a .csv file containing a single year-month of data with no missing values
#' @param NAs numeric; a vector of the number of NA values to test
#' @param sampling character; the type of sampling to use: (r)andom, (c)onsecutive
#' @param variables character; the names of the variables to test (we will try to auto-identify the column number)
#' @param simplify Boolean; whether to return simplified results
#' @param interpolate Boolean; whether to use linear interpolation to approximate the missing values
#'
#' @importFrom stats sd
#' @importFrom utils read.csv
#' @importFrom zoo na.approx
#'
#' @export
#'
dataEliminator <- function(month, csv, NAs, sampling = "z", variables = c("max", "min", "mean"), simplify = FALSE, interpolate = FALSE) {
  ## First, we hold the user's hand, making sure that we have all of the information we need to perform the test.
  # Set up the data.frame `month`
  if (missing(month)) {
    if (missing(csv)) csv <- readline(prompt = "Please specify csv file. ")
    month <- read.csv(csv)
  }
  # Get the range of `k` and the sampling type
  if (missing(NAs)) NAs <- as.numeric(readline(prompt = "Enter vector of values of `k` to test: "))
  while (sampling != "c" && sampling != "r") {
    sampling <- readline(prompt = "Do you want NAs to be generated (r)andomly, or (c)onsecutively? ")
    if (sampling != "c" && sampling != "r") print("You must choose either (c) or (r).")
  }
  # Identify the column number of each requested variable
  for (var in seq_along(variables)) {
    variables[var] <- as.numeric(grep(variables[var], names(month), ignore.case = TRUE))
  }
  variables <- as.numeric(variables)
  # We make an empty data frame where we will store our results
  df <- data.frame(stringsAsFactors = FALSE)
  # Now we start an outer loop, for each value of `k`
  for (k in NAs) {
    # First we generate the data points that will be nullified
    if (sampling == "r") {
      NAvec <- sample(1:nrow(month), size = k, replace = FALSE)
    }
    if (sampling == "c") {
      NAvec <- sample(1:(nrow(month) + 1 - k), size = 1, replace = FALSE)
      NAvec <- seq(NAvec, NAvec - 1 + k)
    }
    # We do the nullifying
    monthMod <- month
    monthMod[NAvec, ] <- NA
    # Now we start an inner loop that does all the calculations and builds the results table
    for (var in variables) {
      colname <- colnames(month[var])
      mean <- mean(month[[var]], na.rm = TRUE)
      SD <- sd(month[[var]], na.rm = TRUE)
      meanMod <- mean(monthMod[[var]], na.rm = TRUE)
      deltaMean <- abs(meanMod - mean)
      prop <- deltaMean / SD
      if (interpolate) {
        monthApprox <- na.approx(monthMod[[var]])
        prop <- c(prop, abs(mean(monthApprox, na.rm = TRUE) - mean) / SD)
      }
      # Then we stick it all together in a table
      row <- c(colname, k, mean, SD, meanMod, deltaMean, prop)
      df <- rbind(df, row, stringsAsFactors = FALSE)
    }
  }
  if (interpolate) {
    names(prop) <- c("Mean.Error", "Mean.Error.Approx")
    names(df) <- c("Variable", "No.NA", "Mean", "StDev", "Mod.Mean", "Abs.Diff", "Mean.Error", "Mean.Error.Approx")
  } else {
    names(df) <- c("Variable", "No.NA", "Mean", "StDev", "Mod.Mean", "Abs.Diff", "Mean.Error")
  }
  # With the simplify option, we return only the error statistics from the final test.
  # This is useful for running the test a massive number of times, but only for a
  # single variable. See the appendix!
  if (simplify) {
    return(prop)
  } else {
    return(df)
  }
}
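A usage sketch (editor's illustration): the synthetic data.frame below stands in for one complete year-month of daily observations.
if (FALSE) {
  set.seed(1)
  month <- data.frame(date = seq(as.Date("1981-07-01"), as.Date("1981-07-31"), by = "day"),
                      max = rnorm(31, 25, 3), min = rnorm(31, 15, 3), mean = rnorm(31, 20, 3))
  # Test one to ten randomly placed NAs, also interpolating across the gaps
  dataEliminator(month, NAs = 1:10, sampling = "r", interpolate = TRUE)
}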
#' Perform the mass elimination of daily values from a monthly data set
#'
#' @param numberTests integer; the number of times to repeat the test
#' @param month data.frame; a data.frame with a single year-month of data with no missing values
#' @param csv character; path to a .csv file containing a single year-month of data with no missing values
#' @param NAs numeric; a vector of the number of NA values to test
#' @param sampling character; the type of sampling to use: (r)andom, (c)onsecutive
#' @param variable character; the name of the variable to test (we will try to auto-identify the column number)
#' @param verbose Boolean; whether the function should be verbose
#' @param interpolate Boolean; whether to use linear interpolation to approximate the missing values
#'
#' @importFrom utils read.csv
#'
#' @export
#'
dataEliminatorMassive <- function(numberTests, month, csv, NAs, sampling = "z", variable = c("max", "min", "mean"), verbose = FALSE, interpolate = FALSE) {
  # We again hold the user's hand to come up with all the info we need
  if (missing(numberTests)) numberTests <- as.numeric(readline(prompt = "How many times should I repeat the test? "))
  if (missing(month) && missing(csv)) csv <- readline(prompt = "Please specify csv file. ")
  if (missing(month) && !missing(csv)) month <- read.csv(csv)
  if (missing(NAs)) NAs <- as.numeric(readline(prompt = "Enter vector of values of `k` to test: "))
  while (sampling != "c" && sampling != "r") {
    sampling <- readline(prompt = "Do you want NAs to be generated (r)andomly, or (c)onsecutively? ")
    if (sampling != "c" && sampling != "r") print("You must choose either (c) or (r).")
  }
  # `||` short-circuits, so the right-hand test only runs once a single value remains
  while (length(variable) > 1 || !variable %in% c("max", "min", "mean")) {
    print("Please choose the variable to test.")
    variable <- readline(prompt = "Enter one of \"max\", \"min\", or \"mean\". ")
  }
  # We create an empty data frame to put our results
  df <- data.frame(stringsAsFactors = FALSE)
  colname <- colnames(month[grep(variable, names(month), ignore.case = TRUE)])
  # We loop through the values of `k`
  for (k in NAs) {
    if (verbose) print(paste("Running", numberTests, "tests with", k, "NAs."))
    # We now run the original dataEliminator, `numberTests` times, using the `simplify` argument
    result <- replicate(numberTests, dataEliminator(month = month, NAs = k, sampling = sampling, variables = variable, simplify = TRUE, interpolate = interpolate))
    # Then we summarize the results: the mean error, with and without interpolation
    if (interpolate) {
      sta <- c(mean(result[1, ]), mean(result[2, ]))
    } else {
      sta <- summary(result)
    }
    row <- c(colname, k, numberTests, sta)
    df <- rbind(df, row, stringsAsFactors = FALSE)
  }
  if (interpolate) {
    names(df) <- c("Variable", "No.NA", "No.Reps", "Mean.Error", "Mean.Error.Approx")
  } else {
    names(df) <- c("Variable", "No.NA", "No.Reps", "Min", "1st Qu.", "Median", "Mean", "3rd Qu.", "Max")
  }
  return(df)
}
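A usage sketch (editor's illustration), reusing the synthetic `month` from the sketch above:
if (FALSE) {
  # Repeat the random-elimination test 1000 times per value of `k`,
  # for the daily maximum only
  dataEliminatorMassive(1000, month, NAs = 1:10, sampling = "r",
                        variable = "max", verbose = TRUE)
}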
#' Perform the mass elimination of consecutive daily values from a monthly data set
#'
#' @param month data.frame; a data.frame with a single year-month of data with no missing values
#' @param csv character; path to a .csv file containing a single year-month of data with no missing values
#' @param NAs numeric; a vector of the number of NA values to test
#' @param variables character; the names of the variables to test (we will try to auto-identify the column number)
#' @param simplify Boolean; whether to return simplified results
#' @param interpolate Boolean; whether to use linear interpolation to approximate the missing values
#'
#' @importFrom stats sd
#' @importFrom utils read.csv
#' @importFrom zoo na.approx
#'
#' @export
#'
dataEliminatorThorough <- function(month, csv, NAs, variables = c("max", "min", "mean"), simplify = FALSE, interpolate = FALSE) {
  ## This version of the function is for consecutive sampling, and will test all possible
  ## placements of a run of `k` missing values within the month.
  ## First, we hold the user's hand, making sure that we have all of the information we need
  ## to perform the test.
  # Set up the data.frame `month`
  if (missing(month) && missing(csv)) csv <- readline(prompt = "Please specify csv file. ")
  if (missing(month) && !missing(csv)) month <- read.csv(csv)
  # Get the range of `k`
  if (missing(NAs)) NAs <- as.numeric(readline(prompt = "Enter vector of values of `k` to test: "))
  # Find out which variables we are interested in
  for (var in seq_along(variables)) {
    variables[var] <- as.numeric(grep(variables[var], names(month), ignore.case = TRUE))
  }
  variables <- as.numeric(variables)
  # We make empty data frames where we will store our results
  df2 <- df3 <- data.frame(stringsAsFactors = FALSE)
  # Now we start an outer loop, for each value of `k`
  for (k in NAs) {
    df <- data.frame(stringsAsFactors = FALSE)
    # Inner loop over start dates, from the 1st to the (n+1-k)th day
    for (days in 1:(nrow(month) + 1 - k)) {
      # Choose which days to eliminate; we use a separate variable (`NAvec`) so
      # that we do not clobber the `NAs` vector we are looping over
      NAvec <- seq(days, days - 1 + k)
      # We do the nullifying
      monthMod <- month
      monthMod[NAvec, ] <- NA
      # Now we start an inner loop that does all the calculations and builds the results table
      for (var in variables) {
        colname <- colnames(month[var])
        month[[var]] <- as.numeric(month[[var]])
        mean <- mean(month[[var]], na.rm = TRUE)
        SD <- sd(month[[var]], na.rm = TRUE)
        meanMod <- mean(monthMod[[var]], na.rm = TRUE)
        deltaMean <- abs(meanMod - mean)
        prop <- deltaMean / SD
        if (interpolate) {
          monthApprox <- na.approx(monthMod[[var]])
          prop <- c(prop, abs(mean(monthApprox, na.rm = TRUE) - mean) / SD)
        }
        # Then we stick it all together in a table
        row <- c(colname, k, NAvec[1], NAvec[length(NAvec)], mean, SD, meanMod, deltaMean, prop)
        df <- rbind(df, row, stringsAsFactors = FALSE)
      }
    }
    if (interpolate) {
      names(df) <- c("Variable", "No.NA", "StartDate", "EndDate", "Mean", "StDev", "Mod.Mean", "Abs.Diff", "Mean.Error", "Mean.Error.Approx")
    } else {
      names(df) <- c("Variable", "No.NA", "StartDate", "EndDate", "Mean", "StDev", "Mod.Mean", "Abs.Diff", "Mean.Error")
    }
    # rbind() preserves the column names we just set
    df3 <- rbind(df3, df)
    # One more loop that takes the summary stats for each variable, in case we
    # want a simple summary
    for (var in variables) {
      colname <- colnames(month[var])
      index <- which(df$Variable == colname)
      if (interpolate) {
        sta <- c(mean(as.numeric(df$Mean.Error[index])), mean(as.numeric(df$Mean.Error.Approx[index])))
      } else {
        sta <- summary(as.numeric(df$Mean.Error[index]))
      }
      row <- c(colname, k, nrow(month) + 1 - k, sta)
      df2 <- rbind(df2, row, stringsAsFactors = FALSE)
    }
    if (interpolate) {
      names(df2) <- c("Variable", "No.NA", "No.Reps", "Mean.Error", "Mean.Error.Approx")
    } else {
      names(df2) <- c("Variable", "No.NA", "No.Reps", "Min", "1st Qu.", "Median", "Mean", "3rd Qu.", "Max")
    }
  }
  # If we chose the simplify option, we just get the summary results
  if (simplify) {
    return(df2)
  } else {
    return(df3)
  }
}
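A usage sketch (editor's illustration): because sampling here is exhaustive rather than random, no repetition count is needed.
if (FALSE) {
  # Test every possible placement of 3, 4, or 5 consecutive NAs and
  # return the per-variable summary
  dataEliminatorThorough(month, NAs = 3:5, simplify = TRUE)
}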
#' Calculate the average of all ensembles for the same Variable/Model/Scenario
#'
#' @param datin a \code{tbl_df} produced by gcm_anomalies().
#'
#' @importFrom dplyr group_by left_join select summarize summarize_all
#' @importFrom magrittr %>%
#' @export
ensemble_means <- function(datin) {
  # Record which ensembles feed each Variable/Model/Scenario group
  ensembles <- datin %>% group_by(Variable, Model, Scenario) %>% summarize(Ensembles = list(unique(Ensemble)))
  # Drop the Ensemble column (column 4) and average everything else by group
  summaries <- datin %>% select(-4) %>% group_by(Variable, Model, Scenario) %>% summarize_all(mean)
  datout <- left_join(ensembles, summaries, by = c("Variable", "Model", "Scenario"))
  datout
}
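A toy illustration of the grouping (editor's sketch; the column layout mirrors gcm_anomalies() output, with Ensemble in column 4):
if (FALSE) {
  dat <- tibble::tibble(Variable = "tas", Model = "GFDL-ESM2G", Scenario = "rcp45",
                        Ensemble = c("r1i1p1", "r2i1p1"),
                        `2011-2040` = c(1.1, 1.3))
  ensemble_means(dat) # one row per group, with `2011-2040` = 1.2
}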
#' Generate table GCM anomaly data
#'
#' @param dir character; path to the directory containing the \code{.nc} files you wish to process
#' @param files character; a list of filenames to process. List items containing more than one filename will be read in sequence and combined into a single series.
#' @param lat numeric; the latitude of the target cell
#' @param lon numeric; the longitude of the target cell
#' @param timeslice numeric; 0 to 16: annual (0), monthly (1-12), seasonal (13-16)
#' @param baseline numeric; a vector of years to use as the baseline period, e.g. 1971:2000
#' @param start projection start year (defaults to 2011)
#' @param simple_names Boolean; whether to simplify the projection interval names (i.e. 2020s instead of 2011-2040)
#' @param calc_anom Boolean; whether to calculate anomaly values
#' @param ensemble_average Boolean; whether to average different ensembles for the same variable/model/scenario
#' @param cache_dir character; a directory in which to cache results (defaults to NULL, i.e. no cache)
#'
#' @return a \code{tbl_df}
#' @export
#'
#' @importFrom magrittr %>%
#' @importFrom dplyr arrange filter group_by summarize
#' @importFrom ncdf4 nc_close nc_open ncvar_get
#' @importFrom ncdf4.helpers get.split.filename.cmip5 nc.get.time.series nc.get.var.subset.by.axes
#' @importFrom storr storr_rds
#' @importFrom tibble as_tibble tibble
#' @importFrom utils setTxtProgressBar txtProgressBar
#' @importFrom zoo as.yearmon
#'
#' @examples
#' \dontrun{annual <- gcm_anomalies(getwd(), -7.023833, -76.465222, 0, 1971:2000, calc_anom = TRUE)}
gcm_anomalies <- function(dir = getwd(), files, lat, lon, timeslice = 0, baseline, start = 2011, simple_names = FALSE, calc_anom = TRUE, ensemble_average = TRUE, cache_dir = NULL) {
  # Define use_storr unconditionally so the cache checks below never hit an
  # undefined variable when no cache directory was given
  use_storr <- !is.null(cache_dir)
  if (use_storr) {
    sto <- storr_rds(cache_dir)
  }
  ## Generate projection periods based on baseline length and start year
  number_of_periods <- (2100 - start + 1) %/% length(baseline)
  proj <- list()
  period_names <- rep(NA, number_of_periods)
  for (per in 1:number_of_periods) {
    st <- start + (length(baseline) * (per - 1))
    ed <- start + (length(baseline) * per) - 1
    period <- list(st:ed)
    period_names[per] <- if (simple_names) paste0(mean(unlist(period)) %/% 10 * 10, "s") else paste0(st, "-", ed)
    proj <- c(proj, period)
  }
  ## Prepare a table with the right dimensions
  dat <- as_tibble(matrix(NA, nrow = length(files), ncol = 5 + number_of_periods))
  # Process each NetCDF file
  print("Processing the NetCDF files...")
  ## Set up a progress bar
  prog <- txtProgressBar(min = 0, max = length(files), style = 3)
  on.exit(close(prog))
  for (file_list in seq_along(files)) {
    fs <- unlist(files[file_list])
    for (f in seq_along(fs)) {
      components <- get.split.filename.cmip5(file.path(dir, fs[f]))
      var <- unname(components['var'])
      model <- unname(components['model'])
      scenario <- unname(components['emissions'])
      ensemble <- unname(components['run'])
      nc_nc <- nc_open(file.path(dir, fs[f]))
      if (f == 1) {
        # Get the bounds of each cell; some models use "lon_bounds" instead of "lon_bnds"
        lon_bnds <- try(ncvar_get(nc_nc, "lon_bnds"), silent = TRUE)
        if (inherits(lon_bnds, "try-error")) {
          lon_bnds <- ncvar_get(nc_nc, "lon_bounds")
        }
        lat_bnds <- try(ncvar_get(nc_nc, "lat_bnds"), silent = TRUE)
        if (inherits(lat_bnds, "try-error")) {
          lat_bnds <- ncvar_get(nc_nc, "lat_bounds")
        }
        # Convert Western longitudes to degrees East
        lon <- ifelse(lon < 0, 360 + lon, lon)
        # Get the grid cell we are interested in
        lat_index <- which(lat_bnds[1,] <= lat & lat_bnds[2,] >= lat)
        lon_index <- which(lon_bnds[1,] <= lon & lon_bnds[2,] >= lon)
        rm(lat_bnds, lon_bnds); gc()
      }
      # Check the cache: the key combines the file name and the cell indices
      key <- paste(fs[f], paste(lat_index, collapse = "-"), paste(lon_index, collapse = "-"), sep = "_")
      time_tmp <- NULL
      if (use_storr) {
        if (sto$exists(key)) {
          print("Hit the cache")
          time_tmp <- sto$get(key)
          # The NetCDF data is not needed on a cache hit, so close the connection
          nc_close(nc_nc); rm(nc_nc)
        }
      }
      if (is.null(time_tmp)) {
        if (use_storr) print("Missed the cache")
        # Get the time
        nc_time <- nc.get.time.series(nc_nc, v = var, time.dim.name = "time")
        nc_time <- as.yearmon(format(nc_time, format = "%Y-%m-%d hh:mm:ss"))
        # Now load only that grid data
        nc_var <- nc.get.var.subset.by.axes(nc_nc, var, axis.indices = list(X = lon_index, Y = lat_index))
        # Close the nc connection
        nc_close(nc_nc); rm(nc_nc)
        if (dim(nc_var)[1] > 1 || dim(nc_var)[2] > 1) {
          warning("We got more than one latitude or longitude cell. We'll average across them.")
          nc_var <- apply(nc_var, 3, mean)
        } else {
          nc_var <- nc_var[1, 1, ]
        }
        # Convert Kelvin to degrees Celsius
        if (var %in% c("tas", "tasmax", "tasmin")) {
          nc_var <- nc_var - 273.15
        } else {
          stop("Sorry, for now I only work with temperature!")
        }
        time_tmp <- tibble(Time = nc_time, Year = format(as.yearmon(Time), format = "%Y"),
                           Month = format(as.yearmon(Time), format = "%m"),
                           Var = nc_var)
        rm(nc_var, nc_time); gc()
        if (use_storr) sto$set(key, time_tmp)
      }
      if (f == 1) {
        time_series <- time_tmp
      } else {
        time_series <- suppressWarnings(bind_rows(time_series, time_tmp))
      }
    }
    if (timeslice > 0 && timeslice < 13) {
      # A single month
      time_series <- filter(time_series, Month == sprintf("%02d", timeslice))
    } else if (timeslice > 12 && timeslice < 17) {
      # The seasons: DJF (13), MAM (14), JJA (15), SON (16)
      if (timeslice == 13) {
        time_series <- filter(time_series, Month %in% sprintf("%02d", c(1, 2, 12)))
        # Assign December to the following year's winter
        time_series$Year[time_series$Month == sprintf("%02d", 12)] <- as.numeric(time_series$Year[time_series$Month == sprintf("%02d", 12)]) + 1
      }
      if (timeslice == 14) {
        time_series <- filter(time_series, Month %in% sprintf("%02d", c(3, 4, 5)))
      }
      if (timeslice == 15) {
        time_series <- filter(time_series, Month %in% sprintf("%02d", c(6, 7, 8)))
      }
      if (timeslice == 16) {
        time_series <- filter(time_series, Month %in% sprintf("%02d", c(9, 10, 11)))
      }
    }
    result <- time_series %>% group_by(Year) %>% summarize(Mean = mean(Var))
    result$Year <- as.numeric(result$Year)
    periods <- rep(NA, length(proj))
    if (scenario == "historical") {
      baseln <- mean(result$Mean[result$Year >= min(baseline) & result$Year <= max(baseline)])
    } else {
      baseln <- NA
      for (i in seq_along(proj)) {
        periods[i] <- mean(result$Mean[result$Year >= min(proj[[i]]) & result$Year <= max(proj[[i]])])
      }
    }
    col_names <- c("Variable", "Model", "Scenario", "Ensemble", paste0("Baseline ", min(baseline), "-", max(baseline)), period_names)
    row <- list(var, model, scenario, ensemble, baseln)
    row <- c(row, periods)
    names(row) <- col_names
    dat[file_list,] <- row
    colnames(dat) <- col_names
    setTxtProgressBar(prog, value = file_list)
  }
  # Compute additional metrics
  if (calc_anom) dat <- calc_anoms(dat)
  if (ensemble_average) dat <- ensemble_means(dat)
  # Return our tibble
  dat %>% arrange(Model, Scenario, Ensemble)
}
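Because the cache key combines the file name with the grid-cell indices, a second call for any point that falls in the same cell should hit the cache. A sketch (editor's illustration, assuming `nc_files` as above and a writable cache directory):
if (FALSE) {
  first  <- gcm_anomalies(files = nc_files, lat = 43.66, lon = -79.40,
                          baseline = 1971:2000, cache_dir = "claut_cache")
  second <- gcm_anomalies(files = nc_files, lat = 43.70, lon = -79.42,
                          baseline = 1971:2000, cache_dir = "claut_cache") # cache hits
}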
#' Generate a list of GCM files to pass to \code{gcm_anomalies()}
#'
#' @param dir character; the directory containing the files.
#' @param baseline numeric; a vector of years that should be covered by the historical run (\code{NULL} to ignore)
#' @param projection numeric; a vector of years that should be covered by the models (\code{NULL} to ignore)
#' @param scenario character; the scenario to filter on (optional)
#' @param ensemble character; the ensemble to filter on (optional)
#'
#' @return a list of filenames
#'
#' @importFrom tibble add_column as_tibble
#' @importFrom dplyr count filter
#' @importFrom magrittr %>%
#' @importFrom ncdf4.helpers get.split.filename.cmip5
#' @importFrom zoo as.yearmon
#'
#' @export
#'
#' @examples
#' \dontrun{get_gcm_files(getwd(), 1971:2000, scenario = c("rcp85", "rcp45", "rcp26", "rcp60"), ensemble = "r1i1p1")}
get_gcm_files <- function(dir, baseline = NULL, projection = NULL, scenario = NULL, ensemble = NULL) {
  # If we want both the baseline and the projections, we run this twice, once for each period
  if (!is.null(baseline) && !is.null(projection)) {
    baseline_files <- get_gcm_files(dir, baseline, NULL, scenario, ensemble)
    projection_files <- get_gcm_files(dir, NULL, projection, scenario, ensemble)
    return(append(baseline_files, projection_files))
  }
  if (is.null(baseline)) period <- projection
  if (is.null(projection)) {
    period <- baseline
    # There are no experiments in the past
    scenario <- "historical"
  }
  # Get the available files
  filenames <- dir(dir)
  choices <- as_tibble(t(sapply(filenames, get.split.filename.cmip5)))
  names(choices) <- c("Variable", "Period", "Model", "Scenario", "Ensemble", "Range", "Start", "End")
  choices <- choices %>% add_column(Filenames = filenames)
  # Narrow down our choices
  filtered <- choices
  if (!is.null(scenario)) filtered <- filtered %>% filter(Scenario %in% scenario)
  if (!is.null(ensemble)) filtered <- filtered %>% filter(Ensemble %in% ensemble)
  # A lot of useful model output is split into multiple files
  multi_models <- filtered %>% group_by(Scenario, Ensemble) %>% count(Model) %>% filter(n > 1)
  groupings <- rep(list(NA), nrow(multi_models))
  for (row in seq_len(nrow(multi_models))) {
    tab <- filtered %>% filter(Model == multi_models$Model[row], Scenario == multi_models$Scenario[row], Ensemble == multi_models$Ensemble[row])
    # If any single file for this model gives us all the data we need, we can skip this row
    if (nrow(tab[tab$Start <= paste0(period[1], "01") & tab$End >= paste0(period[length(period)], "12"),]) > 0) {
      next
    }
    # If the data ends too soon, we skip this row
    if (max(tab$End) < paste0(period[length(period)], "12")) {
      next
    }
    # If the data starts too late, we skip this row
    if (min(tab$Start) > paste0(period[1], "01")) {