Clean-up

Package: claut
Type: Package
Title: Functions from the University of Toronto Climate Lab
Version: 0.1.3
Date: 2018-03-19
Authors@R: c(person(given = c("Conor", "I."), family = "Anderson",
role = c("aut","cre"), email = "conor.anderson@utoronto.ca"),
person(given = c("William", "A."), family = "Gough", role = "ths",
email = "gough@utsc.utoronto.ca"))
Maintainer: Conor I. Anderson <conor.anderson@utoronto.ca>
Description: Collection of functions developed at the University of
    Toronto Climate Lab, often stylized as the "Climate Lab at the
    University of Toronto" (CL@UT).
Depends:
    R (>= 3.1.0)
Imports:
    doParallel,
    dplyr,
    foreach,
    gdata,
    iterators,
    lubridate,
    magrittr,
    ncdf4,
    ncdf4.helpers,
    parallel,
    readr,
    stats,
    storr,
    tibble,
    tidyr,
    utils,
    zoo
......
# Generated by roxygen2: do not edit by hand
export(calc_anoms)
export(dataEliminator)
export(dataEliminatorMassive)
export(dataEliminatorThorough)
export(deltaDTD)
export(ensemble_means)
export(gcm_anomalies)
export(gcm_anomalies_par)
export(gcm_anomalies_par_cl)
export(gcm_anomalies_snow)
export(generate_wkt_csv)
export(get_gcm_files)
export(missing_data_lab)
export(parse_ASCII_grid)
export(trimData)
importFrom(doParallel,registerDoParallel)
importFrom(dplyr,arrange)
importFrom(dplyr,bind_rows)
importFrom(dplyr,count)
importFrom(dplyr,filter)
importFrom(dplyr,group_by)
importFrom(dplyr,group_indices)
importFrom(dplyr,left_join)
importFrom(dplyr,mutate)
importFrom(dplyr,select)
importFrom(dplyr,summarize)
importFrom(dplyr,summarize_all)
importFrom(foreach,"%do%")
importFrom(foreach,"%dopar%")
importFrom(foreach,foreach)
importFrom(gdata,unmatrix)
importFrom(iterators,icount)
importFrom(lubridate,dmy)
importFrom(magrittr,"%>%")
importFrom(ncdf4,nc_close)
importFrom(ncdf4,nc_open)
importFrom(ncdf4,ncvar_get)
importFrom(ncdf4.helpers,get.split.filename.cmip5)
importFrom(ncdf4.helpers,nc.get.time.series)
importFrom(ncdf4.helpers,nc.get.var.subset.by.axes)
importFrom(parallel,detectCores)
importFrom(parallel,makeCluster)
importFrom(parallel,stopCluster)
......
importFrom(readr,read_table)
importFrom(readr,write_csv)
importFrom(stats,aggregate)
importFrom(stats,sd)
importFrom(storr,storr_rds)
importFrom(tibble,add_column)
importFrom(tibble,as_tibble)
importFrom(tibble,has_name)
importFrom(tibble,tibble)
importFrom(tidyr,unnest)
importFrom(utils,count.fields)
importFrom(utils,read.csv)
importFrom(utils,setTxtProgressBar)
......
#' Calculate the anomaly values for a table produced by gcm_anomalies()
#'
#' @param datin a \code{tbl_df} produced by gcm_anomalies().
#'
#' @importFrom dplyr filter
#' @importFrom tibble has_name
#' @export
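#'
#' @examples
#' \dontrun{
#' ## A minimal sketch (not from the original docs): `proj` stands in for a
#' ## table produced by gcm_anomalies() with calc_anom = FALSE.
#' proj <- gcm_anomalies(getwd(), files, -7.024, -76.465, 0, 1971:2000,
#'                       calc_anom = FALSE, ensemble_average = FALSE)
#' calc_anoms(proj)
#' }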
calc_anoms <- function(datin) {
  if (has_name(datin, "Ensembles")) {
    stop("It seems you have already averaged the ensembles. calc_anoms() must be run before ensemble_means().")
  }
  dat_proj <- filter(datin, Scenario != "historical")
  dat_hist <- filter(datin, Scenario == "historical")
  for (i in seq_len(nrow(dat_hist))) {
    # Identify the projection rows that share this historical run's model, variable and ensemble
    rows <- which(dat_proj$Model == dat_hist$Model[i] & dat_proj$Variable == dat_hist$Variable[i] & dat_proj$Ensemble == dat_hist$Ensemble[i])
    # Carry the historical baseline across, then express each projection period as a delta from that baseline
    dat_proj[rows, 5] <- unlist(dat_hist[i, 5])
    for (col in 6:ncol(dat_proj)) {
      for (row in rows) {
        dat_proj[row, col] <- dat_proj[row, col] - dat_hist[i, 5]
      }
    }
  }
dat_proj[!is.na(dat_proj[[5]]),]
}
#' Eliminate daily values from a monthly data set
#'
#' @param month data.frame; a data.frame with a single year-month of data with no missing values
#' @param csv character; path to a .csv file containing a single year-month of data with no missing values
#' @param NAs numeric; a vector of the number of NA values to test
#' @param sampling character; the type of sampling to use: (r)andom, (c)onsecutive
#' @param variables character; the names of the variables to test (we will try to auto-identify the column numbers)
#' @param simplify Boolean; whether to return simplified results
#' @param interpolate Boolean; whether to use linear interpolation to approximate the missing values
#'
#' @importFrom stats sd
#' @importFrom utils read.csv
#' @importFrom zoo na.approx
#'
#' @export
#'
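#' @examples
#' \dontrun{
#' ## A hypothetical call (illustrative only): `jan1981` stands in for a
#' ## complete single-month data.frame with "max", "min" and "mean" columns.
#' dataEliminator(month = jan1981, NAs = 1:10, sampling = "r")
#' }
#'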
dataEliminator <- function(month, csv, NAs, sampling = "z", variables = c("max", "min", "mean"), simplify = FALSE, interpolate = FALSE) {
  ## First, I hold the user's hand, making sure that we have all of the information we need to perform the test.
# Set up the data.frame `month`
if (missing(month)) {
if (missing(csv)) csv <- readline(prompt = "Please specify csv file. ")
month <- read.csv(csv)
}
# Get the range of `k` and sampling type
  if (missing(NAs)) NAs <- scan(text = readline(prompt = "Enter the values of `k` to test, separated by spaces: "))
while (sampling != "c" & sampling != "r") {
sampling <- readline(prompt = "Do you want NAs to be generated (r)andomly, or (c)onsecutively? ")
if (sampling != "c" & sampling != "r") print("You must choose either (c) or (r).")
}
  # Convert the variable names to column indices
  for (var in seq_along(variables)) {
    variables[var] <- grep(variables[var], names(month), ignore.case = TRUE)
  }
  variables <- as.numeric(variables)
# We make an empty data frame where we will store our results
df <- data.frame(stringsAsFactors = FALSE)
# Now we start an outer loop, for each value of `k`
for (k in NAs) {
# First we generate our random data points that will be nullified
if (sampling == "r") {
NAvec <- sample(1:nrow(month), size = k, replace = FALSE)
}
if (sampling == "c") {
NAvec <- sample(1:(nrow(month)+1-k), size = 1, replace = FALSE)
NAvec <- seq(NAvec, NAvec-1+k)
}
# We do the nullifying
monthMod <- month
monthMod[NAvec,] <- NA
# Now we start an inner loop that does all the calculations and builds the results table
for (var in variables) {
colname <- colnames(month[var])
      mean <- mean(month[[var]], na.rm = TRUE)
      SD <- sd(month[[var]], na.rm = TRUE)
      meanMod <- mean(monthMod[[var]], na.rm = TRUE)
      deltaMean <- abs(meanMod - mean)
      prop <- deltaMean / SD
      if (interpolate) {
        monthApprox <- na.approx(monthMod[[var]])
        prop <- c(prop, abs(mean(monthApprox, na.rm = TRUE) - mean) / SD)
      }
      # Then we stick it all together in a table.
      row <- c(colname, k, mean, SD, meanMod, deltaMean, prop)
df <- rbind(df, row, stringsAsFactors = FALSE)
}
}
if (interpolate) {
names(prop) <- c("Mean.Error", "Mean.Error.Approx")
names(df) <- c("Variable", "No.NA", "Mean", "StDev", "Mod.Mean", "Abs.Diff", "Mean.Error", "Mean.Error.Approx")
} else {
names(df) <- c("Variable", "No.NA", "Mean", "StDev", "Mod.Mean", "Abs.Diff", "Mean.Error")
}
  # If we chose the simplify option, we return only the error proportion(s). This is useful for running the test a massive number of times, but only for a single variable. See the appendix!
if (simplify) {
return(prop)
} else {
return(df)
}
}
#' Perform the mass elimination of daily values from a monthly data set
#'
#' @param numberTests integer; the number of times to repeat the test
#' @param month data.frame; a data.frame with a single year-month of data with no missing values
#' @param csv character; path to a .csv file containing a single year-month of data with no missing values
#' @param NAs numeric; a vector of the number of NA values to test
#' @param sampling character; the type of sampling to use: (r)andom, (c)onsecutive
#' @param variable character; the name of the variable to test (we will try to auto-identify the column number)
#' @param verbose Boolean; whether the function should be verbose
#' @param interpolate Boolean; whether to use linear interpolation to approximate the missing values
#'
#' @importFrom utils read.csv
#'
#' @export
#'
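#' @examples
#' \dontrun{
#' ## A hypothetical call (illustrative only): repeat the random-sampling test
#' ## 1000 times per `k` on a stand-in month of complete daily data, `jan1981`.
#' dataEliminatorMassive(numberTests = 1000, month = jan1981, NAs = 1:10,
#'                       sampling = "r", variable = "mean")
#' }
#'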
dataEliminatorMassive <- function(numberTests, month, csv, NAs, sampling = "z", variable = c("max", "min", "mean"), verbose = FALSE, interpolate = FALSE) {
  # We again hold the user's hand to come up with all the info we need
if (missing(numberTests)) numberTests <- as.numeric(readline(prompt = "How many times should I repeat the test? "))
if (missing(month) && missing(csv)) csv <- readline(prompt = "Please specify csv file. ")
if (missing(month) && !missing(csv)) month <- read.csv(csv)
  if (missing(NAs)) NAs <- scan(text = readline(prompt = "Enter the values of `k` to test, separated by spaces: "))
while (sampling != "c" & sampling != "r") {
sampling <- readline(prompt = "Do you want NAs to be generated (r)andomly, or (c)onsecutively? ")
if (sampling != "c" & sampling != "r") print("You must choose either (c) or (r).")
}
  while (length(variable) > 1 || !variable %in% c("max", "min", "mean")) {
print("Please choose the variable to test.")
variable <- readline(prompt = "Enter one of \"max\", \"min\", or \"mean\". ")
}
# We create an empty data frame to put our results
df <- data.frame(stringsAsFactors = FALSE)
colname <- colnames(month[grep(variable, names(month), ignore.case = TRUE)])
# We loop through the values of `k`
for (k in NAs) {
    if (verbose) print(paste("Running", numberTests, "tests with", k, "NAs."))
    # We now run the original dataEliminator function, `numberTests` times, using the `simplify` argument.
result <- replicate(numberTests, dataEliminator(month = month, NAs = k, sampling = sampling, variables = variable, simplify = TRUE, interpolate = interpolate))
# Then we make a table of the results, proportion of tests passed overall, and broken down by test type.
if (interpolate) {
sta <- c(mean(result[1,]), mean(result[2,]))
} else {
sta <- summary(result)
}
row <- c(colname, k, numberTests, sta)
df <- rbind(df, row, stringsAsFactors = FALSE)
}
if (interpolate) {
names(df) <- c("Variable", "No.NA", "No.Reps", "Mean.Error", "Mean.Error.Approx")
} else {
names(df) <- c("Variable", "No.NA", "No.Reps", "Min", "1st Qu.", "Median", "Mean", "3rd Qu.", "Max")
}
return(df)
}
#' Perform the mass elimination of consecutive daily values from a monthly data set
#'
#' @param month data.frame; a data.frame with a single year-month of data with no missing values
#' @param csv character; path to a .csv file containing a single year-month of data with no missing values
#' @param NAs numeric; a vector of the number of NA values to test
#' @param variables character; the names of the variables to test (we will try to auto-identify the column numbers)
#' @param simplify Boolean; whether to return simplified results
#' @param interpolate Boolean; whether to use linear interpolation to approximate the missing values
#'
#' @importFrom stats sd
#' @importFrom utils read.csv
#' @importFrom zoo na.approx
#'
#' @export
#'
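#' @examples
#' \dontrun{
#' ## A hypothetical call (illustrative only): test every possible placement of
#' ## 1 to 10 consecutive NAs in a stand-in month `jan1981` and summarize.
#' dataEliminatorThorough(month = jan1981, NAs = 1:10, simplify = TRUE)
#' }
#'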
dataEliminatorThorough <- function(month, csv, NAs, variables = c("max", "min", "mean"), simplify = FALSE, interpolate = FALSE) {
  ## This version of the function is for consecutive sampling, and will test every possible
  ## placement of a run of `k` consecutive missing values within the month.
  ## First, I hold the user's hand, making sure that we have all of the information we need
  ## to perform the test.
# Set up the data.frame `month`
if (missing(month) && missing(csv)) csv <- readline(prompt = "Please specify csv file. ")
if (missing(month) && !missing(csv)) month <- read.csv(csv)
# Get the range of `k` and sampling type
  if (missing(NAs)) NAs <- scan(text = readline(prompt = "Enter the values of `k` to test, separated by spaces: "))
# Find out which variables we are interested in
  for (var in seq_along(variables)) {
    variables[var] <- grep(variables[var], names(month), ignore.case = TRUE)
  }
  variables <- as.numeric(variables)
# We make an empty data frame where we will store our results
df2 <- df3 <- data.frame(stringsAsFactors = FALSE)
# Now we start an outer loop, for each value of `k`
for (k in NAs) {
df <- data.frame(stringsAsFactors = FALSE)
    # Now we start an inner loop, which will loop through start dates from the 1st to the (n + 1 - k)th
    for (days in 1:(nrow(month) + 1 - k)) {
      # Choose which days to eliminate. We use a separate name (`NAvec`) so that
      # we don't clobber the `NAs` vector that the outer loop iterates over.
      NAvec <- seq(days, days - 1 + k)
      # We do the nullifying
      monthMod <- month
      monthMod[NAvec, ] <- NA
# Now we start an inner loop that does all the calculations and builds the results table
      for (var in variables) {
        colname <- colnames(month[var])
        month[[var]] <- as.numeric(month[[var]])
        mean <- mean(month[[var]], na.rm = TRUE)
        SD <- sd(month[[var]], na.rm = TRUE)
        meanMod <- mean(monthMod[[var]], na.rm = TRUE)
        deltaMean <- abs(meanMod - mean)
        prop <- deltaMean / SD
        if (interpolate) {
          monthApprox <- na.approx(monthMod[[var]])
          prop <- c(prop, abs(mean(monthApprox, na.rm = TRUE) - mean) / SD)
        }
        # Then we stick it all together in a table.
        row <- c(colname, k, NAvec[1], NAvec[length(NAvec)], mean, SD, meanMod, deltaMean, prop)
        df <- rbind(df, row, stringsAsFactors = FALSE)
}
}
    if (interpolate) {
      names(df) <- c("Variable", "No.NA", "StartDate", "EndDate", "Mean", "StDev", "Mod.Mean", "Abs.Diff", "Mean.Error", "Mean.Error.Approx")
    } else {
      names(df) <- c("Variable", "No.NA", "StartDate", "EndDate", "Mean", "StDev", "Mod.Mean", "Abs.Diff", "Mean.Error")
    }
    # rbind() preserves these column names, so there is no need to re-name df3 after binding.
    df3 <- rbind(df3, df)
# Yet another loop (Optimization be damned!) that takes the summary stats for each variable, in
# case we want a simple summary.
for (var in variables) {
colname <- colnames(month[var])
index <- which(df$Variable == colname)
if (interpolate) {
sta <- c(mean(as.numeric(df$Mean.Error[index])), mean(as.numeric(df$Mean.Error.Approx[index])))
} else {
sta <- summary(as.numeric(df$Mean.Error[index]))
}
row <- c(colname, k, length(1:(nrow(month)+1-k)), sta)
df2 <- rbind(df2, row, stringsAsFactors = FALSE)
}
if (interpolate) {
names(df2) <- c("Variable", "No.NA", "No.Reps", "Mean.Error", "Mean.Error.Approx")
} else {
names(df2) <- c("Variable", "No.NA", "No.Reps", "Min", "1st Qu.", "Median", "Mean", "3rd Qu.", "Max")
}
}
# If we chose the simplify option, we just get the summary results.
if (simplify) {
return(df2)
} else {
return(df3)
}
}
#' Calculate the average of all ensembles for the same Variable/Model/Scenario
#'
#' @param datin a \code{tbl_df} produced by gcm_anomalies().
#'
#' @importFrom dplyr group_by left_join select summarize summarize_all
#' @importFrom magrittr %>%
#' @export
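#'
#' @examples
#' \dontrun{
#' ## A minimal sketch: `anoms` stands in for a table produced by
#' ## gcm_anomalies() with ensemble_average = FALSE.
#' ensemble_means(anoms)
#' }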
ensemble_means <- function(datin) {
ensembles <- datin %>% group_by(Variable, Model, Scenario) %>% summarize(Ensembles = list(unique(Ensemble)))
summaries <- datin %>% select(-4) %>% group_by(Variable, Model, Scenario) %>% summarize_all(mean)
datout <- left_join(ensembles, summaries)
datout
}
#' Generate table GCM anomaly data
#'
#' @param dir character; path to the directory containing the \code{.nc} files you wish to process
#' @param files character; list of filenames to process. Any list item of length > 1 will be processed sequentially and combined into a single series.
#' @param lat numeric; the latitude of the target cell
#' @param lon numeric; the longitude of the target cell
#' @param timeslice numeric; 1-16: annual (0), monthly (1-12), seasonal (13-16)
#' @param baseline the baseline period against which to check the data (defaults to 1971:2000)
#' @param start projection start year (defaults to 2011)
#' @param simple_names Boolean; whether to simplify the projection interval names (i.e. "2020s" instead of "2011-2040")
#' @param calc_anom Boolean; whether to calculate anomaly values.
#' @param ensemble_average Boolean; whether to average different ensembles for the same variable/model/scenario.
#' @param cache_dir character; a directory to cache results (defaults to NULL, i.e. no cache.)
#'
#' @return a class \code{data.frame}
#' @export
#'
#' @importFrom magrittr %>%
#' @importFrom dplyr arrange filter group_by summarize
#' @importFrom ncdf4 nc_close nc_open ncvar_get
#' @importFrom ncdf4.helpers get.split.filename.cmip5 nc.get.time.series nc.get.var.subset.by.axes
#' @importFrom storr storr_rds
#' @importFrom tibble as_tibble tibble
#' @importFrom utils setTxtProgressBar txtProgressBar
#' @importFrom zoo as.yearmon
#'
#' @examples
#' \dontrun{annual <- gcm_anomalies(getwd(), files, -7.023833, -76.465222, 0, 1971:2000, calc_anom = TRUE)}
gcm_anomalies <- function(dir = getwd(), files, lat, lon, timeslice = 0, baseline, start = 2011, simple_names = FALSE, calc_anom = TRUE, ensemble_average = TRUE, cache_dir = NULL) {
  # Set up the cache, if requested. `use_storr` must be defined in both cases, since we test it for every file below.
  use_storr <- !is.null(cache_dir)
  if (use_storr) {
    sto <- storr_rds(cache_dir)
  }
## Generate projection periods based on baseline length and start year
number_of_periods <- (2100 - start + 1) %/% length(baseline)
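  # e.g. with a 30-year baseline and start = 2011, this yields three periods: 2011-2040, 2041-2070 and 2071-2100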
proj <- list()
period_names <- rep(NA, number_of_periods)
for (per in 1:number_of_periods) {
st <- start + (length(baseline) * (per - 1))
ed <- start + (length(baseline) * per) - 1
period <- list(st:ed)
period_names[per] <- if(simple_names) paste0(mean(unlist(period)) %/% 10 * 10, "s") else paste0(st, "-", ed)
proj <- c(proj, period)
}
## Prepare a table with the right dimensions
dat <- as_tibble(matrix(NA, nrow = length(files), ncol = 5 + number_of_periods))
# Process each NetCDF file
print("Processing the NetCDF files...")
## Set up a progress Bar
prog <- txtProgressBar(min = 0, max = length(files), style = 3)
on.exit(close(prog))
for (file_list in seq_along(files)) {
fs <- unlist(files[file_list])
for (f in 1:length(fs)) {
components <- get.split.filename.cmip5(file.path(dir, fs[f]))
var <- unname(components['var'])
model <- unname(components['model'])
scenario <- unname(components['emissions'])
ensemble <- unname(components['run'])
nc_nc <- nc_open(file.path(dir, fs[f]))
if (f == 1) {
# Get the bounds of each cell
lon_bnds <- try(ncvar_get(nc_nc, "lon_bnds"), silent = TRUE)
if (inherits(lon_bnds, "try-error")) {
lon_bnds <- ncvar_get(nc_nc, "lon_bounds")
}
lat_bnds <- try(ncvar_get(nc_nc, "lat_bnds"), silent = TRUE)
if (inherits(lat_bnds, "try-error")) {
lat_bnds <- ncvar_get(nc_nc, "lat_bounds")
}
# Convert Western longitudes to degrees East
        lon <- ifelse(lon < 0, 360 + lon, lon)
# Get the grid cell we are interested in
lat_index <- which(lat_bnds[1,] <= lat & lat_bnds[2,] >= lat)
lon_index <- which(lon_bnds[1,] <= lon & lon_bnds[2,] >= lon)
rm(lat_bnds, lon_bnds); gc()
}
# Check the cache
key <- paste(fs[f], paste(lat_index, collapse = "-"), paste(lon_index, collapse = "-"), sep = "_")
time_tmp <- NULL
if (use_storr) {
if (sto$exists(key)) {
print("Hit the cache")
time_tmp <- sto$get(key)
}
}
if (is.null(time_tmp)) {
print("Missed the cache")
# Get the time
nc_time <- nc.get.time.series(nc_nc, v = var, time.dim.name = "time")
nc_time <- as.yearmon(format(nc_time, format = "%Y-%m-%d hh:mm:ss"))
# Now load only that grid data
nc_var <- nc.get.var.subset.by.axes(nc_nc, var, axis.indices = list(X = lon_index, Y = lat_index))
# Close the nc connection
nc_close(nc_nc); rm(nc_nc)
if (dim(nc_var)[1] > 1 || dim(nc_var)[2] > 1) {
          warning("We got more than one latitude or longitude cell. We'll average across them.")
nc_var <- apply(nc_var, 3, mean)
} else {
nc_var <- nc_var[1, 1, ]
}
if (var %in% c("tas", "tasmax", "tasmin")) {
nc_var <- nc_var - 273.15
} else {
stop("Sorry, for now I only work with temperature!")
}
time_tmp <- tibble(Time = nc_time, Year = format(as.yearmon(Time), format = "%Y"),
Month = format(as.yearmon(Time), format = "%m"),
Var = nc_var)
rm(nc_var, nc_time); gc()
if(use_storr) sto$set(key, time_tmp)
}
if (f == 1) {
time_series <- time_tmp
} else {
time_series <- suppressWarnings(bind_rows(time_series, time_tmp))
}
}
if (timeslice > 0 & timeslice < 13) {
time_series <- filter(time_series, Month == sprintf("%02d", timeslice))
} else if (timeslice > 12 & timeslice < 17) {
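      # Timeslices 13 to 16 map to the climatological seasons DJF, MAM, JJA and SON.
      # For DJF (13), December is counted toward the following year's winter.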
if (timeslice == 13) {
time_series <- filter(time_series, Month %in% sprintf("%02d", c(1, 2, 12)))
time_series$Year[time_series$Month == sprintf("%02d", 12)] <- as.numeric(time_series$Year[time_series$Month == sprintf("%02d", 12)]) + 1
}
if (timeslice == 14) {
time_series <- filter(time_series, Month %in% sprintf("%02d", c(3, 4, 5)))
}
if (timeslice == 15) {
time_series <- filter(time_series, Month %in% sprintf("%02d", c(6, 7, 8)))
}
if (timeslice == 16) {
time_series <- filter(time_series, Month %in% sprintf("%02d", c(9, 10, 11)))
}
}
result <- time_series %>% group_by(Year) %>% summarize(Mean = mean(Var))
result$Year <- as.numeric(result$Year)
periods <- rep(NA, length(proj))
if (scenario == "historical") {
baseln <- mean(result$Mean[result$Year >= min(baseline) & result$Year <= max(baseline)])
} else {
baseln <- NA
for (i in seq_along(proj)) {
periods[i] <- mean(result$Mean[result$Year >= min(proj[[i]]) & result$Year <= max(proj[[i]])])
}
}
col_names <- c("Variable", "Model", "Scenario", "Ensemble", paste0("Baseline ", min(baseline), "-", max(baseline)), period_names)
row <- list(var, model, scenario, ensemble, baseln)
row <- c(row, periods)
names(row) <- col_names
dat[file_list,] <- row
colnames(dat) <- col_names
setTxtProgressBar(prog, value = file_list)
}
# Compute additional metrics
if (calc_anom) dat <- calc_anoms(dat)
if (ensemble_average) dat <- ensemble_means(dat)
# Return our tibble
dat %>% arrange(Model, Scenario, Ensemble)
}
#' Generate a list of GCM files to pass to \code{gcm_anomalies()}
#'
#' @param dir character; the directory containing the files.
#' @param baseline numeric; a vector of years that should be covered by history (\code{NULL} to ignore)
#' @param projection numeric; a vector of years that should be covered by the models (\code{NULL} to ignore)
#' @param scenario character; the scenario to filter on (optional)
#' @param ensemble character; the ensemble to filter on (optional)
#'
#' @return a list of filenames
#'
#' @importFrom tibble add_column as_tibble
#' @importFrom dplyr count filter
#' @importFrom magrittr %>%
#' @importFrom ncdf4.helpers get.split.filename.cmip5
#' @importFrom zoo as.yearmon
#'
#' @export
#'
#' @examples
#' \dontrun{get_gcm_files(getwd(), 1971:2000, scenario = c("rcp85", "rcp45", "rcp26", "rcp60"), ensemble = "r1i1p1")}
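#' \dontrun{
#' ## A hypothetical end-to-end sketch (coordinates and years are placeholders):
#' ## feed the returned file list straight into gcm_anomalies().
#' files <- get_gcm_files(getwd(), 1971:2000, 2011:2100, ensemble = "r1i1p1")
#' gcm_anomalies(getwd(), files, -7.024, -76.465, 0, 1971:2000)
#' }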
get_gcm_files <- function(dir, baseline = NULL, projection = NULL, scenario = NULL, ensemble = NULL) {
# If we want both the baseline and the projections, we'll run this twice, once for each period.
if (!is.null(baseline) && !is.null(projection)) {
baseline_files <- get_gcm_files(dir, baseline, NULL, scenario, ensemble)
projection_files <- get_gcm_files(dir, NULL, projection, scenario, ensemble)
return(append(baseline_files, projection_files))
}
if (is.null(baseline)) period <- projection
if (is.null(projection)) {
period <- baseline
# There are no experiments in the past.
scenario <- "historical"
}
# Get the available files
filenames <- dir(dir)
choices <- as_tibble(t(sapply(filenames, get.split.filename.cmip5)))
names(choices) <- c("Variable", "Period", "Model", "Scenario", "Ensemble", "Range", "Start", "End")
choices <- choices %>% add_column(Filenames = filenames)
# Narrow down our choices
filtered <- choices
if (!is.null(scenario)) filtered <- filtered %>% filter(Scenario %in% scenario)
if (!is.null(ensemble)) filtered <- filtered %>% filter(Ensemble %in% ensemble)
# A lot of useful model output is split into multiple files
multi_models <- filtered %>% group_by(Scenario, Ensemble) %>% count(Model) %>% filter(n > 1)
groupings <- rep(list(NA), nrow(multi_models))
  for (row in seq_len(nrow(multi_models))) {
tab <- filtered %>% filter(Model == multi_models$Model[row], Scenario == multi_models$Scenario[row], Ensemble == multi_models$Ensemble[row])
# If any file for this model gives us all the data we need, we'll skip this row
if (nrow(tab[tab$Start <= paste0(period[1], "01") & tab$End >= paste0(period[length(period)], "12"),]) > 0) {
next
}
# If the data starts too late or ends too soon, we'll skip this row
if (max(tab$End) < paste0(period[length(period)], "12")) {
next
}
if (min(tab$Start) > paste0(period[1], "01")) {
next
}
# If each file starts one year-month after the previous, we're happy!
for (file in seq((max(which(tab$Start <= paste0(period[1], "01"))) + 1), min(which(tab$End >= paste0(period[length(period)], "12"))))) {
if (as.yearmon(tab$Start[file], format = "%Y%m") == (as.yearmon(tab$End[file - 1], format = "%Y%m") + 1/12)) {
if (is.na(groupings[row])) {
groupings[row] <- list(c(tab$Filenames[file - 1], tab$Filenames[file]))
} else {
groupings[row] <- list(c(unlist(groupings[row]), tab$Filenames[file]))
}
} else {
groupings[row] <- NA
}
}
}
# Spit out the groupings, generated above, as well as any file that has all the data we need.
append(groupings[!is.na(groupings)], filtered$Filenames[filtered$Start <= paste0(period[1], "01") & filtered$End >= paste0(period[length(period)], "12")])
}