Commit 72c9ffe0 authored by Conor Anderson's avatar Conor Anderson

Use a smarter method of getting gcm filenames.

parent fd8ba308
......@@ -5,3 +5,4 @@ README.Rmd
README.md
LICENSE.md
install_claut.R
appveyor.yml
......@@ -22,6 +22,7 @@ Imports:
ncdf4.helpers,
readr,
stats,
storr,
tibble,
utils,
zoo
......
......@@ -8,8 +8,11 @@ export(deltaDTD)
export(ensemble_means)
export(gcm_anomalies)
export(generate_wkt_csv)
export(get_gcm_files)
export(parse_ASCII_grid)
export(trimData)
importFrom(dplyr,arrange)
importFrom(dplyr,count)
importFrom(dplyr,filter)
importFrom(dplyr,group_by)
importFrom(dplyr,group_indices)
......@@ -30,6 +33,8 @@ importFrom(readr,read_table)
importFrom(readr,write_csv)
importFrom(stats,aggregate)
importFrom(stats,sd)
importFrom(storr,storr_rds)
importFrom(tibble,add_column)
importFrom(tibble,as_tibble)
importFrom(tibble,has_name)
importFrom(tibble,tibble)
......
#' Generate table GCM anomaly data
#'
#' @param dir character; path to the directory containing the \code{.nc} files you wish to process
#' @param filters character; vector of filters to limit the \code{.nc} to be used.
#' @param files character; list of filenames to process. Any list item of >1 will be processed sequentially.
#' @param lat numeric; the latitude of the target cell
#' @param lon numeric; the longitude of the target cell
#' @param timeslice numeric; 1-16: annual (0), monthly (1-12), seasonal (13-16)
......@@ -10,14 +10,16 @@
#' @param simple_names Boolean; whether to simplify the projection interval names (ie. 2020s instead of 2011-2040)
#' @param calc_anom Boolean; whether to calculate anomaly values.
#' @param ensemble_average Boolean; whether to average different ensembles for the same variable/model/scenario.
#' @param cache_dir character; a directory to cache results (defaults to NULL, i.e. no cache.)
#'
#' @return a class \code{data.frame}
#' @export
#'
#' @importFrom magrittr %>%
#' @importFrom dplyr filter group_by summarize
#' @importFrom dplyr arrange filter group_by summarize
#' @importFrom ncdf4 nc_close nc_open ncvar_get
#' @importFrom ncdf4.helpers get.split.filename.cmip5 nc.get.time.series nc.get.var.subset.by.axes
#' @importFrom storr storr_rds
#' @importFrom tibble as_tibble tibble
#' @importFrom utils setTxtProgressBar txtProgressBar
#' @importFrom zoo as.yearmon
......@@ -25,7 +27,12 @@
#' @examples
#' \dontrun{annual <- gcm_anomalies(getwd(), -7.023833, -76.465222, 0, 1971:2000, calc_anom = TRUE)}
gcm_anomalies <- function(dir = getwd(), filters, lat, lon, timeslice = 0, baseline, start = 2011, simple_names = FALSE, calc_anom = TRUE, ensemble_average = TRUE) {
gcm_anomalies <- function(dir = getwd(), files, lat, lon, timeslice = 0, baseline, start = 2011, simple_names = FALSE, calc_anom = TRUE, ensemble_average = TRUE, cache_dir = NULL) {
if(!is.null(cache_dir)) {
use_storr = TRUE
sto <- storr_rds(cache_dir)
}
## Generate projection periods based on baseline length and start year
number_of_periods <- (2100 - start + 1) %/% length(baseline)
......@@ -39,60 +46,98 @@ gcm_anomalies <- function(dir = getwd(), filters, lat, lon, timeslice = 0, basel
proj <- c(proj, period)
}
## Get a list of NetCDF files in the directory
files_list <- grep("*.nc", dir(dir), value = TRUE)
if (!missing(filters)) {
files_list <- files_list[grep(paste(filters, collapse="|"), files_list)]
}
## Prepare a table with the right dimensions
dat <- as_tibble(matrix(NA, nrow = length(files_list), ncol = 5 + number_of_periods))
dat <- as_tibble(matrix(NA, nrow = length(files), ncol = 5 + number_of_periods))
# Process each NetCDF file
print("Processing the NetCDF files...")
## Set up a progress Bar
prog <- txtProgressBar(min = 0, max = length(files_list), style = 3)
prog <- txtProgressBar(min = 0, max = length(files), style = 3)
on.exit(close(prog))
for (nc_file in seq_along(files_list)) {
components <- get.split.filename.cmip5(files_list[nc_file])
var <- unname(components['var'])
model <- unname(components['model'])
scenario <- unname(components['emissions'])
ensemble <- unname(components['run'])
nc_nc <- nc_open(file.path(dir, files_list[nc_file]))
nc_time <- nc.get.time.series(nc_nc, v = var, time.dim.name = "time")
nc_time <- as.yearmon(format(nc_time, format = "%Y-%m-%d hh:mm:ss"))
nc_lat <- ncvar_get(nc_nc, varid = "lat")
nc_lon <- ncvar_get(nc_nc, varid = "lon")
# Convert Western longitudes to degrees East
lon <- ifelse(lon <0, 360 + lon, lon)
for (file_list in seq_along(files)) {
fs <- unlist(files[file_list])
for (f in 1:length(fs)) {
components <- get.split.filename.cmip5(file.path(dir, fs[f]))
var <- unname(components['var'])
model <- unname(components['model'])
scenario <- unname(components['emissions'])
ensemble <- unname(components['run'])
nc_nc <- nc_open(file.path(dir, fs[f]))
if (f == 1) {
# Get the bounds of each cell
lon_bnds <- try(ncvar_get(nc_nc, "lon_bnds"), silent = TRUE)
if (inherits(lon_bnds, "try-error")) {
lon_bnds <- ncvar_get(nc_nc, "lon_bounds")
}
lat_bnds <- try(ncvar_get(nc_nc, "lat_bnds"), silent = TRUE)
if (inherits(lat_bnds, "try-error")) {
lat_bnds <- ncvar_get(nc_nc, "lat_bounds")
}
# Convert Western longitudes to degrees East
lon <- ifelse(lon <0, 360 + lon, lon)
# Get the grid cell we are interested in
lat_index <- which(lat_bnds[1,] <= lat & lat_bnds[2,] >= lat)
lon_index <- which(lon_bnds[1,] <= lon & lon_bnds[2,] >= lon)
rm(lat_bnds, lon_bnds); gc()
}
# Get the grid cell we are interested in
lon_index <- which.min(abs(nc_lon - lon))
lat_index <- which.min(abs(nc_lat - lat))
# Check the cache
key <- paste(fs[f], paste(lat_index, collapse = "-"), paste(lon_index, collapse = "-"), sep = "_")
time_tmp <- NULL
# Now load only that grid data
nc_var <- nc.get.var.subset.by.axes(nc_nc, var, axis.indices = list(X = lon_index, Y = lat_index))
if (use_storr) {
if (sto$exists(key)) {
print("Hit the cache")
time_tmp <- sto$get(key)
}
}
# Close the nc connection
nc_close(nc_nc)
rm(nc_nc)
if (is.null(time_tmp)) {
print("Missed the cache")
# Get the time
nc_time <- nc.get.time.series(nc_nc, v = var, time.dim.name = "time")
nc_time <- as.yearmon(format(nc_time, format = "%Y-%m-%d hh:mm:ss"))
# Now load only that grid data
nc_var <- nc.get.var.subset.by.axes(nc_nc, var, axis.indices = list(X = lon_index, Y = lat_index))
# Close the nc connection
nc_close(nc_nc); rm(nc_nc)
if (dim(nc_var)[1] > 1 || dim(nc_var)[2] > 1) {
warning("We got two latitud or longitude cells. We'll average across them.")
nc_var <- apply(nc_var, 3, mean)
} else {
nc_var <- nc_var[1, 1, ]
}
if (var %in% c("tas", "tasmax", "tasmin")) {
nc_var <- nc_var - 273.15
} else {
stop("Sorry, for now I only work with temperature!")
}
time_tmp <- tibble(Time = nc_time, Year = format(as.yearmon(Time), format = "%Y"),
Month = format(as.yearmon(Time), format = "%m"),
Var = nc_var)
rm(nc_var, nc_time); gc()
if(use_storr) sto$set(key, time_tmp)
}
if (var %in% c("tas", "tasmax", "tasmin")) {
var_data <- nc_var[1, 1, ] - 273.15
} else {
stop("Sorry, for now I only work with temperature!")
if (f == 1) {
time_series <- time_tmp
} else {
time_series <- suppressWarnings(bind_rows(time_series, time_tmp))
}
}
time_series <- tibble(Time = nc_time, Year = format(as.yearmon(Time), format = "%Y"),
Month = format(as.yearmon(Time), format = "%m"),
Var = var_data)
if (timeslice > 0 & timeslice < 13) {
time_series <- filter(time_series, Month == sprintf("%02d", timeslice))
} else if (timeslice > 12 & timeslice < 17) {
......@@ -130,13 +175,10 @@ gcm_anomalies <- function(dir = getwd(), filters, lat, lon, timeslice = 0, basel
names(row) <- col_names
dat[nc_file,] <- row
dat[file_list,] <- row
colnames(dat) <- col_names
rm(nc_var, nc_time, nc_lat, nc_lon)
gc()
setTxtProgressBar(prog, value = nc_file)
setTxtProgressBar(prog, value = file_list)
}
# Compute additional metrics
......@@ -144,5 +186,5 @@ gcm_anomalies <- function(dir = getwd(), filters, lat, lon, timeslice = 0, basel
if (ensemble_average) dat <- ensemble_means(dat)
# Return our tibble
dat
dat %>% arrange(Model, Scenario, Ensemble)
}
#' Generate a list of GCM files to pass to \code{gcm_anomalies()}
#'
#' @param dir character; the directory containing the files.
#' @param baseline numeric; a vector of years that should be covered by history (\code{NULL} to ignore)
#' @param projection numeric; a vector of years that should be covered by the models (\code{NULL} to ignore)
#' @param scenario character; the scenario to filter on (optional)
#' @param ensemble character; the ensemble to filter on (optional)
#'
#' @return a list of filenames
#'
#' @importFrom tibble add_column as_tibble
#' @importFrom dplyr count filter group_by
#' @importFrom magrittr %>%
#' @importFrom ncdf4.helpers get.split.filename.cmip5
#' @importFrom zoo as.yearmon
#'
#' @export
#'
#' @examples
#' \dontrun{get_gcm_files(getwd(), 1971:2000, scenario = c("rcp85", "rcp45", "rcp26", "rcp60"), ensemble = "r1i1p1")}
get_gcm_files <- function(dir, baseline = NULL, projection = NULL, scenario = NULL, ensemble = NULL) {
  # If we want both the baseline and the projections, we'll run this twice, once for each period.
  if (!is.null(baseline) && !is.null(projection)) {
    baseline_files <- get_gcm_files(dir, baseline, NULL, scenario, ensemble)
    projection_files <- get_gcm_files(dir, NULL, projection, scenario, ensemble)
    return(append(baseline_files, projection_files))
  }
  if (is.null(baseline)) period <- projection
  if (is.null(projection)) {
    period <- baseline
    # There are no experiments in the past.
    scenario <- "historical"
  }
  # Bounds of the requested period as fixed-width "yyyymm" strings; CMIP5
  # filename dates compare correctly as strings, so compute these once.
  period_start <- paste0(period[1], "01")
  period_end <- paste0(period[length(period)], "12")
  # Get the available files
  filenames <- dir(dir)
  choices <- as_tibble(t(sapply(filenames, get.split.filename.cmip5)))
  names(choices) <- c("Variable", "Period", "Model", "Scenario", "Ensemble", "Range", "Start", "End")
  choices <- choices %>% add_column(Filenames = filenames)
  # Narrow down our choices
  filtered <- choices
  if (!is.null(scenario)) filtered <- filtered %>% filter(Scenario %in% scenario)
  if (!is.null(ensemble)) filtered <- filtered %>% filter(Ensemble %in% ensemble)
  # A lot of useful model output is split into multiple files
  multi_models <- filtered %>% group_by(Scenario, Ensemble) %>% count(Model) %>% filter(n > 1)
  groupings <- rep(list(NA), nrow(multi_models))
  # seq_len() guards the zero-row case: 1:nrow() would iterate over c(1, 0)
  # when no model is split across files, then fail on an empty `tab`.
  for (row in seq_len(nrow(multi_models))) {
    tab <- filtered %>% filter(Model == multi_models$Model[row], Scenario == multi_models$Scenario[row], Ensemble == multi_models$Ensemble[row])
    # If any file for this model gives us all the data we need, we'll skip this row
    if (nrow(tab[tab$Start <= period_start & tab$End >= period_end, ]) > 0) {
      next
    }
    # If the data starts too late or ends too soon, we'll skip this row
    if (max(tab$End) < period_end) {
      next
    }
    if (min(tab$Start) > period_start) {
      next
    }
    # If each file starts one year-month after the previous, we're happy!
    for (file in seq((max(which(tab$Start <= period_start)) + 1), min(which(tab$End >= period_end)))) {
      if (as.yearmon(tab$Start[file], format = "%Y%m") == (as.yearmon(tab$End[file - 1], format = "%Y%m") + 1/12)) {
        if (is.na(groupings[row])) {
          groupings[row] <- list(c(tab$Filenames[file - 1], tab$Filenames[file]))
        } else {
          groupings[row] <- list(c(unlist(groupings[row]), tab$Filenames[file]))
        }
      } else {
        # A gap in coverage invalidates the whole chain for this model.
        groupings[row] <- NA
      }
    }
  }
  # Spit out the groupings, generated above, as well as any file that has all the data we need.
  append(groupings[!is.na(groupings)], filtered$Filenames[filtered$Start <= period_start & filtered$End >= period_end])
}
......@@ -4,14 +4,14 @@
\alias{gcm_anomalies}
\title{Generate table GCM anomaly data}
\usage{
gcm_anomalies(dir = getwd(), filters, lat, lon, timeslice = 0, baseline,
gcm_anomalies(dir = getwd(), files, lat, lon, timeslice = 0, baseline,
start = 2011, simple_names = FALSE, calc_anom = TRUE,
ensemble_average = TRUE)
ensemble_average = TRUE, cache_dir = NULL)
}
\arguments{
\item{dir}{character; path to the directory containing the \code{.nc} files you wish to process}
\item{filters}{character; vector of filters to limit the \code{.nc} to be used.}
\item{files}{character; list of filenames to process. Any list item of >1 will be processed sequentially.}
\item{lat}{numeric; the latitude of the target cell}
......@@ -28,6 +28,8 @@ gcm_anomalies(dir = getwd(), filters, lat, lon, timeslice = 0, baseline,
\item{calc_anom}{Boolean; whether to calculate anomaly values.}
\item{ensemble_average}{Boolean; whether to average different ensembles for the same variable/model/scenario.}
\item{cache_dir}{character; a directory to cache results (defaults to NULL, i.e. no cache.)}
}
\value{
a class \code{data.frame}
......
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/get_gcm_files.R
\name{get_gcm_files}
\alias{get_gcm_files}
\title{Generate a list of GCM files to pass to \code{gcm_anomalies()}}
\usage{
get_gcm_files(dir, baseline = NULL, projection = NULL, scenario = NULL,
ensemble = NULL)
}
\arguments{
\item{dir}{character; the directory containing the files.}
\item{baseline}{numeric; a vector of years that should be covered by history (\code{NULL} to ignore)}
\item{projection}{numeric; a vector of years that should be covered by the models (\code{NULL} to ignore)}
\item{scenario}{character; the scenario to filter on (optional)}
\item{ensemble}{character; the ensemble to filter on (optional)}
}
\value{
a list of filenames
}
\description{
Generate a list of GCM files to pass to \code{gcm_anomalies()}
}
\examples{
\dontrun{get_gcm_files(getwd(), 1971:2000, scenario = c("rcp85", "rcp45", "rcp26", "rcp60"), ensemble = "r1i1p1")}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment