First public commit

parent c5db41c8
^.*\.Rproj$
^\.Rproj\.user$
^\.gitlab-ci\.yml$
README.Rmd
README.md
LICENSE.md
@@ -2,3 +2,4 @@
.Rhistory
.RData
.Ruserdata
+claut.Rproj
@@ -25,4 +25,4 @@ URL: https://gitlab.com/ConorIA/claut
BugReports: https://gitlab.com/ConorIA/claut/issues
Encoding: UTF-8
LazyData: true
-RoxygenNote: 5.0.1
+RoxygenNote: 6.0.1
# Generated by roxygen2: do not edit by hand
export(deltaDTD)
export(dataEliminator)
export(dataEliminatorMassive)
export(dataEliminatorThorough)
export(trimData)
importFrom(stats,aggregate)
importFrom(stats,sd)
importFrom(zoo,as.yearmon)
importFrom(zoo,as.yearqtr)
importFrom(utils,read.csv)
importFrom(zoo,na.approx)
#' Eliminate daily values from a monthly data set
#'
#' @param month data.frame; a data.frame with a single year-month of data with no missing values
#' @param csv character; path to a .csv file containing a single year-month of data with no missing values
#' @param NAs numeric; a vector of the number of NA values to test
#' @param sampling character; the type of sampling to use: (r)andom, (c)onsecutive
#' @param variables character; the names of the variables to test (we will try to auto-identify the column number)
#' @param simplify Boolean; whether to return simplified results
#' @param interpolate Boolean; whether to use linear interpolation to approximate the missing values
#'
#' @importFrom stats sd
#' @importFrom utils read.csv
#' @importFrom zoo na.approx
#'
#' @export
#'
dataEliminator <- function(month, csv, NAs, sampling = "z", variables = c("max", "min", "mean"), simplify = FALSE, interpolate = FALSE) {
## First, I hold the user's hand, making sure that we have all of the information we need to perform the test.
# Set up the data.frame `month`
if (missing(month)) {
if (missing(csv)) csv <- readline(prompt = "Please specify csv file. ")
month <- read.csv(csv)
}
# Get the range of `k` and sampling type
if (missing(NAs)) NAs <- as.numeric(readline(prompt = "Enter vector of values of `k` to test: "))
while (sampling != "c" & sampling != "r") {
sampling <- readline(prompt = "Do you want NAs to be generated (r)andomly, or (c)onsecutively? ")
if (sampling != "c" & sampling != "r") print("You must choose either (c) or (r).")
}
for (var in seq_along(variables)) {
variables[var] <- as.numeric(grep(variables[var], names(month), ignore.case = TRUE))
}
variables <- as.numeric(variables)
# We make an empty data frame where we will store our results
df <- data.frame(stringsAsFactors = FALSE)
# Now we start an outer loop, for each value of `k`
for (k in NAs) {
# First we generate our random data points that will be nullified
if (sampling == "r") {
NAvec <- sample(1:nrow(month), size = k, replace = FALSE)
}
if (sampling == "c") {
NAvec <- sample(1:(nrow(month)+1-k), size = 1, replace = FALSE)
NAvec <- seq(NAvec, NAvec-1+k)
}
# We do the nullifying
monthMod <- month
monthMod[NAvec,] <- NA
# Now we start an inner loop that does all the calculations and builds the results table
for (var in variables) {
colname <- colnames(month[var])
mean <- mean(month[[var]], na.rm = T)
SD <- sd(month[[var]], na.rm = T)
meanMod <- mean(monthMod[[var]], na.rm = T)
deltaMean <- abs(meanMod-mean)
prop <- deltaMean/SD
if (interpolate) {
monthApprox <- na.approx(monthMod[[var]])
prop <- c(prop, abs(mean(monthApprox, na.rm = T)-mean)/SD)
}
# Then we stick it all together in a table.
row <- c(colname, k, mean, SD, meanMod,deltaMean, prop)
df <- rbind(df, row, stringsAsFactors = FALSE)
}
}
if (interpolate) {
names(prop) <- c("Mean.Error", "Mean.Error.Approx")
names(df) <- c("Variable", "No.NA", "Mean", "StDev", "Mod.Mean", "Abs.Diff", "Mean.Error", "Mean.Error.Approx")
} else {
names(df) <- c("Variable", "No.NA", "Mean", "StDev", "Mod.Mean", "Abs.Diff", "Mean.Error")
}
# If we chose the simplify option, we just get the binary result. This is useful for running the test a massive number of times, but only for a single variable. See the appendix!
if (simplify) {
return(prop)
} else {
return(df)
}
}
#' Perform the mass elimination of daily values from a monthly data set
#'
#' @param numberTests integer; the number of times to repeat the test
#' @param month data.frame; a data.frame with a single year-month of data with no missing values
#' @param csv character; path to a .csv file containing a single year-month of data with no missing values
#' @param NAs numeric; a vector of the number of NA values to test
#' @param sampling character; the type of sampling to use: (r)andom, (c)onsecutive
#' @param variable character; the name of the variable to test (we will try to auto-identify the column number)
#' @param verbose Boolean; whether the function should be verbose
#' @param interpolate Boolean; whether to use linear interpolation to approximate the missing values
#'
#' @importFrom utils read.csv
#'
#' @export
#'
dataEliminatorMassive <- function(numberTests, month, csv, NAs, sampling = "z", variable = c("max", "min", "mean"), verbose = FALSE, interpolate = FALSE) {
#source("dataEliminator.R")
# We again hold the user's hand to come up with all the info we need
if (missing(numberTests)) numberTests <- as.numeric(readline(prompt = "How many times should I repeat the test? "))
if (missing(month) && missing(csv)) csv <- readline(prompt = "Please specify csv file. ")
if (missing(month) && !missing(csv)) month <- read.csv(csv)
if (missing(NAs)) NAs <- as.numeric(readline(prompt = "Enter vector of values of `k` to test: "))
while (sampling != "c" & sampling != "r") {
sampling <- readline(prompt = "Do you want NAs to be generated (r)andomly, or (c)onsecutively? ")
if (sampling != "c" & sampling != "r") print("You must choose either (c) or (r).")
}
while (length(variable) > 1 || !(variable %in% c("max", "min", "mean"))) {
print("Please choose the variable to test.")
variable <- readline(prompt = "Enter one of \"max\", \"min\", or \"mean\". ")
}
# We create an empty data frame to put our results
df <- data.frame(stringsAsFactors = FALSE)
colname <- colnames(month[grep(variable, names(month), ignore.case = TRUE)])
# We loop through the values of `k`
for (k in NAs) {
if (verbose == TRUE) print(paste("Running", numberTests, "tests with", k, "NAs."))
# We now run the original dataEliminator function, `numberTests` times, using the `simplify` argument.
result <- replicate(numberTests, dataEliminator(month = month, NAs = k, sampling = sampling, variables = variable, simplify = TRUE, interpolate = interpolate))
# Then we make a table of the results, proportion of tests passed overall, and broken down by test type.
if (interpolate) {
sta <- c(mean(result[1,]), mean(result[2,]))
} else {
sta <- summary(result)
}
row <- c(colname, k, numberTests, sta)
df <- rbind(df, row, stringsAsFactors = FALSE)
}
if (interpolate) {
names(df) <- c("Variable", "No.NA", "No.Reps", "Mean.Error", "Mean.Error.Approx")
} else {
names(df) <- c("Variable", "No.NA", "No.Reps", "Min", "1st Qu.", "Median", "Mean", "3rd Qu.", "Max")
}
return(df)
}
#' Perform the mass elimination of consecutive daily values from a monthly data set
#'
#' @param month data.frame; a data.frame with a single year-month of data with no missing values
#' @param csv character; path to a .csv file containing a single year-month of data with no missing values
#' @param NAs numeric; a vector of the number of NA values to test
#' @param variables character; the names of the variables to test (we will try to auto-identify the column number)
#' @param simplify Boolean; whether to return simplified results
#' @param interpolate Boolean; whether to use linear interpolation to approximate the missing values
#'
#' @importFrom stats sd
#' @importFrom utils read.csv
#' @importFrom zoo na.approx
#'
#' @export
#'
dataEliminatorThorough <- function(month, csv, NAs, variables = c("max", "min", "mean"), simplify = FALSE, interpolate = FALSE) {
## This version of the function is for consecutive sampling, and will test all possible
## permutations of a month with `k` missing values
## First, I hold the user's hand, making sure that we have all of the information we need
## to perform the test.
# Set up the data.frame `month`
if (missing(month) && missing(csv)) csv <- readline(prompt = "Please specify csv file. ")
if (missing(month) && !missing(csv)) month <- read.csv(csv)
# Get the range of `k` and sampling type
if (missing(NAs)) NAs <- as.numeric(readline(prompt = "Enter vector of values of `k` to test: "))
# Find out which variables we are interested in
for (var in seq_along(variables)) {
variables[var] <- as.numeric(grep(variables[var], names(month), ignore.case = TRUE))
}
variables <- as.numeric(variables)
# We make an empty data frame where we will store our results
df2 <- df3 <- data.frame(stringsAsFactors = FALSE)
# Now we start an outer loop, for each value of `k`
for (k in NAs) {
df <- data.frame(stringsAsFactors = FALSE)
# Now we start an inner loop, which will loop through dates from the 1st to the (n+1-k)th
# Note that the loop variables here use different letters than those above.
for (days in 1:(nrow(month)+1-k)) {
# Choose which days to eliminate
NAs <- seq(days, days-1+k)
# We do the nullifying
monthMod <- month
monthMod[NAs,] <- NA
# Now we start an inner loop that does all the calculations and builds the results table
for (var in variables) {
colname <- colnames(month[var])
month[[var]] <- as.numeric(month[[var]])
mean <- mean(month[[var]], na.rm = TRUE)
SD <- sd(month[[var]], na.rm = T)
meanMod <- mean(monthMod[[var]], na.rm = TRUE)
deltaMean <- abs(meanMod-mean)
prop <- deltaMean/SD
if (interpolate) {
monthApprox <- na.approx(monthMod[[var]])
prop <- c(prop, abs(mean(monthApprox, na.rm = TRUE)-mean)/SD)
}
# Then we stick it all together in a table.
row <- c(colname, k, NAs[1], NAs[length(NAs)], mean, SD, meanMod,deltaMean, prop)
df <- rbind(df, row, stringsAsFactors = FALSE)
}
}
if (interpolate) {
names(df) <- c("Variable", "No.NA", "StartDate", "EndDate", "Mean", "StDev", "Mod.Mean", "Abs.Diff", "Mean.Error", "Mean.Error.Approx")
} else {
names(df) <- c("Variable", "No.NA", "StartDate", "EndDate", "Mean", "StDev", "Mod.Mean", "Abs.Diff", "Mean.Error")
}
df3 <- rbind(df3, df)
if (interpolate) {
names(df3) <- c("Variable", "No.NA", "StartDate", "EndDate", "Mean", "StDev", "Mod.Mean", "Abs.Diff", "Mean.Error", "Mean.Error.Approx")
} else {
names(df3) <- c("Variable", "No.NA", "StartDate", "EndDate", "Mean", "StDev", "Mod.Mean", "Abs.Diff", "Mean.Error")
}
# Yet another loop (Optimization be damned!) that takes the summary stats for each variable, in
# case we want a simple summary.
for (var in variables) {
colname <- colnames(month[var])
index <- which(df$Variable == colname)
if (interpolate) {
sta <- c(mean(as.numeric(df$Mean.Error[index])), mean(as.numeric(df$Mean.Error.Approx[index])))
} else {
sta <- summary(as.numeric(df$Mean.Error[index]))
}
row <- c(colname, k, length(1:(nrow(month)+1-k)), sta)
df2 <- rbind(df2, row, stringsAsFactors = FALSE)
}
if (interpolate) {
names(df2) <- c("Variable", "No.NA", "No.Reps", "Mean.Error", "Mean.Error.Approx")
} else {
names(df2) <- c("Variable", "No.NA", "No.Reps", "Min", "1st Qu.", "Median", "Mean", "3rd Qu.", "Max")
}
}
# If we chose the simplify option, we just get the summary results.
if (simplify) {
return(df2)
} else {
return(df3)
}
}
##' @title Calculate deltaDTD on any time scale
##'
##' @description "A function that calculates DTD, SD, G, and deltaDTD on daily, monthly, seasonal, or annual time scales. It automatically eliminates aggregate values with higher than 20\% missing values. This function requres a data frame with dates in column 1 called 'Date', Tmax in column 2 called 'MaxTemp', Tmin in column 3 called 'MinTemp'. No other data is necessary Note that the data aquired though the \code{canadaHCD} package meets these needs."
##'
##' @param data data.frame; define the data that should be analyzed
##' @param period character; The period for the output data. One of "daily", "monthly", "seasonal", or "annual".
##' @param QA logical; whether to filter out aggregate values that are missing more than 20\% of observations
##'
##' @author Conor I. Anderson
##'
##' @importFrom zoo as.yearmon as.yearqtr
##' @importFrom stats aggregate sd
##'
##' @export
##'
##' @examples
##' \dontrun{deltaDTD(tor_dly, "annual")}
deltaDTD <- function(data, period = "z", QA = TRUE) {
if(period != "annual" & period != "seasonal" & period != "monthly" & period != "daily") {
stop("I don't recognize the period you specified.")
}
# Make sure the Year, Month, and Day are at the start of the table.
data$Date <- as.Date(data$Date)
Year <- format(data$Date, format = "%Y")
Month <- format(data$Date, format = "%m")
Day <- format(data$Date, format = "%d")
data <- cbind(Year, Month, Day, data)
# Make a copy of the data to modify
dat <- data
# Calculate the DTD value.
dat$DTD_Tmax <- dat$MaxTemp - c(NA, dat$MaxTemp[1:(nrow(dat)-1)])
dat$DTD_Tmin <- dat$MinTemp - c(NA, dat$MinTemp[1:(nrow(dat)-1)])
# Calculate deltaDTD
dat$deltaDTD <- dat$DTD_Tmax-dat$DTD_Tmin
# Drop the data we don't need from the table.
dat <- cbind(dat[,1:6], dat[,(ncol(dat)-2):ncol(dat)])
## Stop here for daily data
if(period == "daily") return(dat)
if(period == "monthly"){
if(QA) {
# Count NA values for each variable
dat$Tmax_test <- is.na(dat$MaxTemp)
dat$Tmin_test <- is.na(dat$MinTemp)
dat$period_test <- rep(1, nrow(dat))
# Calculate number of missing values
Tmax_test <- stats::aggregate(Tmax_test ~ Month + Year , dat , sum, drop = FALSE)
Tmin_test <- stats::aggregate(Tmin_test ~ Month + Year , dat , sum, drop = FALSE)
# Total number of days in each month
period_test <- stats::aggregate(period_test ~ Month + Year , dat , sum, drop = FALSE)
# Take note of those months that are missing more than 20% of the data
Tmax_test <- Tmax_test$Tmax_test > (0.2 * period_test$period_test)
Tmin_test <- Tmin_test$Tmin_test > (0.2 * period_test$period_test)
}
# Aggregate dat to monthly values.
Tmax <- stats::aggregate(MaxTemp ~ Month + Year , dat , mean, drop = FALSE)
Tmin <- stats::aggregate(MinTemp ~ Month + Year , dat , mean, drop = FALSE)
DTD_Tmax <- stats::aggregate(DTD_Tmax ~ Month + Year , dat , mean, drop = FALSE)
DTD_Tmin <- stats::aggregate(DTD_Tmin ~ Month + Year , dat , mean, drop = FALSE)
# Calculate Standard Deviation
SD_Tmax <- stats::aggregate(MaxTemp ~ Month + Year , dat , stats::sd, drop = FALSE)
SD_Tmin <- stats::aggregate(MinTemp ~ Month + Year , dat , stats::sd, drop = FALSE)
# Now we stick everything together in a new data frame.
dat <- Tmax
dat$MinTemp <- Tmin$MinTemp
dat$DTD_Tmax <- DTD_Tmax$DTD_Tmax
dat$DTD_Tmin <- DTD_Tmin$DTD_Tmin
dat$SD_Tmax <- SD_Tmax$MaxTemp
dat$SD_Tmin <- SD_Tmin$MinTemp
if(QA){
# Wipe out the months with too much missing data.
dat[Tmax_test,c(3,5,7)] <- NA
dat[Tmin_test,c(4,6,8)] <- NA
}
# Calculate G value
dat$G_Tmax <- dat$DTD_Tmax/dat$SD_Tmax
dat$G_Tmin <- dat$DTD_Tmin/dat$SD_Tmin
# Calculate deltaDTD
dat$deltaDTD <- dat$DTD_Tmax-dat$DTD_Tmin
# Add Yr-Mon and organize the table
dat$Yr.Mon <- zoo::as.yearmon(paste(dat$Year, dat$Month, sep = "-"))
dat <- cbind(dat[2],dat[1],dat[ncol(dat)],dat[3:(ncol(dat)-1)])
## Stop here for monthly data
return(dat)
} else {
## Take this route for seasonal data
if (period == "seasonal") {
## Shift Decembers up one year (but first, convert factors to numbers)
dat$Month <- as.numeric(as.character(dat$Month))
dat$Year <- as.numeric(as.character(dat$Year))
dat$Year[dat$Month == 12] <- dat$Year[dat$Month == 12] + 1
## Convert Months to Seasons
dat$Month[dat$Month == 1 | dat$Month == 2 | dat$Month == 12] <- 1
dat$Month[dat$Month == 3 | dat$Month == 4 | dat$Month == 5] <- 2
dat$Month[dat$Month == 6 | dat$Month == 7 | dat$Month == 8] <- 3
dat$Month[dat$Month == 9 | dat$Month == 10 | dat$Month == 11] <- 4
names(dat)[2] <- "Season"
if(QA) {
# Count NA values for each variable
dat$Tmax_test <- is.na(dat$MaxTemp)
dat$Tmin_test <- is.na(dat$MinTemp)
dat$period_test <- rep(1, nrow(dat))
# Calculate number of missing values
Tmax_test <- stats::aggregate(Tmax_test ~ Season + Year, dat , sum, drop = FALSE)
Tmin_test <- stats::aggregate(Tmin_test ~ Season + Year, dat , sum, drop = FALSE)
# Total number of days in each season
period_test <- stats::aggregate(period_test ~ Season + Year, dat , sum, drop = FALSE)
# Take note of those seasons that are missing 20% of data
Tmax_test <- Tmax_test$Tmax_test > (0.2 * period_test$period_test)
Tmin_test <- Tmin_test$Tmin_test > (0.2 * period_test$period_test)
# Take note of any seasons that have fewer than 90 days (e.g. the first and last)
period_test <- period_test$period_test < 90
}
# Aggregate dat to seasonal values.
Tmax <- stats::aggregate(MaxTemp ~ Season + Year, dat , mean, drop = FALSE)
Tmin <- stats::aggregate(MinTemp ~ Season + Year, dat , mean, drop = FALSE)
DTD_Tmax <- stats::aggregate(DTD_Tmax ~ Season + Year, dat , mean, drop = FALSE)
DTD_Tmin <- stats::aggregate(DTD_Tmin ~ Season + Year, dat , mean, drop = FALSE)
SD_Tmax <- stats::aggregate(MaxTemp ~ Season + Year, dat , stats::sd, drop = FALSE)
SD_Tmin <- stats::aggregate(MinTemp ~ Season + Year, dat , stats::sd, drop = FALSE)
# Now we stick everything together in a new data frame.
dat <- Tmax
dat$MinTemp <- Tmin$MinTemp
dat$DTD_Tmax <- DTD_Tmax$DTD_Tmax
dat$DTD_Tmin <- DTD_Tmin$DTD_Tmin
dat$SD_Tmax <- SD_Tmax$MaxTemp
dat$SD_Tmin <- SD_Tmin$MinTemp
if(QA){
# Trim tests if they overshoot the stats::aggregate data
if(length(Tmax_test) > nrow(dat)) Tmax_test <- Tmax_test[1:nrow(dat)]
if(length(Tmin_test) > nrow(dat)) Tmin_test <- Tmin_test[1:nrow(dat)]
if(length(period_test) > nrow(dat)) period_test <- period_test[1:nrow(dat)]
# Wipe out the seasons with too much missing data.
dat[Tmax_test,c(3,5,7)] <- NA
dat[Tmin_test,c(4,6,8)] <- NA
# Trim incomplete seasons at the start and end of the data set
dat[period_test,3:8] <- NA
# Get rid of extra year on the end if it got added
if (max(as.numeric(as.character(dat$Year))) >
max(as.numeric(as.character(data$Year)))) {
dat <- dat[-which(as.numeric(as.character(dat$Year)) >
max(as.numeric(as.character(data$Year)))),]
}
}
# Calculate G value
dat$G_Tmax <- dat$DTD_Tmax/dat$SD_Tmax
dat$G_Tmin <- dat$DTD_Tmin/dat$SD_Tmin
# Calculate deltaDTD
dat$deltaDTD <- dat$DTD_Tmax-dat$DTD_Tmin
# Add Yr-Season, and sort the table
dat$Yr.S <- zoo::as.yearqtr(paste(dat$Year, dat$Season, sep = "-"))
dat <- cbind(dat[2],dat[1],dat[ncol(dat)],dat[3:(ncol(dat)-1)])
## Stop here for seasonal data
return(dat)
} else {
## Take this route for annual data
if(QA) {
# Count NA values for each variable
dat$Tmax_test <- is.na(dat$MaxTemp)
dat$Tmin_test <- is.na(dat$MinTemp)
dat$period_test <- rep(1, nrow(dat))
# Calculate number of missing values
Tmax_test <- stats::aggregate(Tmax_test ~ Year , dat , sum, drop = FALSE)
Tmin_test <- stats::aggregate(Tmin_test ~ Year , dat , sum, drop = FALSE)
# Total number of days in each year
period_test <- stats::aggregate(period_test ~ Year , dat , sum, drop = FALSE)
# Take note of those years that are missing 20% of data
Tmax_test <- Tmax_test$Tmax_test > (0.2 * period_test$period_test)
Tmin_test <- Tmin_test$Tmin_test > (0.2 * period_test$period_test)
}
# Aggregate dat to annual values.
Tmax <- stats::aggregate(MaxTemp ~ Year , dat , mean, drop = FALSE)
Tmin <- stats::aggregate(MinTemp ~ Year , dat , mean, drop = FALSE)
DTD_Tmax <- stats::aggregate(DTD_Tmax ~ Year , dat , mean, drop = FALSE)
DTD_Tmin <- stats::aggregate(DTD_Tmin ~ Year , dat , mean, drop = FALSE)
# Calculate standard deviation
SD_Tmax <- stats::aggregate(MaxTemp ~ Year , dat , stats::sd, drop = FALSE)
SD_Tmin <- stats::aggregate(MinTemp ~ Year , dat , stats::sd, drop = FALSE)
# Now we stick everything together in a new data frame.
dat <- Tmax
dat$MinTemp <- Tmin$MinTemp
dat$DTD_Tmax <- DTD_Tmax$DTD_Tmax
dat$DTD_Tmin <- DTD_Tmin$DTD_Tmin
dat$SD_Tmax <- SD_Tmax$MaxTemp
dat$SD_Tmin <- SD_Tmin$MinTemp
if(QA){
# Trim cells with >20% missing values
dat[Tmax_test,c(2,4,6)] <- NA
dat[Tmin_test,c(3,5,7)] <- NA
}
# Calculate G value
dat$G_Tmax <- dat$DTD_Tmax/dat$SD_Tmax
dat$G_Tmin <- dat$DTD_Tmin/dat$SD_Tmin
# Calculate deltaDTD
dat$deltaDTD <- dat$DTD_Tmax-dat$DTD_Tmin
## Stop here for annual data
return(dat)
}
}
}
@@ -2,9 +2,10 @@
##'
##' @description Trim a daily data set to a desired start and end year. Originally written for data obtained using the \code{canadaHCD} package.
##'
-##' @param data data.frame; define the data that should be trimmed
+##' @param datain data.frame; define the data that should be trimmed
##' @param start numerical; the first year to include
##' @param end numerical; the last year to include
+##' @param column integer or character; the number or the name of the date column
##'
##' @author Conor I. Anderson
##'
@@ -13,25 +14,31 @@
##' @examples
##' \dontrun{trimData(tor_dly, 1991, 2010)}
-trimData <- function(data, start, end) {
+trimData <- function(datain, start, end, column = "Date") {
-dat <- data
+if (inherits(column, "integer")) {
+col <- column
+} else if (inherits(column, "character")) {
+col <- which(names(datain) == column)
+} else {
+stop("We couldn't identify the right column.")
+}
+dat <- datain
# Make sure the Day, Month, and Year are at the start of the table.
-dat$Date <- as.Date(dat$Date)
-Year <- format(dat$Date, format = "%Y")
-dat <- cbind(Year, dat)
+dat[[col]] <- as.Date(dat[[col]])
-start <- grep(start, dat$Year)
+start <- which(format(dat[[col]], format = "%Y") == start)
if (length(start) > 0) {
start <- min(start)
} else start <- 1
-end <- grep(end, dat$Year)
+end <- which(format(dat[[col]], format = "%Y") == end)
if (length(end) > 0) {
end <- max(end)
} else end <- nrow(dat)
if (start > 1 | end < nrow(dat)) {
-dat <- subset(dat[start:end,2:ncol(dat)])
+dat <- subset(dat[start:end,])
return(dat)
-} else return(data)
+} else return(datain)
}
---
title: "claut"
output: github_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
## Functions from the University of Toronto Climate Lab
This package, currently at a very early stage of development, will eventually host many of the functions generated at the University of Toronto Climate Lab. For now, I (Conor Anderson) am the sole maintainer of, and contributor to, this package; however, I hope that eventually all of the miscellaneous functions produced in the lab can find their way into this package. Every function contained in this package is documented; type `?function_name` in R to access this documentation.
## Functions used in papers in review
The following functions were used in studies that are currently under review. If you are a reviewer, this is probably what you are looking for. Note that these functions may undergo some minor optimization or code changes, but the results that they produce will always be the same.
- dataEliminator: This set of functions is used to artificially introduce missing values into monthly climate series. In its current state, there are three variations of the function; these are likely to be refactored. A short usage sketch follows the list below.
1. [`dataEliminator()`](https://gitlab.com/ConorIA/claut/blob/master/R/dataEliminator.R): The base function that eliminates data for a single year-month of data
2. [`dataEliminatorMassive()`](https://gitlab.com/ConorIA/claut/blob/master/R/dataEliminatorMassive.R): A helper function that calls the base function multiple times, e.g. 1000 repetitions of the same test
3. [`dataEliminatorThorough()`](https://gitlab.com/ConorIA/claut/blob/master/R/dataEliminatorThorough.R): A modified version of the base function that performs consecutive elimination of all possible combinations of $k$ consecutive missing values
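
For illustration, here is a minimal, not-run sketch of the first two functions. The one-month data frame and its column names (`MaxTemp`, `MinTemp`, `MeanTemp`) are invented for the example; any single year-month of complete daily data with columns matching "max", "min", and "mean" should behave the same way.

```r
# A made-up January with complete daily values (column names are hypothetical)
set.seed(1)
jan <- data.frame(MaxTemp  = rnorm(31, mean = 5, sd = 3),
                  MinTemp  = rnorm(31, mean = -5, sd = 3),
                  MeanTemp = rnorm(31, mean = 0, sd = 3))

# Knock out 3 and then 5 random days and compare the modified monthly means
dataEliminator(month = jan, NAs = c(3, 5), sampling = "r")

# Repeat the 3-NA test 1000 times for the daily maximum only
dataEliminatorMassive(numberTests = 1000, month = jan, NAs = 3,
                      sampling = "r", variable = "max")
```
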
## Misc functions
There are some other helper functions in this package that are here in hopes that they prove useful to someone someday. These are:
- [`trimData()`](https://gitlab.com/ConorIA/claut/blob/master/R/trimData.R): An easy function to trim a `data.frame` to given start and end years (a short usage sketch follows)
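
A minimal, not-run usage sketch; `tor_dly` is simply a placeholder for a daily data frame with a `Date` column (for example, data obtained with the canadaHCD package).

```r
# Keep only the observations from 1991 through 2010 (the date column is
# assumed to be called "Date"; use the `column` argument to point elsewhere)
tor_9110 <- trimData(tor_dly, start = 1991, end = 2010)
```
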
## Other packages
Members of the lab also use other packages that are too big to fit into this package. These are:
### Packages we maintain:
- [senamhiR: A Collection of Functions to Obtain Peruvian Climate Data](https://gitlab.com/ConorIA/senamhiR/)
- [canadaHCDx: Additional functions for the canadaHCD package](https://gitlab.com/ConorIA/canadaHCDx/)
### Packages we use:
- [canadaHCD: Canadian Historical Climate Data](https://github.com/gavinsimpson/canadaHCD/) by [Gavin L. Simpson](https://github.com/gavinsimpson)
## Contributing
The members of the Climate Lab are, first and foremost, academics and researchers, not coders. If you would like to contribute code improvements or patches to any of the functions here, please feel free to open a [merge request](https://gitlab.com/ConorIA/claut/merge_requests). Please note that code you contribute will be attributed to you, and will be released under the terms of the [GNU GPLv3](https://gitlab.com/ConorIA/claut/blob/master/LICENSE.md).
claut
================
Functions from the University of Toronto Climate Lab
----------------------------------------------------
This package, currently at a very early stage of development, will eventually host many of the functions generated at the University of Toronto Climate Lab. For now, I (Conor Anderson) am the sole maintainer of, and contributor to, this package; however, I hope that eventually all of the miscellaneous functions produced in the lab can find their way into this package. Every function contained in this package is documented; type `?function_name` in R to access this documentation.
Functions used in papers in review
----------------------------------
The following functions were used in studies that are currently under review. If you are a reviewer, this is probably what you are looking for. Note that these functions may undergo some minor optimization or code changes, but the results that they produce will always be the same.
- dataEliminator: This set of functions is used to artificially introduce missing values into monthly climate series. In its current state, there are three variations of the function; these are likely to be refactored.
1. [`dataEliminator()`](https://gitlab.com/ConorIA/claut/blob/master/R/dataEliminator.R): The base function that eliminates data for a single year-month of data
2. [`dataEliminatorMassive()`](https://gitlab.com/ConorIA/claut/blob/master/R/dataEliminatorMassive.R): A helper function that calls the base function multiple times, e.g. 1000 repetitions of the same test
3. [`dataEliminatorThorough()`](https://gitlab.com/ConorIA/claut/blob/master/R/dataEliminatorThorough.R): A modified version of the base function that performs consecutive elimination of all possible combinations of *k* consecutive missing values
Misc functions
--------------
There are some other helper functions in this package that are here in hopes that they prove useful to someone someday. These are:
- [`trimData()`](https://gitlab.com/ConorIA/claut/blob/master/R/trimData.R): An easy function to trim a `data.frame` to given start and end years
Other packages
--------------
Members of the lab also use other packages that are too big to fit into this package. These are:
### Packages we maintain:
- [senamhiR: A Collection of Functions to Obtain Peruvian Climate Data](https://gitlab.com/ConorIA/senamhiR/)
- [canadaHCDx: Additional functions for the canadaHCD package](https://gitlab.com/ConorIA/canadaHCDx/)
### Packages we use:
- [canadaHCD: Canadian Historical Climate Data](https://github.com/gavinsimpson/canadaHCD/) by [Gavin L. Simpson](https://github.com/gavinsimpson)
Contributing
------------
The members of the Climate Lab are, first and foremost, academics and researchers, not coders. If you would like to contribute code improvements or patches to any of the functions here, please feel free to open a [merge request](https://gitlab.com/ConorIA/claut/merge_requests). Please note that code you contribute will be attributed to you, and will be released under the terms of the [GNU GPLv3](https://gitlab.com/ConorIA/claut/blob/master/LICENSE.md).
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/dataEliminator.R
\name{dataEliminator}
\alias{dataEliminator}
\title{Eliminate daily values from a monthly data set}
\usage{
dataEliminator(month, csv, NAs, sampling = "z", variables = c("max", "min",
"mean"), simplify = FALSE, interpolate = FALSE)