missing_value_lab.R 4.3 KB
Newer Older
1 2
#' Eliminate daily values within months of a data set and report the results
#'
Conor Anderson's avatar
Conor Anderson committed
3
#' @param monthin data.frame; a data.frame with a single year-month of data with no missing values
4
#' @param NAs numeric; a vector of the number of NA values to test
Conor Anderson's avatar
Conor Anderson committed
5
#' @param no_of_tests integer; the number of tests to run; for consecutive sampling, tops at N - k + 1
6 7
#' @param sampling character; the type of sampling to use: (r)andom, (c)onsecutive
#' @param variables character; the names of the variables to test (we will try to auto-idenify the column number)
Conor Anderson's avatar
Conor Anderson committed
8
#' @param cores numeric; the number of cores to parallize over. Defaults to \code{parallel::detectCores() - 1}
9
#'
10
#' @importFrom dplyr %>% bind_rows left_join summarize_all
Conor Anderson's avatar
Conor Anderson committed
11
#' @importFrom iterators icount
Conor Anderson's avatar
Conor Anderson committed
12
#' @importFrom foreach %do% %dopar% foreach
Conor Anderson's avatar
Conor Anderson committed
13 14
#' @importFrom doParallel registerDoParallel
#' @importFrom parallel detectCores makeCluster stopCluster
15 16
#' @importFrom reshape2 melt
#' @importFrom rlang .data
17 18 19 20 21 22 23
#' @importFrom stats sd
#' @importFrom utils read.csv
#' @importFrom zoo na.approx
#'
#' @export
#'

Conor Anderson's avatar
Conor Anderson committed
24
missing_data_lab <- function(monthin, NAs, sampling = "z", no_of_tests = 1, variables = c("max", "min", "mean"), cores) {
25 26 27 28 29 30 31 32 33

  # Get the range of `k` and sampling type
  if (missing(NAs)) NAs <- as.numeric(readline(prompt = "Enter vector of values of `k` to test: "))
  while (sampling != "c" & sampling != "r") {
    sampling <- readline(prompt = "Do you want NAs to be generated (r)andomly, or (c)onsecutively? ")
    if (sampling != "c" & sampling != "r") print("You must choose either (c) or (r).")
  }

  for (var in seq_along(variables)) {
Conor Anderson's avatar
Conor Anderson committed
34
    variables[var] <- grep(variables[var], names(monthin), ignore.case = TRUE)
35 36 37 38
  }
  variables <- as.numeric(variables)

  # First we generate our random data points that will be nullified
Conor Anderson's avatar
Conor Anderson committed
39
  NAvectors <- foreach(k=NAs, .combine = "append") %do% {
40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56
    if (sampling == "r") {
      NAvec <- replicate(no_of_tests, list(sample(1:nrow(monthin), size = k, replace = FALSE)))
    }
    if (sampling == "c") {
      if (nrow(monthin) - k + 1 < no_of_tests) {
        NAvec <- lapply(1:(nrow(monthin) - k + 1), function (x) eval(x:(x-1+k)))
      } else {
        samp_f <- function() {
          samp <- sample(1:(nrow(monthin)+1-k), size = 1, replace = FALSE)
          list(seq(samp, samp-1+k))
        }
        NAvec <- replicate(no_of_tests, samp_f())
      }
    }
    NAvec
  }

Conor Anderson's avatar
Conor Anderson committed
57 58 59 60 61 62
  if (missing(cores)) {
    cores <- detectCores() - 1
  }
  cl <- makeCluster(cores, outfile = "")
  registerDoParallel(cl)
  pb <- txtProgressBar(0, length(NAvectors), style = 3)
63

64 65
  results <- foreach(i = icount(length(NAvectors)),
                     .packages = c("dplyr", "foreach", "tibble", "reshape2", "rlang", "zoo")) %dopar% {
66

Conor Anderson's avatar
Conor Anderson committed
67
    NAs <- unlist(NAvectors[i])
68 69 70 71 72 73 74 75 76 77 78

    month <- foreach(var=variables, .combine = rbind) %do% {
      month <- monthin[,c(1, var)]
      mod_var <- month[[2]]
      mod_var[NAs] <- NA
      month <- tibble(Date = month[[1]], var = colnames(month)[2], Intact = month[[2]], wMiss = mod_var, LinInt = na.approx(mod_var, na.rm = FALSE), SplInt = na.spline(mod_var))
      month
    }

    summary <- month %>%
      melt(id.vars = c("Date", "var"), variable.name = "treatment") %>%
79
      group_by(yearmon = as.yearmon(.data$Date, format = "%Y-%m-%d"), var, treatment) %>%
80 81 82 83 84 85
      select(-`Date`) %>%
      summarize_all(c(mean = mean.default, sd = sd), na.rm = TRUE) %>%
      mutate(k = length(NAs), days = list(NAs))

    summary <- summary %>%
      left_join(., subset(summary, treatment == "Intact", select = c(-`treatment`, -`k`, -`days`)), by = c("yearmon", "var")) %>%
86
      mutate(err = abs(.data$mean.x - .data$mean.y), prop = .data$err / .data$sd.y) %>%
Conor Anderson's avatar
Conor Anderson committed
87
      select(yearmon, var, k, days, treatment, mean = mean.x, sd = sd.x, -`mean.y`, -`sd.y`, err, prop)
88 89 90

    summary <- summary %>%
      left_join(., subset(summary, treatment == "wMiss", select = c(-`treatment`, -`k`, -`days`)), by = c("yearmon", "var")) %>%
91
      mutate(change_real = .data$err.y - .data$err.x, change_prop = .data$prop.y - .data$prop.x) %>%
Conor Anderson's avatar
Conor Anderson committed
92
      select(yearmon, var, k, days, treatment, mean = mean.x, sd = sd.x, -`mean.y`, -`sd.y`, err = err.x, prop = prop.x, change_real, change_prop)
93

Conor Anderson's avatar
Conor Anderson committed
94
    setTxtProgressBar(pb, i)
95 96 97
    list(summary = summary, details = month)
  }

Conor Anderson's avatar
Conor Anderson committed
98 99
  stopCluster(cl)
  suppressWarnings(list(summary = bind_rows(lapply(results, "[[", 1)), details = bind_rows(lapply(results, "[[", 2))))
100
}