Commit d2a3693d authored by Conor Anderson's avatar Conor Anderson

Improve baseline and recodes in find_station()

parent cca75719
Pipeline #46421867 failed with stages
in 21 minutes
Package: canadaHCDx
Type: Package
Title: Additional functions for the canadaHCD package
Version: 0.0.7
Date: 2019-02-02
Version: 0.0.8
Date: 2019-02-06
Authors@R: c(person(given = c("Conor", "I."), family = "Anderson",
role = c("aut","cre"), email = "conor.anderson@utoronto.ca"),
person(given = c("William", "A."), family = "Gough", role = "ths",
......
......@@ -6,9 +6,9 @@
#' @param ignore.case logical; by default the search for station names is not case-sensitive.
#' @param glob logical; use wildcards in station name as detailed in \code{\link{glob2rx}}.
#' @param province character; optional character string to filter by a given province. Use full name or two-letter code, e.g. ON for Ontario.
#' @param baseline vector; optional vector with a start and end year for a desired baseline.
#' @param period vector; optional vector with a start and end year for a desired period of data
#' @param type character; period columns to return. Defaults to \code{"daily"}; \code{NULL} returns hourly, daily, and monthly.
#' @param duplicates Boolean; if TRUE, will attempt to provide combinations of stations (at the same coordinates) that provide enough baseline data.
#' @param recodes Boolean; if TRUE, will attempt to provide combinations of stations (at the same coordinates) that provide enough data to cover the period.
#' @param target numeric; optional numeric value of the target (reference) station, or a vector of length 2 containing latitude and longitude (in that order).
#' @param dist numeric; vector with a range of distance from the target in km. Only used if a target is specified. (default is 0:100)
#' @param sort Boolean; if TRUE (default), will sort the resultant table by distance from `target`. Only used if a target is specified.
......@@ -31,21 +31,21 @@
#' find_station("Reg*", glob = TRUE)
#'
#' # Find stations named "Yellowknife", with hourly data available from 1971 to 2000.
#' find_station("Yellowknife", baseline = c(1971, 2000), type = "hourly")
#' find_station("Yellowknife", period = c(1971, 2000), type = "hourly")
#'
#' # Find all stations between 0 and 100 km from Station No. 5051.
#' find_station(target = 5051, dist = 0:100)
find_station <- function(name = NULL, ignore.case = TRUE, glob = FALSE,
province = NULL, baseline = NULL, type = "daily",
duplicates = FALSE, target = NULL, dist = 0:100,
province = NULL, period = NULL, type = "daily",
recodes = FALSE, target = NULL, dist = 0:100,
sort = TRUE, assume_yes = FALSE, ...) {
filt <- get_station_data(assume_yes, quiet = TRUE)
# These seem to be erroneous coords for 20 stations
filt$LatitudeDD[filt$LatitudeDD == 40] <- NA
filt$LongitudeDD[filt$LongitudeDD == -50] <- NA
# These seem to be erroneous coords for some stations
filt$LatitudeDD[filt$LatitudeDD <= 0] <- NA
filt$LongitudeDD[filt$LongitudeDD >= 0] <- NA
# If `name` is not NULL, filter by name
if (!is.null(name)) {
......@@ -59,7 +59,7 @@ find_station <- function(name = NULL, ignore.case = TRUE, glob = FALSE,
if (!is.null(province)) {
p_codes <- c("AB", "BC", "MB", "NB", "NL", "NT", "NS", "NU", "ON", "PE",
"QC", "SK", "YT")
# Identify all stations outside of our baseline
# Identify all stations outside of our period
if (all(nchar(province) == 2L)) {
if (!all(province %in% p_codes)) stop("Incorrect province code(s) provided.")
province <- levels(as.factor(filt$Province))[which(p_codes %in% toupper(province))]
......@@ -72,11 +72,14 @@ find_station <- function(name = NULL, ignore.case = TRUE, glob = FALSE,
}
# Next, set the data we are interested in, if necessary
if (!is.null(baseline)) {
if (baseline[1] > baseline[length(baseline)]) stop("Baseline not chronological.")
if (!is.null(period)) {
if (period[1] > period[length(period)]) {
warning("Period not chronological. Using min value to max value.")
period <- min(period):max(period)
}
if (is.null(type)) {
warning("We need to know the data type. Baseline ignored.")
baseline <- NULL
warning("We need to know the data type. Period ignored.")
period <- NULL
}
}
......@@ -85,9 +88,9 @@ find_station <- function(name = NULL, ignore.case = TRUE, glob = FALSE,
warning("One of more types invalid. Omitting.")
type <- type[type %in% c("hourly", "daily", "monthly")]
}
if (!is.null(baseline) & length(type) > 1) {
warning("We can only filter by one type at a time. Baseline ignored.")
baseline <- NULL
if (!is.null(period) & length(type) > 1) {
warning("We can only filter by one type at a time. Period ignored.")
period <- NULL
}
data_vars <- NULL
if ("hourly" %in% type) data_vars <- c(data_vars, "HourlyFirstYr", "HourlyLastYr")
......@@ -116,33 +119,38 @@ find_station <- function(name = NULL, ignore.case = TRUE, glob = FALSE,
attr(filt, "target_lat") <- p1[2]
}
# If `baseline` is not NULL, filter by available data
if (!is.null(baseline)) {
# If `period` is not NULL, filter by available data
if (!is.null(period)) {
inside <- filter_at(filt, vars(matches(data_vars[1])), all_vars(. <= min(baseline))) %>%
filter_at(vars(matches(data_vars[2])), all_vars(. >= max(baseline)))
inside <- filter_at(filt, vars(matches(data_vars[1])), all_vars(. <= min(period))) %>%
filter_at(vars(matches(data_vars[2])), all_vars(. >= max(period)))
# Keep a record of the stations that are outside of our baseline
# Keep a record of the stations that are outside of our period
outside <- filter(filt, !(.data$StationID %in% inside$StationID))
filt <- inside
# Remind users that stations for which the ID has changed might not be detected
if (duplicates == TRUE) {
coords <- outside %>% group_by(.data$LatitudeDD, .data$LongitudeDD) %>% tally %>% filter(.data$n > 1)
if (recodes == TRUE) {
coords <- outside %>% group_by(.data$LatitudeDD, .data$LongitudeDD) %>% tally %>% filter(.data$n > 1) %>%
filter(!any(is.na(c(LatitudeDD, LongitudeDD))))
printed <- NULL
for (rw in 1:nrow(coords)) {
dups <- filter(outside, .data$LatitudeDD == coords$LatitudeDD[rw] & .data$LongitudeDD == coords$LongitudeDD[rw])
if (isTRUE(sort)) dups <- arrange(dups, .data$StationID)
if (!is.na(min(dups[6])) & min(dups[6]) <= min(baseline) & !is.na(max(dups[7])) & max(dups[7] >= max(baseline))) {
dups <- filter(outside, .data$LatitudeDD == coords$LatitudeDD[rw] & .data$LongitudeDD == coords$LongitudeDD[rw]) %>%
arrange(.data$StationID)
if (!is.na(min(dups[6])) & min(dups[6]) <= min(period) & !is.na(max(dups[7])) & max(dups[7] >= max(period))) {
if (is.null(printed)) {
cat("Note: In addition to the stations found, the following",
"combinations may provide sufficient baseline data.\n\n")
"combinations may provide sufficient period data.\n\n")
printed <- 1
combo <- 1
}
cat(">> Combination", combo, "at coordinates", coords$LatitudeDD[rw],
coords$LongitudeDD[rw], "\n\n")
cat(paste0(dups$StationID, ": ", dups$Name, "\n"), "\n", sep = "")
sprintfs <- paste0("%-", sprintf("%02d", c(max(nchar(dups$StationID)), max(nchar(dups$Name)))), "s")
cat(paste0(sprintf(sprintfs[1], dups$StationID), " : ",
sprintf(sprintfs[2], dups$Name),
" (", dups[[data_vars[1]]],
"\u2012", dups[[data_vars[2]]], ")\n"), "\n", sep = "")
combo <- combo + 1
}
}
......
......@@ -14,7 +14,7 @@ pears <- hcd_daily(5097, 1993)
[![build status](https://gitlab.com/ConorIA/canadaHCDx/badges/master/build.svg)](https://gitlab.com/ConorIA/canadaHCDx/pipelines) [![Build status](https://ci.appveyor.com/api/projects/status/meb87c4uik14wcyj?svg=true)](https://ci.appveyor.com/project/ConorIA/canadahcdx) [![codecov](https://codecov.io/gl/ConorIA/canadaHCDx/branch/master/graph/badge.svg)](https://codecov.io/gl/ConorIA/canadaHCDx)
Environment and Climate Change Canada (ECCC) provides archival climate data for a [wealth of climate stations](ftp://client_climate@ftp.tor.ec.gc.ca/Pub/Get_More_Data_Plus_de_donnees/Station%20Inventory%20EN.csv) across the country. This data is available for download on the ECCC [data portal](http://climate.weather.gc.ca/). This website is useful for browsing data, and is convenient for downloading short periods of data. However, if you are seeking a 30-year baseline, the manual download of the necessary data can become very tedious. In that regard, ECCC also provides a bulk download function to make acquiring that data a little bit easier. While the process is much simpler than it has been in the past, one still must put in a little leg work to:
Environment and Climate Change Canada (ECCC) provides archival climate data for a [wealth of climate stations](ftp://client_climate@ftp.tor.ec.gc.ca/Pub/Get_More_Data_Plus_de_donnees/Station%20Inventory%20EN.csv) across the country. This data is available for download on the ECCC [data portal](http://climate.weather.gc.ca/). This website is useful for browsing data, and is convenient for downloading short periods of data. However, if you are seeking a 30-year period, the manual download of the necessary data can become very tedious. In that regard, ECCC also provides a bulk download function to make acquiring that data a little bit easier. While the process is much simpler than it has been in the past, one still must put in a little leg work to:
1) Identify the station of interest
2) Find the correct data set
......@@ -86,7 +86,7 @@ This package will download the official Environment and Climate Change Canada [s
## Advanced search
This packages masks `find_station()` from `canadaHCD` with a version that includes the ability to search for stations by name (as per the original), but adds filters to search by province, by a given baseline period, and by proximity to another station or a vector of coordinates. You can use any combination of these four filters in your search. There are a few mandatory arguments for each filter. For instance, if you are searching for a certain baseline period, you must also include the type of data you are looking for (hourly, daily, or monthly; defaults to daily). The function is fully documented, so take a look at `?find_station`. Let's look at some examples.
This package masks `find_station()` from `canadaHCD` with a version that includes the ability to search for stations by name (as per the original), but adds filters to search by province, by a given period, and by proximity to another station or a vector of coordinates. You can use any combination of these four filters in your search. There are a few mandatory arguments for each filter. For instance, if you are searching for a certain period of data, you must also include the type of data you are looking for (hourly, daily, or monthly; defaults to daily). The function is fully documented, so take a look at `?find_station`. Let's look at some examples.
### Find all stations in the province of Ontario
```{r}
......@@ -97,7 +97,7 @@ _Note: You can use a vector of more than on province, e.g. `province = c("ON", "
### Find stations named "Toronto", with hourly data available from 1971 to 2000
```{r}
find_station("Toronto", baseline = 1971:2000, type = "hourly")
find_station("Toronto", period = 1971:2000, type = "hourly")
```
### Find all stations between 0 and 100 km from Station No. 5051
```{r}
......@@ -110,10 +110,10 @@ find_station(target = c(43.7860, -79.1873), dist = 0:5)
### Identify stations that have changed name and ID
There have been a number of cases where the same station has changed name and ID over its history. In this case, filtering by baseline might exclude these stations. If you would like the `find_station_adv()` function to try to identify these cases, pass `duplicates = TRUE` to the function. The function will report any combination for which the coordinates are the same, and which, together, provide sufficient baseline data.
There have been a number of cases where the same station has changed name and ID over its history. In this case, filtering by period might exclude these stations. If you would like the `find_station()` function to try to identify these cases, pass `recodes = TRUE` to the function. The function will report any combination of stations for which the coordinates are the same, and which, together, provide sufficient data to cover the period.
```{r}
find_station(baseline = 1981:2010, target = 5051, dist = 0:10, duplicates = TRUE)
find_station(period = 1981:2010, target = 5051, dist = 0:10, recodes = TRUE)
```
## Map function
......
This diff is collapsed.
......@@ -5,9 +5,9 @@
\title{Find a Historical Climate Data station (advanced)}
\usage{
find_station(name = NULL, ignore.case = TRUE, glob = FALSE,
province = NULL, baseline = NULL, type = "daily",
duplicates = FALSE, target = NULL, dist = 0:100, sort = TRUE,
assume_yes = FALSE, ...)
province = NULL, period = NULL, type = "daily", recodes = FALSE,
target = NULL, dist = 0:100, sort = TRUE, assume_yes = FALSE,
...)
}
\arguments{
\item{name}{character; optional character vector or a regular expression to be matched against known station names. See \code{\link{grep}} for details.}
......@@ -18,11 +18,11 @@ find_station(name = NULL, ignore.case = TRUE, glob = FALSE,
\item{province}{character; optional character string to filter by a given province. Use full name or two-letter code, e.g. ON for Ontario.}
\item{baseline}{vector; optional vector with a start and end year for a desired baseline.}
\item{period}{vector; optional vector with a start and end year for a desired period of data}
\item{type}{character; period columns to return. Defaults to \code{"daily"}; \code{NULL} returns hourly, daily, and monthly.}
\item{duplicates}{Boolean; if TRUE, will attempt to provide combinations of stations (at the same coordinates) that provide enough baseline data.}
\item{recodes}{Boolean; if TRUE, will attempt to provide combinations of stations (at the same coordinates) that provide enough data to cover the period.}
\item{target}{numeric; optional numeric value of the target (reference) station, or a vector of length 2 containing latitude and longitude (in that order).}
......@@ -45,7 +45,7 @@ Search for stations in the Historical Climate Data inventory name, available dat
find_station("Reg*", glob = TRUE)
# Find stations named "Yellowknife", with hourly data available from 1971 to 2000.
find_station("Yellowknife", baseline = c(1971, 2000), type = "hourly")
find_station("Yellowknife", period = c(1971, 2000), type = "hourly")
# Find all stations between 0 and 100 km from Station No. 5051.
find_station(target = 5051, dist = 0:100)
......
......@@ -12,9 +12,9 @@ test_that("find_station() can locate a station by name regex", {
expect_output(str(df), "7 variables")
})
## test finding a station by baseline
test_that("find_station() can locate a station by baseline", {
df <- find_station(baseline = 1840:2015, duplicates = TRUE, assume_yes = TRUE)
## test finding a station by period
test_that("find_station() can locate a station by period", {
df <- find_station(period = 1840:2015, recodes = TRUE, assume_yes = TRUE)
expect_that(df, is_a("hcd_station_list"))
expect_that(df, is_a("tbl_df"))
expect_output(str(df), "1 obs")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment