diff --git a/.azure/prepare.bash b/.azure/prepare.bash index cd1c77da..d5a61dc5 100644 --- a/.azure/prepare.bash +++ b/.azure/prepare.bash @@ -6,5 +6,5 @@ mkdir -p "${R_LIBS_USER}" Rscript -e "install.packages(c('git2r', 'covr', 'withr', 'devtools', 'lintr', 'mockery'), repos='https://cloud.r-project.org', lib='${R_LIBS_USER}')" Rscript -e "install.packages(c('DSI', 'metafor', 'meta'), repos='https://cloud.r-project.org', lib='${R_LIBS_USER}')" -Rscript -e "withr::with_libpaths(new = '${R_LIBS_USER}', devtools::install_github('datashield/dsBaseClient', ref='6.1.0'))" +Rscript -e "withr::with_libpaths(new = '${R_LIBS_USER}', devtools::install_github('datashield/dsBaseClient', ref='v6.2-dev'))" cd "${BUILD_REPOSITORY_LOCALPATH}" \ No newline at end of file diff --git a/DESCRIPTION b/DESCRIPTION index 2fa71524..cadcf5c3 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: dsHelper Type: Package Title: Helper Functions for Use with 'DataSHIELD' -Version: 0.1.2.9000 +Version: 1.0.0.9000 Description: Often we need to automate things with 'DataSHIELD'. These functions help to do that. 
Authors@R: c(person(given= "Tim", @@ -22,7 +22,7 @@ Maintainer: Tim Cadman Depends: R (>= 3.5.0) Imports: - dsBaseClient (>= 6.1.0), + dsBaseClient (>= 6.2.0), dplyr (>= 1.0.2), tibble, purrr, diff --git a/NAMESPACE b/NAMESPACE index 884e9b69..2a30aab8 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,6 +6,7 @@ export(dh.defineCases) export(dh.dropCols) export(dh.findVarsIndex) export(dh.getAnonPlotData) +export(dh.getRmStats) export(dh.getStats) export(dh.lmTab) export(dh.lmeMultPoly) @@ -55,6 +56,7 @@ importFrom(dplyr,summarize) importFrom(dplyr,ungroup) importFrom(dplyr,vars) importFrom(dsBaseClient,ds.Boole) +importFrom(dsBaseClient,ds.asFactorSimple) importFrom(dsBaseClient,ds.asNumeric) importFrom(dsBaseClient,ds.assign) importFrom(dsBaseClient,ds.cbind) @@ -79,6 +81,9 @@ importFrom(dsBaseClient,ds.reShape) importFrom(dsBaseClient,ds.replaceNA) importFrom(dsBaseClient,ds.rm) importFrom(dsBaseClient,ds.scatterPlot) +importFrom(dsBaseClient,ds.summary) +importFrom(dsBaseClient,ds.tapply) +importFrom(dsBaseClient,ds.tapply.assign) importFrom(magrittr,"%<>%") importFrom(metafor,rma) importFrom(purrr,cross2) diff --git a/R/get-rm-stats.R b/R/get-rm-stats.R new file mode 100644 index 00000000..e1c6b35b --- /dev/null +++ b/R/get-rm-stats.R @@ -0,0 +1,136 @@ +#' Produces descriptive statistics based on repeated measures data +#' which it would be useful to report in papers. 
+#'
+#' @importFrom dplyr %>% mutate across
+#' @importFrom dsBaseClient ds.summary ds.asFactorSimple ds.tapply.assign ds.tapply
+#'
+#' @param df datashield dataframe
+#' @param outcome name of outcome variable in df
+#' @param id_var name of id variable in df
+#' @param age_var name of age variable in df
+#' @param conns connection object for DataSHIELD backends
+#'
+#' @return a tibble containing the following columns:
+#'
+#' min_age: 5th percentile of age
+#' max_age: 95th percentile of age
+#' n_obs: total number of observations in data
+#' n_participants: total number of unique participants
+#' n_meas_5: 5th percentile of measurements per individual
+#' n_meas_med: median number of measurements per individual
+#' n_meas_95: 95th percentile of measurements per individual
+#'
+#' @export
+dh.getRmStats <- function(df = NULL, outcome = NULL, id_var = NULL, age_var = NULL, conns = NULL) {
+  . <- n_meas_5 <- n_meas_95 <- n_meas_med <- variable <- perc_5 <- perc_95 <- cohort <- min_age <-
+    max_age <- valid_n <- NULL
+
+  if (is.null(df)) {
+    stop("Please provide the name of a datashield dataframe")
+  }
+
+  if (is.null(outcome)) {
+    stop("Please provide the name of your outcome variable")
+  }
+
+  if (is.null(id_var)) {
+    stop("Please provide the name of id variable in df")
+  }
+
+  if (is.null(age_var)) {
+    stop("Please provide the name of your age variable in df")
+  }
+
+  if (is.null(conns)) {
+    conns <- datashield.connections_find()
+  }
+
+  ## ---- First get overall stats for some of the easy ones -------------------------------------------
+  stats <- dh.getStats(
+    df = df,
+    vars = c(outcome, age_var),
+    conns = conns
+  )
+
+  ## ---- Age range of participants -------------------------------------------------------------------
+  age_ranges <- stats$continuous %>%
+    dplyr::filter(variable == age_var) %>%
+    mutate(
+      min_age = perc_5,
+      max_age = perc_95
+    ) %>%
+    dplyr::select(cohort, min_age, max_age)
+
+  ## ---- Total number of outcome measurements -------------------------------------
+  outcome_n <- stats$continuous %>%
+    dplyr::filter(variable == outcome) %>%
+    dplyr::select(cohort, n_obs = valid_n)
+
+
+  ## ---- Total number of unique participants ----------------------------------------
+
+  # First, we use ds.tapply.assign to summarise the number of outcome observations for
+  # each subject. The length of this created object then gives us the number of subjects.
+
+  ds.asFactorSimple(paste0(df, "$", id_var), "id_fact", datasources = conns)
+
+  ds.tapply.assign(
+    X.name = paste0(df, "$", outcome),
+    INDEX.names = "id_fact",
+    FUN.name = "N",
+    newobj = "id_summary",
+    datasources = conns
+  )
+
+  n_subjects <- DSI::datashield.aggregate(conns, call("lengthDS", "id_summary$N")) %>%
+    setNames(names(conns)) %>%
+    bind_rows() %>%
+    mutate(combined = rowSums(.)) %>%
+    pivot_longer(
+      cols = everything(),
+      names_to = "cohort",
+      values_to = "n_participants"
+    )
+
+  ## ---- Number of outcome measurements per participant (5th, 50th, 95th percentile) ------------
+
+  # We call quantileMeanDS on the per-participant counts created above and keep the 5%,
+  # 50% and 95% values for each cohort.
+
+  ds.asNumeric("id_summary$N", "id_summary_num", datasources = conns)
+
+  quants <- DSI::datashield.aggregate(conns, as.symbol("quantileMeanDS(id_summary_num)"))
+
+  cohort_quant <- quants %>%
+    bind_rows(.id = "cohort") %>%
+    select(cohort, "5%", "50%", "95%") %>%
+    rename(n_meas_med = "50%", n_meas_5 = "5%", n_meas_95 = "95%")
+
+  ## Combined version: sum of cohort quantiles weighted by each cohort's share of the valid N
+  lengths <- DSI::datashield.aggregate(conns, call("lengthDS", "id_summary_num"))
+  numNAs <- DSI::datashield.aggregate(conns, "numNaDS(id_summary_num)")
+
+  valid_n <- list(lengths, numNAs) %>% pmap(~ .x - .y)
+
+  weights <- unlist(valid_n) / sum(unlist(valid_n))
+
+  weighted_quant <- list(quants, weights) %>% pmap(~ .x * .y)
+
+  sum_quant <- weighted_quant %>%
+    pmap(function(...) {
+      sum(c(...))
+    }) %>%
+    bind_rows() %>%
+    rename(n_meas_med = "50%", n_meas_5 = "5%", n_meas_95 = "95%") %>%
+    mutate(cohort = "combined") %>%
+    select(cohort, n_meas_med, n_meas_5, n_meas_95)
+
+  quant_out <- bind_rows(cohort_quant, sum_quant)
+
+  ## ---- Create final output -------------------------------------------------------------------
+  out <- left_join(age_ranges, outcome_n, by = "cohort") %>%
+    left_join(., n_subjects, by = "cohort") %>%
+    left_join(., quant_out, by = "cohort")
+
+  return(out)
+}
diff --git a/R/tidy-env.R b/R/tidy-env.R index 6e81ae0d..5aa11e4c 100644 --- a/R/tidy-env.R +++ b/R/tidy-env.R @@ -62,7 +62,7 @@ dh.tidyEnv <- function(obj = NULL, type = c("remove", "keep"), conns = NULL) { bind_rows() vars_tibble %>% pmap(function(cohort, value) { - ds.rm(x.name = value, datasources = conns[cohort]) + ds.rm(x.names = value, datasources = conns[cohort]) }) } } diff --git a/man/dh.getRmStats.Rd b/man/dh.getRmStats.Rd new file mode 100644 index 00000000..12e8ab48 --- /dev/null +++ b/man/dh.getRmStats.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get-rm-stats.R +\name{dh.getRmStats} +\alias{dh.getRmStats} +\title{Produces descriptive statistics based on repeated measures data +which it would be useful to report in papers.} +\usage{ +dh.getRmStats( + df = NULL, + outcome = NULL, + id_var = NULL, + age_var = NULL, + conns = NULL +) +} +\arguments{ +\item{df}{datashield dataframe} + +\item{outcome}{name of outcome variable in df} + +\item{id_var}{name of id variable in df} + +\item{age_var}{name of age variable in df} + +\item{conns}{connection object for DataSHIELD backends} +} +\value{ +a tibble containing the following columns: + +min_age: 5th percentile of age +max_age: 95th percentile of age +n_obs: total number of observations in data +n_participants: total number of unique participants +n_meas_5: 5th percentile of measurements per individual +n_meas_med: median number of measurements
per individual +n_meas_95: 95th percentile of measurements per individual +} +\description{ +Produces descriptive statistics based on repeated measures data +which it would be useful to report in papers. +}