Closed tomaszaba closed 2 months ago
I have managed to make compute_wfhz_prevalence()
work as it should - depending on the SD's classification.
#'
#'
compute_wfhz_prevalence <- function(df,
.wt = NULL,
.edema = NULL,
.summary_by) {
.summary_by <- rlang::enquo(.summary_by)
## An empty vector type of type list ----
results <- list()
## Get summary of standard deviation classification ----
if (!rlang::quo_is_null(.summary_by)) {
x <- summarise(
df,
std = classify_sd(sd(remove_flags(.data$wfhz, "zscore"), na.rm = TRUE)),
.by = !!.summary_by
)
} else {
x <- summarise(
df,
std = classify_sd(sd(remove_flags(.data$wfhz, "zscore"), na.rm = TRUE))
)
}
## Iterate over data frame to compute prevalence according to the SD ----
for (i in seq_len(nrow(x))) {
if (!rlang::quo_is_null(.summary_by)) {
area <- dplyr::pull(x, !!.summary_by)[i]
data <- filter(df, !!sym(rlang::quo_name(.summary_by)) == !!area)
} else {
data <- df
}
std <- x$std[i]
if (std != "Problematic") {
### Compute standard complex sample based prevalence analysis ----
result <- get_wfhz_prevalence_estimates(data, {{ .wt }}, {{ .edema }}, !!.summary_by)
} else {
if (!rlang::quo_is_null(.summary_by)) {
### Compute PROBIT based prevalence ----
result <- compute_probit_prevalence(data, !!.summary_by)
} else {
result <- compute_probit_prevalence(data)
}
}
results[[i]] <- result
}
dplyr::bind_rows(results) |>
dplyr::relocate(gam_p, .after = gam_n) |>
dplyr::relocate(sam_p, .after = sam_n) |>
dplyr::relocate(mam_p, .after = mam_n)
}
It has a helper function for when SD is problematic
compute_probit_prevalence <- function(df, .summary_by = NULL) {
.summary_by <- rlang::enquo(.summary_by)
if(!is.null(.summary_by)) {
df <- summarise(
df,
gam = apply_probit_method(.data$wfhz, .status = "gam"),
sam = apply_probit_method(.data$wfhz, .status = "sam"),
mam = gam - sam,
.by = !!.summary_by
) |>
mutate(
gam_p = gam, sam_p = sam, mam_p = mam,
gam = NA, sam = NA, mam = NA
) |>
dplyr::select(!2:4)
} else {
df <- summarise(
df,
gam = apply_probit_method(.data$wfhz, .status = "gam"),
sam = apply_probit_method(.data$wfhz, .status = "sam"),
mam = gam - sam
) |>
mutate(
gam_p = gam, sam_p = sam, mam_p = mam,
gam = NA, sam = NA, mam = NA
) |>
dplyr::select(!2:4)
}
df
}
It returns the output:
# A tibble: 4 × 17
district gam_n gam_p gam_p_low gam_p_upp gam_p_deff sam_n sam_p sam_p_low sam_p_upp sam_p_deff mam_n
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Metuge NA 0.0260 NA NA NA NA 0.00163 NA NA NA NA
2 Cahora-Ba… 25 0.0738 0.0348 0.113 Inf 4 0.00336 -0.00348 0.0102 Inf 21
3 Chiuta 11 0.0444 0.0129 0.0759 Inf 2 0.00444 -0.00466 0.0136 Inf 9
4 Maravia NA 0.0383 NA NA NA NA 0.00279 NA NA NA NA
Now I am working to make compute_muac_prevalence()
work the same way. It is behaving differently when I apply the same approach as that of wfhz.
How do you find this code?
Hi again, Ernest,
The action to make compute_muac_prevalence()
is now completed.
I approached it this way:
compute_muac_prevalence <- function(df,
.wt = NULL,
.edema = NULL,
.summary_by = NULL) {
.summary_by <- rlang::enquo(.summary_by)
results <- list()
## Get and classify age ratio and standard deviation ----
if (!rlang::quo_is_null(.summary_by)) {
x <- df |>
group_by(!!.summary_by) |>
summarise(
age_ratio = classify_age_sex_ratio(age_ratio_test(.data$age, .expectedP = 0.66)$p),
std = classify_sd(sd(remove_flags(as.numeric(.data$mfaz), "zscore"), na.rm = TRUE)),
analysis_approach = tell_muac_analysis_strategy(.data$age_ratio, .data$std),
.groups = "drop"
)
} else {
x <- df |>
summarise(
age_ratio = classify_age_sex_ratio(age_ratio_test(.data$age, .expectedP = 0.66)$p),
std = classify_sd(sd(remove_flags(as.numeric(.data$mfaz), "zscore"), na.rm = TRUE)),
analysis_approach = tell_muac_analysis_strategy(.data$age_ratio, .data$std)
)
}
## Iterate over data frame to compute prevalence according to analysis_approach ----
for (i in seq_len(nrow(x))) {
if (!rlang::quo_is_null(.summary_by)) {
area <- dplyr::pull(x, !!.summary_by)[i]
data <- filter(df, !!sym(rlang::quo_name(.summary_by)) == area)
} else {
data <- df
}
analysis_approach <- x$analysis_approach[i]
if (analysis_approach == "unweighted") {
### Compute standard complex sample based prevalence analysis ----
output <- compute_pps_based_muac_prevalence(data, {{ .wt }}, {{ .edema }}, !!.summary_by)
} else if (analysis_approach == "weighted") {
if (!rlang::quo_is_null(.summary_by)) {
### Compute weighted prevalence summarized at .summary_by ----
output <- compute_weighted_prevalence(data, .edema = {{ .edema }}, !!.summary_by)
} else {
### Compute non-summarized weighted prevalence ----
output <- compute_weighted_prevalence(data, .edema = {{ .edema }})
}
} else {
if (!rlang::quo_is_null(.summary_by)) {
### Add NA's to the summarized tibble accordingly ----
output <- summarise(
data,
gam_p = NA_real_,
sam_p = NA_real_,
mam_p = NA_real_,
.by = !!.summary_by
)
} else {
### Return a non-summarised tibble ----
output <- tibble::tibble(
gam_p = NA_real_,
sam_p = NA_real_,
mam_p = NA_real_
)
}
}
results[[i]] <- output
}
### Ensure that all geographical aras are added to the tibble ----
if (!rlang::quo_is_null(.summary_by)) {
results <- dplyr::bind_rows(results) |>
dplyr::relocate(.data$gam_p, .after = .data$gam_n) |>
dplyr::relocate(.data$sam_p, .after = .data$sam_n) |>
dplyr::relocate(.data$mam_p, .after = .data$mam_n)
} else {
results <- dplyr::bind_rows(results)
}
results
}
This returns:
# A tibble: 3 × 17
province gam_n gam_p gam_p_low gam_p_upp gam_p_deff sam_n sam_p sam_p_low sam_p_upp sam_p_deff mam_n
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Province 1 135 0.104 0.0778 0.130 Inf 19 0.0133 0.00682 0.0198 Inf 116
2 Province 2 NA 0.112 NA NA NA NA 0.0201 NA NA NA NA
3 Province 3 NA NA NA NA NA NA NA NA NA NA NA
# ℹ 5 more variables: mam_p <dbl>, mam_p_low <dbl>, mam_p_upp <dbl>, mam_p_deff <dbl>, wt_pop <dbl>
In each prevalence function, before the prevalence is computed, few checks are done to inform the prevalence analysis strategy, for instance in
compute_muac_prevalence()
first standard deviation of MFAZ and age ratio test are performed, then a new object is created thanks totell_muac_analysis_strategy()
which evaluates the analysis strategy to follow - wether normal complex sample-based analysis (if standard deviation and age ratio test are not problematic) or CDC weighting-approach (if standard deviation is not problematic but age ratio is problematic) or no computation when either standard deviation is problematic and age ratio is not or both.I've just realised that current code only works well for one a data frame of one survey. If multiple survey areas are available, the decision on the analysis approach is made based on ungrouped data, which is wrong, as one survey area may required weighted analysis or not. This errors comes from the way how I approached the check inside the functions. I have started to revise them. I am currently approaching this way:
this gives me something like:
I would like to have your support on how to approach the downstream code that should read each element of the vector
analysis_approach
. I immediately thought of using a for loop, but I am not yet good at writing a for or while loop.Could please give a hand? Please consider the code in my last commit under branch
test_suit
Note: I am aware that all prevalence functions are absolutely too long. I have a plan to shorten them by writing a a separate function for each code block. I want to make this work first, then I will revise.