8.6 Weighted binary logistic regression
8.6.1 Fitting the model
To carry out a binary logistic regression that incorporates a survey design, use svyglm()
with family=quasibinomial()
. This produces the same results as family=binomial()
but avoids a warning about non-integer numbers of successes. As with glm()
, svyglm()
models the probability that the outcome is at the non-reference level, if the outcome is a factor, or the probability that the outcome is 1, if the outcome is numeric with values 0 and 1 (see Section 6.6.2). Although not discussed in this text, survey-weighted ordinal logistic regression can be carried out using svyolr()
.
Example 8.3: Using data from adult participants in the 2019 NSDUH (nsduh2019_rmph.RData
), what is the association between lifetime marijuana use (mj_lifetime
) and age at first use of alcohol (alc_agefirst
), adjusted for age (demog_age_cat6
), sex (demog_sex
), and income (demog_income
)?
Since the question states “among adult participants”, use a domain analysis for this example. Our domain definition is based on one of the variables in the model (demog_age_cat6
) and excludes individuals at certain levels of that variable. This does not affect any of the analyses below, but does lead to an issue with tbl_regression()
as the excluded levels show up in the table, and show up multiple times. To avoid this problem, create a new version of this variable with missing values at the excluded levels and use fct_drop()
to drop the unused levels. This must be done before specifying and subsetting the design so the new variable is available to the design.
load("Data/nsduh2019_rmph.RData")
# Check the levels of the age variable
table(nsduh$demog_age_cat6)
##
## 12-17 18-25 26-34 35-49 50-64 65+
## 13397 14226 8601 11134 4880 3898
# Variables in this dataset are labeled
# which causes problems at times.
# Change class to just "factor" to avoid
# an error with na_if() below
class(nsduh$demog_age_cat6) <- "factor"
<- nsduh %>%
nsduh mutate(domain = demog_age_cat6 %in%
c("18-25", "26-34", "35-49", "50-64", "65+"),
# So tbl_regression works correctly:
# Set non-adult age group to missing
age_new = na_if(demog_age_cat6, "12-17"),
# Drop unused levels
age_new = fct_drop(age_new))
# Check derivation
table(nsduh$age_new, nsduh$demog_age_cat6, exclude = NULL)
##
## 12-17 18-25 26-34 35-49 50-64 65+
## 18-25 0 14226 0 0 0 0
## 26-34 0 0 8601 0 0 0
## 35-49 0 0 0 11134 0 0
## 50-64 0 0 0 0 4880 0
## 65+ 0 0 0 0 0 3898
## <NA> 13397 0 0 0 0 0
Next, specify the survey design and subset the design.
library(survey)
options(survey.lonely.psu = "adjust")
options(survey.adjust.domain.lonely = TRUE)
<- svydesign(strata=~vestr, id=~verep, weights=~ANALWT_C,
design.NSDUH nest=TRUE, data=nsduh)
<- subset(design.NSDUH, domain) design.NSDUH.adults
Use svyglm()
with family=quasibinomial()
to fit the model, and the same summary()
, confint()
, and car::Anova()
calls used for weighted linear regression to view the output.
.3 <- svyglm(mj_lifetime ~ alc_agefirst +
fit.ex8+ demog_sex + demog_income,
age_new family = quasibinomial(),
design = design.NSDUH.adults)
round(
cbind(
summary(fit.ex8.3,
df.resid=degf(fit.ex8.3$survey.design))$coef,
confint(fit.ex8.3,
ddf=degf(fit.ex8.3$survey.design))
)4) ,
## Estimate Std. Error t value Pr(>|t|) 2.5 % 97.5 %
## (Intercept) 4.8352 0.1433 33.743 0.0000 4.5474 5.1230
## alc_agefirst -0.2427 0.0074 -32.711 0.0000 -0.2576 -0.2278
## age_new26-34 0.2048 0.0456 4.490 0.0000 0.1132 0.2964
## age_new35-49 -0.1694 0.0429 -3.949 0.0002 -0.2556 -0.0832
## age_new50-64 -0.1030 0.0516 -1.996 0.0514 -0.2067 0.0007
## age_new65+ -0.7953 0.0565 -14.078 0.0000 -0.9088 -0.6819
## demog_sexFemale -0.0672 0.0310 -2.169 0.0348 -0.1294 -0.0050
## demog_income$20,000 - $49,999 -0.1559 0.0362 -4.302 0.0001 -0.2287 -0.0831
## demog_income$50,000 - $74,999 -0.0804 0.0559 -1.438 0.1567 -0.1928 0.0319
## demog_income$75,000 or more -0.1254 0.0430 -2.919 0.0053 -0.2117 -0.0391
::Anova(fit.ex8.3, type = 3, test.statistic = "F",
carerror.df = degf(fit.ex8.3$survey.design))
## Analysis of Deviance Table (Type III tests)
##
## Response: mj_lifetime
## Df F Pr(>F)
## (Intercept) 1 1138.61 < 0.0000000000000002 ***
## alc_agefirst 1 1070.01 < 0.0000000000000002 ***
## age_new 4 92.48 < 0.0000000000000002 ***
## demog_sex 1 4.71 0.03483 *
## demog_income 3 6.57 0.00079 ***
## Residuals 50
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Compute the AORs and their 95% confidence intervals.
# Compute odds ratios and their 95% confidence intervals
<- cbind("AOR" = exp( coef(fit.ex8.3)),
OR.CI exp(confint(fit.ex8.3,
df.resid=degf(fit.ex8.3$survey.design))))[-1,]
round(OR.CI, 3)
## AOR 2.5 % 97.5 %
## alc_agefirst 0.784 0.773 0.796
## age_new26-34 1.227 1.119 1.346
## age_new35-49 0.844 0.774 0.921
## age_new50-64 0.902 0.813 1.001
## age_new65+ 0.451 0.403 0.506
## demog_sexFemale 0.935 0.878 0.995
## demog_income$20,000 - $49,999 0.856 0.795 0.921
## demog_income$50,000 - $74,999 0.923 0.824 1.033
## demog_income$75,000 or more 0.882 0.809 0.962
To table these results, use the tbl_regression()
function in gtsummary
after adding test.statistic = "F"
in add_global_p()
. As mentioned in Section 8.4.1, there is no option in tbl_regression()
to change the DF to make the confidence intervals and p-values for the individual regression coefficients be based on the design DF. See the warning there for how to resolve this issue.
The results are shown in Table 8.6.
library(gtsummary)
<- fit.ex8.3 %>%
t1 tbl_regression(intercept = T,
# Use style_number to round to 2 digits (e.g., 1.27)
estimate_fun = function(x) style_number(x, digits = 2),
# Use style_sigfig to keep 2 significant digits (e.g., 1.3)
# estimate_fun = function(x) style_sigfig(x, digits = 2),
label = list('(Intercept)' ~ "Intercept",
~ "Age of first alcohol use (years)",
alc_agefirst ~ "Age (years)",
age_new ~ "Sex",
demog_sex ~ "Income")) %>%
demog_income modify_column_hide(p.value) %>%
modify_caption("Weighted logistic regression results for
lifetime marijuana use vs. age at first alcohol use (years)")
<- fit.ex8.3 %>%
t2 tbl_regression(intercept = F, # No intercept in OR table
exponentiate = T, # OR = exp(B)
# Use style_number to round to 2 digits (e.g., 1.27)
estimate_fun = function(x) style_number(x, digits = 2),
# Use style_sigfig to keep 2 significant digits (e.g., 1.3)
pvalue_fun = function(x) style_pvalue(x, digits = 3),
label = list(alc_agefirst ~ "Age of first alcohol use (years)",
~ "Age (years)",
age_new ~ "Sex",
demog_sex ~ "Income")) %>%
demog_income add_global_p(keep = T, test.statistic = "F"
# Uncomment to use design df for Type III test p-values
# (but then they will not match the regression term p-values for
# binary predictors)
# , error.df = degf(fit.ex8.1$survey.design)
)
tbl_merge(
tbls = list(t1, t2),
tab_spanner = c("**Regression Coefficients**", "**Adjusted Odds Ratio**")
)
Characteristic | Regression Coefficients | Adjusted Odds Ratio | |||
---|---|---|---|---|---|
log(OR)1 | 95% CI1 | OR1 | 95% CI1 | p-value | |
Intercept | 4.84 | 4.55, 5.12 | |||
Age of first alcohol use (years) | -0.24 | -0.26, -0.23 | 0.78 | 0.77, 0.80 | <0.001 |
Age (years) | <0.001 | ||||
18-25 | — | — | — | — | |
26-34 | 0.20 | 0.11, 0.30 | 1.23 | 1.12, 1.35 | <0.001 |
35-49 | -0.17 | -0.26, -0.08 | 0.84 | 0.77, 0.92 | <0.001 |
50-64 | -0.10 | -0.21, 0.00 | 0.90 | 0.81, 1.00 | 0.053 |
65+ | -0.80 | -0.91, -0.68 | 0.45 | 0.40, 0.51 | <0.001 |
Sex | 0.036 | ||||
Male | — | — | — | — | |
Female | -0.07 | -0.13, 0.00 | 0.94 | 0.88, 1.00 | 0.036 |
Income | <0.001 | ||||
Less than $20,000 | — | — | — | — | |
$20,000 - $49,999 | -0.16 | -0.23, -0.08 | 0.86 | 0.80, 0.92 | <0.001 |
$50,000 - $74,999 | -0.08 | -0.19, 0.03 | 0.92 | 0.82, 1.03 | 0.158 |
$75,000 or more | -0.13 | -0.21, -0.04 | 0.88 | 0.81, 0.96 | 0.006 |
1 OR = Odds Ratio, CI = Confidence Interval |
8.6.2 Prediction
Use svycontrast_design_df()
to get predicted log-odds from a svyglm()
object fit using family = quasibinomial()
and use ilogit()
to convert the log-odds to a probability.
Example 8.3 (continued): What is the predicted probability (and 95% CI) of lifetime marijuana use for someone who first used alcohol at age 13 years, is currently age 30 years, male, and has an income of < $20,000?
# Always include the intercept for prediction.
# Specify a 1 for the intercept, a # for each continuous predictor
# and a 1 for each non-reference level of a categorical variable.
# If a predictor is at its reference level, specify a 0 or exclude it.
ilogit(
svycontrast_design_df(fit.ex8.3,
c("(Intercept)" = 1,
"alc_agefirst" = 13,
"age_new26-34" = 1))
)
## est lower upper
## 1 0.8682 0.8554 0.8799
8.6.3 Interactions
Including an interaction in the model and estimating the effect of one variable at levels of another proceeds as described for weighted linear regression above with the addition of exponentiating to obtain AORs.
Example 8.3 (continued): After adjusting for age and income, does the association between lifetime marijuana use and age at first use of alcohol differ by sex? What are the AORs for age at first alcohol use for males and females?
3.int <- svyglm(mj_lifetime ~ alc_agefirst +
fit.ex8.+ demog_sex + demog_income +
age_new :demog_sex,
alc_agefirstfamily =quasibinomial(),
design = design.NSDUH.adults)
round(
cbind(
summary(fit.ex8.3.int,
df.resid=degf(fit.ex8.3.int$survey.design))$coef,
confint(fit.ex8.3.int,
ddf=degf(fit.ex8.3.int$survey.design))
)4) ,
## Estimate Std. Error t value Pr(>|t|) 2.5 % 97.5 %
## (Intercept) 4.4769 0.1719 26.048 0.0000 4.1317 4.8222
## alc_agefirst -0.2219 0.0096 -23.228 0.0000 -0.2411 -0.2027
## age_new26-34 0.2063 0.0457 4.511 0.0000 0.1145 0.2982
## age_new35-49 -0.1658 0.0429 -3.868 0.0003 -0.2519 -0.0797
## age_new50-64 -0.0941 0.0512 -1.838 0.0720 -0.1970 0.0087
## age_new65+ -0.7821 0.0565 -13.843 0.0000 -0.8955 -0.6686
## demog_sexFemale 0.6840 0.3188 2.145 0.0368 0.0436 1.3243
## demog_income$20,000 - $49,999 -0.1586 0.0359 -4.419 0.0001 -0.2308 -0.0865
## demog_income$50,000 - $74,999 -0.0846 0.0564 -1.500 0.1399 -0.1978 0.0287
## demog_income$75,000 or more -0.1303 0.0431 -3.021 0.0040 -0.2170 -0.0437
## alc_agefirst:demog_sexFemale -0.0432 0.0181 -2.380 0.0212 -0.0796 -0.0067
rbind(
"alc_agefirst @ Male" = svycontrast_design_df(fit.ex8.3.int,
c("alc_agefirst" = 1),
EXP = T), # Exponentiate to get OR
"alc_agefirst @ Female" = svycontrast_design_df(fit.ex8.3.int,
c("alc_agefirst" = 1,
"alc_agefirst:demog_sexFemale" = 1),
EXP = T)
)
## est lower upper
## alc_agefirst @ Male 0.8010 0.7858 0.8165
## alc_agefirst @ Female 0.7671 0.7461 0.7887