We begin by reading in the combined health outcomes data and the combined air pollution and UV radiation data to produce data frames for analysis at the state and county levels, in the process creating seasonal variables for the air pollution and UV data. We quickly noticed that some of our variables in the air pollution and UV data were strongly correlated with each other, so we omitted several for the purposes of our regression analysis (collinearity violates our assumptions); the final correlation matrix is given below.
library(tidyverse)
outcomes_state <- read_csv("data/lc_mel_asthma_state.csv")
outcomes_county <- read_csv("data/lc_mel_asthma_county.csv")
ap_uv <- read_csv("ap/ap_uv/apuv.csv")
#State-level analysis - fewer predictors
ap_uv_state_slim <- ap_uv %>%
select(-c(countyfips, county,
pm25_max_pred, pm25_mean_pred, pm25_pop_pred,
o3_max_pred, o3_mean_pred, o3_pop_pred,
i310, i305, i380, edr)) %>%
group_by(state, year, season) %>%
#Take medians across county for each season
summarize(across(pm25_med_pred:edd, median)) %>%
pivot_wider(names_from = season, values_from = pm25_med_pred:edd)
#State-level analysis - more predictors
ap_uv_state <- ap_uv %>%
select(-c(countyfips, county)) %>%
group_by(state, year, season) %>%
#Take medians across county for each season
summarize(across(pm25_max_pred:i380, median)) %>%
pivot_wider(names_from = season, values_from = pm25_max_pred:i380)
outcomes_wider_state <- outcomes_state %>%
pivot_wider(names_from = outcome, values_from = age_adjusted_incidence_rate)
state_analysis <- left_join(outcomes_wider_state, ap_uv_state_slim)
#Correlations
state_analysis_corr <- state_analysis %>%
select(`lung cancer`:edd_Winter) %>%
do(as.data.frame(cor(., method="pearson", use="pairwise.complete.obs"))) %>%
knitr::kable()
state_analysis_corr
lung cancer | melanoma | asthma | pm25_med_pred_Fall | pm25_med_pred_Spring | pm25_med_pred_Summer | pm25_med_pred_Winter | o3_med_pred_Fall | o3_med_pred_Spring | o3_med_pred_Summer | o3_med_pred_Winter | edd_Fall | edd_Spring | edd_Summer | edd_Winter | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
lung cancer | 1.0000000 | 0.2082952 | 0.3107250 | 0.8986288 | 0.8198802 | 0.7882240 | 0.7974809 | 0.1919442 | 0.3188281 | 0.7231695 | -0.7575209 | 0.4423832 | 0.3596362 | 0.7183017 | 0.1543232 |
melanoma | 0.2082952 | 1.0000000 | -0.7112906 | -0.1634813 | -0.1539077 | -0.3414310 | 0.1111093 | 0.2199124 | -0.0692780 | -0.2151543 | 0.1610076 | 0.5326605 | 0.3733857 | 0.1302414 | 0.3810257 |
asthma | 0.3107250 | -0.7112906 | 1.0000000 | 0.1324778 | 0.1347419 | 0.3681750 | -0.1162262 | -0.4738109 | 0.2878737 | 0.0941785 | -0.0887587 | -0.6790830 | -0.5864353 | -0.3469688 | -0.4382772 |
pm25_med_pred_Fall | 0.8986288 | -0.1634813 | 0.1324778 | 1.0000000 | 0.8990588 | 0.8582926 | 0.7793254 | 0.2840132 | 0.1924560 | 0.7889453 | -0.8460845 | 0.3921520 | 0.3250380 | 0.7189860 | 0.1022595 |
pm25_med_pred_Spring | 0.8198802 | -0.1539077 | 0.1347419 | 0.8990588 | 1.0000000 | 0.8303884 | 0.7592112 | 0.2658257 | 0.2314727 | 0.7210006 | -0.8427712 | 0.2484149 | 0.4710869 | 0.7013698 | 0.0388916 |
pm25_med_pred_Summer | 0.7882240 | -0.3414310 | 0.3681750 | 0.8582926 | 0.8303884 | 1.0000000 | 0.6984350 | 0.1996813 | 0.2962791 | 0.8374731 | -0.7890281 | 0.1961149 | 0.1081415 | 0.6464591 | 0.0891841 |
pm25_med_pred_Winter | 0.7974809 | 0.1111093 | -0.1162262 | 0.7793254 | 0.7592112 | 0.6984350 | 1.0000000 | 0.4312721 | 0.1888207 | 0.6572143 | -0.7372041 | 0.6325734 | 0.2466810 | 0.7032967 | 0.1150974 |
o3_med_pred_Fall | 0.1919442 | 0.2199124 | -0.4738109 | 0.2840132 | 0.2658257 | 0.1996813 | 0.4312721 | 1.0000000 | 0.0645681 | 0.3550059 | -0.3152226 | 0.7209518 | 0.1174262 | 0.5393190 | -0.0883089 |
o3_med_pred_Spring | 0.3188281 | -0.0692780 | 0.2878737 | 0.1924560 | 0.2314727 | 0.2962791 | 0.1888207 | 0.0645681 | 1.0000000 | 0.2788961 | -0.2219670 | 0.1439151 | 0.4159598 | 0.2821319 | 0.1649401 |
o3_med_pred_Summer | 0.7231695 | -0.2151543 | 0.0941785 | 0.7889453 | 0.7210006 | 0.8374731 | 0.6572143 | 0.3550059 | 0.2788961 | 1.0000000 | -0.7726044 | 0.3388034 | 0.3831358 | 0.8360440 | 0.3144834 |
o3_med_pred_Winter | -0.7575209 | 0.1610076 | -0.0887587 | -0.8460845 | -0.8427712 | -0.7890281 | -0.7372041 | -0.3152226 | -0.2219670 | -0.7726044 | 1.0000000 | -0.2692265 | -0.3273758 | -0.6197914 | -0.1665205 |
edd_Fall | 0.4423832 | 0.5326605 | -0.6790830 | 0.3921520 | 0.2484149 | 0.1961149 | 0.6325734 | 0.7209518 | 0.1439151 | 0.3388034 | -0.2692265 | 1.0000000 | 0.2757208 | 0.4563821 | 0.1890074 |
edd_Spring | 0.3596362 | 0.3733857 | -0.5864353 | 0.3250380 | 0.4710869 | 0.1081415 | 0.2466810 | 0.1174262 | 0.4159598 | 0.3831358 | -0.3273758 | 0.2757208 | 1.0000000 | 0.3841069 | 0.4020161 |
edd_Summer | 0.7183017 | 0.1302414 | -0.3469688 | 0.7189860 | 0.7013698 | 0.6464591 | 0.7032967 | 0.5393190 | 0.2821319 | 0.8360440 | -0.6197914 | 0.4563821 | 0.3841069 | 1.0000000 | 0.0915974 |
edd_Winter | 0.1543232 | 0.3810257 | -0.4382772 | 0.1022595 | 0.0388916 | 0.0891841 | 0.1150974 | -0.0883089 | 0.1649401 | 0.3144834 | -0.1665205 | 0.1890074 | 0.4020161 | 0.0915974 | 1.0000000 |
Before performing our analysis, we may want to look at some bivariate plots with trends over time to see if there are general trends from 2001 to 2015.
library(GGally)
state_analysis %>%
filter(year %in% 2001:2015) %>%
ggpairs(columns = c("year","lung cancer", "o3_med_pred_Summer", "pm25_med_pred_Summer"),
mapping = aes(group = state, color = state),
columnLabels = c("Year", "Lung Cancer/100,000","Median summer O3, ppm", "PM 2.5, ug/m^3"))
state_plot <- state_analysis %>%
ggplot(aes(x = year, y = `lung cancer`, group = state, color = state)) +
geom_path()
spring_plot <- state_analysis %>%
ggplot(aes(x = o3_med_pred_Spring, y = `lung cancer`, group = state, color = state)) +
geom_point()
summer_plot <- state_analysis %>%
ggplot(aes(x = o3_med_pred_Summer, y = `lung cancer`, group = state, color = state)) +
geom_point()
fall_plot <- state_analysis %>%
ggplot(aes(x = o3_med_pred_Fall, y = `lung cancer`, group = state, color = state)) +
geom_point()
library(patchwork)
(state_plot + spring_plot) / (summer_plot + fall_plot)
We construct a model using the air quality variables (ozone, particulate matter), state, and powers and interactions thereof, then pare down with BIC, limiting our focus to the years 2001-2015.
state_2001_2015 <- state_analysis %>%
filter(year %in% 2001:2015) %>%
#exclude asthma, lots of missingness
select(-asthma)
aq_lung_df <- state_2001_2015 %>%
select(-c(melanoma, starts_with("edd")))
#fit_aq <- lm(`lung cancer` ~ .^2, data = aq_lung_df) - this broke in R
fit_aq <- lm(`lung cancer` ~ . +
year*(o3_med_pred_Spring +
o3_med_pred_Summer +
o3_med_pred_Fall) +
year*(pm25_med_pred_Spring +
pm25_med_pred_Summer +
pm25_med_pred_Fall) +
state*year, data = aq_lung_df)
step_bic_aq <- step(fit_aq, trace = 0, k = log(nobs(fit_aq)), direction = "backward")
summary(step_bic_aq)
##
## Call:
## lm(formula = `lung cancer` ~ state + year + pm25_med_pred_Fall +
## pm25_med_pred_Spring + o3_med_pred_Summer + year:pm25_med_pred_Fall +
## state:year, data = aq_lung_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.75773 -0.51795 -0.00639 0.51870 1.64145
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3402.03891 485.87390 7.002 3.80e-08 ***
## stateOH 1209.53035 266.45723 4.539 6.40e-05 ***
## statePA 543.09187 196.19321 2.768 0.00895 **
## year -1.65961 0.24187 -6.862 5.77e-08 ***
## pm25_med_pred_Fall -299.22919 67.68656 -4.421 9.10e-05 ***
## pm25_med_pred_Spring -0.65867 0.19252 -3.421 0.00160 **
## o3_med_pred_Summer -0.10565 0.04746 -2.226 0.03254 *
## year:pm25_med_pred_Fall 0.14930 0.03371 4.430 8.86e-05 ***
## stateOH:year -0.59658 0.13271 -4.495 7.29e-05 ***
## statePA:year -0.26793 0.09771 -2.742 0.00956 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9403 on 35 degrees of freedom
## Multiple R-squared: 0.9738, Adjusted R-squared: 0.9671
## F-statistic: 144.6 on 9 and 35 DF, p-value: < 2.2e-16
Am I reading that right? \(R^2\) of 0.97! That sounds too good to be true. Let’s look at some diagnostic plots.
lung_fit <- aq_lung_df %>%
modelr::add_residuals(step_bic_aq) %>%
modelr::add_predictions(step_bic_aq)
lung_fit %>%
ggplot(aes(x = pred, y = resid)) + geom_point() + labs(x = "Predicted value", y = "Residual")
These residuals look great! Centered around 0 in a relatively even band. How about normality?
plot(step_bic_aq, which = 2)
That’s not ideal. I don’t love Box-Cox, but I wonder if it might give us some insight.
MASS::boxcox(step_bic_aq)
Really doesn’t help us; the best power of our response variable is about 1. Are there any influential points/high-leverage points?
plot(step_bic_aq, which = 5)
No datapoint with a Cook’s distance greater than 0.5, that’s good. Are we overfit?
What happens if we cross-validate against simpler models? We will use leave-one-out and Monte Carlo cross-validation.
library(modelr)
set.seed(15)
cv_df <- crossv_loo(aq_lung_df)
cv_df <- cv_df %>%
mutate(
#Ozone only
ozone = map(train, ~lm(`lung cancer` ~ o3_med_pred_Spring + o3_med_pred_Summer +
o3_med_pred_Fall, data = .x)),
#Ozone with year and state
ozone_state_year = map(train, ~lm(`lung cancer` ~ o3_med_pred_Spring + o3_med_pred_Summer +
o3_med_pred_Fall + year + state, data = .x)),
#Ozone with year and state with time interactions
ozone_state_year2 = map(train, ~lm(`lung cancer` ~ year*(o3_med_pred_Spring +
o3_med_pred_Summer +
o3_med_pred_Fall) + year + state +
year*state, data = .x)),
#BIC model
bic_fit = map(train, ~step_bic_aq, data = .x)) %>%
mutate(
rmse_ozone = map2_dbl(ozone, test, ~rmse(model = .x, data = .y)),
rmse_ozone_state_year = map2_dbl(ozone_state_year, test, ~rmse(model = .x, data = .y)),
rmse_ozone_state_year2 = map2_dbl(ozone_state_year2, test, ~rmse(model = .x, data = .y)),
rmse_bic_fit = map2_dbl(bic_fit, test, ~rmse(model = .x, data = .y)))
cv_df %>%
select(starts_with("rmse")) %>%
pivot_longer(
everything(),
names_to = "model",
values_to = "rmse",
names_prefix = "rmse_") %>%
mutate(model = fct_inorder(model)) %>%
ggplot(aes(x = model, y = rmse, fill = model)) + geom_violin() + labs(x = "Model", y = "RMSE")
Nice! Our BIC fit does the best on leave-one-out. What about Monte Carlo (1000 simulations, holdout 20%)?
set.seed(15)
cv_df <- crossv_mc(aq_lung_df, n = 1000)
cv_df <- cv_df %>%
mutate(
#Ozone only
ozone = map(train, ~lm(`lung cancer` ~ o3_med_pred_Spring + o3_med_pred_Summer +
o3_med_pred_Fall, data = .x)),
#Ozone with year and state
ozone_state_year = map(train, ~lm(`lung cancer` ~ o3_med_pred_Spring + o3_med_pred_Summer +
o3_med_pred_Fall + year + state, data = .x)),
#Ozone with year and state with time interactions
ozone_state_year2 = map(train, ~lm(`lung cancer` ~ year*(o3_med_pred_Spring +
o3_med_pred_Summer +
o3_med_pred_Fall) + year + state +
year*state, data = .x)),
#BIC model
bic_fit = map(train, ~step_bic_aq, data = .x)) %>%
mutate(
rmse_ozone = map2_dbl(ozone, test, ~rmse(model = .x, data = .y)),
rmse_ozone_state_year = map2_dbl(ozone_state_year, test, ~rmse(model = .x, data = .y)),
rmse_ozone_state_year2 = map2_dbl(ozone_state_year2, test, ~rmse(model = .x, data = .y)),
rmse_bic_fit = map2_dbl(bic_fit, test, ~rmse(model = .x, data = .y)))
cv_df %>%
select(starts_with("rmse")) %>%
pivot_longer(
everything(),
names_to = "model",
values_to = "rmse",
names_prefix = "rmse_") %>%
mutate(model = fct_inorder(model)) %>%
ggplot(aes(x = model, y = rmse, fill = model)) + geom_violin() + labs(x = "Model", y = "RMSE")
Our BIC fit wins again. But there’s still something fishy. First, this model has 10 terms and just 45 datapoints - it should almost certainly be overfit, but it still cross-validates well. Second, the \(R^2\) for this model is 0.97 - how can that be when it doesn’t include smoking, which is probably the main determinant of lung cancer? Which variables in this model are the most important?
broom::tidy(step_bic_aq) %>%
mutate(
abs_t = abs(statistic),
term = as.factor(term),
term = fct_reorder(term, abs_t)
) %>%
ggplot(aes(x = abs_t, y = term)) +
geom_col() +
labs(title = "Relative Importance of Model Terms",
y = "Model Term", x = "|t| statistic")
As before, we create several bivariate plots before constructing our model.
state_analysis %>%
filter(year %in% 2001:2015) %>%
ggpairs(columns = c("year","melanoma","edd_Spring","edd_Summer"),
mapping = aes(group = state, color = state),
columnLabels = c("Year", "Melanoma/100,000", "Spring EDD (J/m^2)", "Summer EDD (J/m^2)"))
That’s weird. There seem to be negative correlations between EDD and melanoma when we break down by state, but positive correlations overall—an instance of Simpson’s paradox.
How well does UV radiation predict melanoma? Let’s find out. This time we limit our analysis to 2005-2015.
uv_df <- state_2001_2015 %>%
filter(year >= 2005) %>%
select(-c(starts_with("pm25"), starts_with("o3"), `lung cancer`))
mel_fit <- lm(melanoma ~ .^2, data = uv_df)
step_bic_mel <- step(mel_fit, trace = 0, k = log(nobs(mel_fit)), direction = "backward")
summary(step_bic_mel)
##
## Call:
## lm(formula = melanoma ~ state + year + edd_Fall + edd_Spring +
## edd_Summer + edd_Winter + state:year + state:edd_Fall + state:edd_Spring +
## state:edd_Summer + year:edd_Fall + year:edd_Spring + year:edd_Winter +
## edd_Fall:edd_Spring + edd_Fall:edd_Summer + edd_Fall:edd_Winter +
## edd_Spring:edd_Summer, data = uv_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.00663 -0.22621 -0.07181 0.16824 1.02878
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.906e+03 3.716e+03 -1.051 0.31794
## stateOH 6.941e+01 6.487e+02 0.107 0.91690
## statePA -1.975e+03 5.958e+02 -3.314 0.00783 **
## year 1.675e+00 1.774e+00 0.944 0.36745
## edd_Fall 4.838e+00 2.199e+00 2.200 0.05244 .
## edd_Spring -3.220e+00 1.205e+00 -2.673 0.02338 *
## edd_Summer 1.609e-01 6.231e-02 2.582 0.02731 *
## edd_Winter 1.139e+01 5.538e+00 2.056 0.06683 .
## stateOH:year -1.484e-01 3.204e-01 -0.463 0.65316
## statePA:year 9.299e-01 2.881e-01 3.228 0.00905 **
## stateOH:edd_Fall 3.296e-02 1.307e-02 2.522 0.03029 *
## statePA:edd_Fall 1.259e-02 5.626e-03 2.238 0.04917 *
## stateOH:edd_Spring 1.704e-02 9.479e-03 1.797 0.10250
## statePA:edd_Spring 2.490e-03 4.707e-03 0.529 0.60834
## stateOH:edd_Summer 3.830e-02 1.258e-02 3.045 0.01235 *
## statePA:edd_Summer 2.369e-02 8.521e-03 2.780 0.01945 *
## year:edd_Fall -2.257e-03 1.051e-03 -2.148 0.05722 .
## year:edd_Spring 1.645e-03 5.939e-04 2.769 0.01981 *
## year:edd_Winter -5.636e-03 2.758e-03 -2.044 0.06825 .
## edd_Fall:edd_Spring 2.190e-05 1.333e-05 1.643 0.13139
## edd_Fall:edd_Summer -8.768e-05 3.309e-05 -2.650 0.02433 *
## edd_Fall:edd_Winter -5.589e-05 3.489e-05 -1.602 0.14020
## edd_Spring:edd_Summer -3.116e-05 1.509e-05 -2.065 0.06583 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.886 on 10 degrees of freedom
## Multiple R-squared: 0.9643, Adjusted R-squared: 0.8858
## F-statistic: 12.28 on 22 and 10 DF, p-value: 0.0001287
This model is huge, it’s almost certainly overfit. I’m sure the diagnostics will be terrible.
mel_df <- uv_df %>%
modelr::add_residuals(step_bic_mel) %>%
modelr::add_predictions(step_bic_mel)
mel_df %>%
ggplot(aes(x = pred, y = resid)) + geom_point() + labs(x = "Predicted value", y = "Residual")
Residuals suggest a bit of heteroscedasticity and curvature but not as bad as I imagined. How about normality?
plot(step_bic_mel, which = 2)
Definitely not normal. High leverage points?
plot(step_bic_mel, which = 5)
Several high leverage points. I think we want a different model. Lets’ try a simple one without interactions.
mel_fit2 <- lm(melanoma ~ ., data = uv_df)
summary(mel_fit2)
##
## Call:
## lm(formula = melanoma ~ ., data = uv_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.0445 -1.0876 -0.1930 0.7735 2.6940
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.639e+02 1.916e+02 -5.031 3.44e-05 ***
## stateOH 6.496e+00 1.633e+00 3.978 0.000524 ***
## statePA 5.271e+00 1.186e+00 4.446 0.000157 ***
## year 5.008e-01 9.507e-02 5.268 1.86e-05 ***
## edd_Fall -2.872e-03 2.427e-03 -1.183 0.247777
## edd_Spring -1.870e-03 1.459e-03 -1.282 0.211710
## edd_Summer -3.911e-03 2.221e-03 -1.761 0.090482 .
## edd_Winter -6.473e-03 5.996e-03 -1.080 0.290657
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.488 on 25 degrees of freedom
## Multiple R-squared: 0.7484, Adjusted R-squared: 0.6779
## F-statistic: 10.62 on 7 and 25 DF, p-value: 3.837e-06
This comes at a big cost to explanatory power. Notably, the EDD variables aren’t significant. How do the residuals look?
mel_df2 <- uv_df %>%
modelr::add_residuals(mel_fit2) %>%
modelr::add_predictions(mel_fit2)
mel_df2 %>%
ggplot(aes(x = pred, y = resid)) + geom_point() + labs(x = "Predicted value", y = "Residual")
We have curvature in the residuals. What is the source?
summer_resid <- mel_df2 %>%
ggplot(aes(x = edd_Summer, y = resid)) + geom_point() + labs(x = "Summer Median EDD", y = "Residual")
year_resid <- mel_df2 %>%
ggplot(aes(x = year, y = resid)) + geom_point() + labs(x = "Year", y = "Residual")
fall_resid <- mel_df2 %>%
ggplot(aes(x = edd_Fall, y = resid)) + geom_point() + labs(x = "Fall Median EDD", y = "Residual")
state_resid <- mel_df2 %>%
ggplot(aes(x = state, y = resid)) + geom_violin() + labs(x = "State", y = "Residual")
(summer_resid + year_resid) / (fall_resid + state_resid)
Seems like fall EDD and year might be the issues. Try transforming.
mel_fit3 <- lm(melanoma ~ state + I(1/year^3) +
edd_Spring + edd_Summer + edd_Fall + edd_Winter,
data = uv_df)
summary(mel_fit3)
##
## Call:
## lm(formula = melanoma ~ state + I(1/year^3) + edd_Spring + edd_Summer +
## edd_Fall + edd_Winter, data = uv_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.0451 -1.0908 -0.1952 0.7702 2.7017
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.778e+02 6.453e+01 5.855 4.16e-06 ***
## stateOH 6.492e+00 1.635e+00 3.970 0.000535 ***
## statePA 5.268e+00 1.187e+00 4.437 0.000160 ***
## I(1/year^3) -2.721e+12 5.179e+11 -5.254 1.94e-05 ***
## edd_Spring -1.866e-03 1.461e-03 -1.277 0.213248
## edd_Summer -3.916e-03 2.224e-03 -1.760 0.090576 .
## edd_Fall -2.862e-03 2.430e-03 -1.178 0.249980
## edd_Winter -6.458e-03 6.005e-03 -1.075 0.292442
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.49 on 25 degrees of freedom
## Multiple R-squared: 0.7476, Adjusted R-squared: 0.677
## F-statistic: 10.58 on 7 and 25 DF, p-value: 3.97e-06
mel_df3 <- uv_df %>%
modelr::add_residuals(mel_fit3) %>%
modelr::add_predictions(mel_fit3)
mel_df3 %>%
ggplot(aes(x = pred, y = resid)) + geom_point() + labs(x = "Predicted value", y = "Residual")
Not much better. How are the other diagnostics?
plot(mel_fit2, which = 2)
Not very normal…
plot(mel_fit2, which = 5)
No high-leverage points. Are we overfit? We only have 33 rows.
set.seed(15)
cv_df <- crossv_loo(uv_df)
cv_df <- cv_df %>%
mutate(
#State and year only
state_year = map(train, ~lm(melanoma ~ state + year, data = .x)),
#EDD only
edd = map(train, ~lm(melanoma ~ edd_Winter + edd_Spring + edd_Summer + edd_Fall, data = .x)),
#EDD with year and state, linear
edd_state_year = map(train, ~mel_fit2, data = .x),
#EDD with year and state with time interactions
edd_state_year2 = map(train, ~lm(melanoma ~ year*(edd_Winter +
edd_Spring +
edd_Summer +
edd_Fall) + year + state +
year*state, data = .x)),
#BIC model
bic_fit = map(train, ~step_bic_mel, data = .x)) %>%
mutate(
rmse_state_year = map2_dbl(state_year, test, ~rmse(model = .x, data = .y)),
rmse_edd = map2_dbl(edd, test, ~rmse(model = .x, data = .y)),
rmse_edd_state_year = map2_dbl(edd_state_year, test, ~rmse(model = .x, data = .y)),
rmse_edd_state_year2 = map2_dbl(edd_state_year2, test, ~rmse(model = .x, data = .y)),
rmse_bic_fit = map2_dbl(bic_fit, test, ~rmse(model = .x, data = .y)))
cv_df %>%
select(starts_with("rmse")) %>%
pivot_longer(
everything(),
names_to = "model",
values_to = "rmse",
names_prefix = "rmse_") %>%
mutate(model = fct_inorder(model)) %>%
ggplot(aes(x = model, y = rmse, fill = model)) + geom_violin() + labs(x = "Model", y = "RMSE")
The BIC fit wins? How? Will it survive 1000 Monte Carlo simulations?
set.seed(15)
cv_df <- crossv_mc(uv_df, 1000)
cv_df <- cv_df %>%
mutate(
#State and year only
state_year = map(train, ~lm(melanoma ~ state + year, data = .x)),
#EDD only
edd = map(train, ~lm(melanoma ~ edd_Winter + edd_Spring + edd_Summer + edd_Fall, data = .x)),
#EDD with year and state, linear
edd_state_year = map(train, ~mel_fit2, data = .x),
#EDD with year and state with time interactions
edd_state_year2 = map(train, ~lm(melanoma ~ year*(edd_Winter +
edd_Spring +
edd_Summer +
edd_Fall) + year + state +
year*state, data = .x)),
#BIC model
bic_fit = map(train, ~step_bic_mel, data = .x)) %>%
mutate(
rmse_state_year = map2_dbl(state_year, test, ~rmse(model = .x, data = .y)),
rmse_edd = map2_dbl(edd, test, ~rmse(model = .x, data = .y)),
rmse_edd_state_year = map2_dbl(edd_state_year, test, ~rmse(model = .x, data = .y)),
rmse_edd_state_year2 = map2_dbl(edd_state_year2, test, ~rmse(model = .x, data = .y)),
rmse_bic_fit = map2_dbl(bic_fit, test, ~rmse(model = .x, data = .y)))
cv_df %>%
select(starts_with("rmse")) %>%
pivot_longer(
everything(),
names_to = "model",
values_to = "rmse",
names_prefix = "rmse_") %>%
mutate(model = fct_inorder(model)) %>%
ggplot(aes(x = model, y = rmse, fill = model)) + geom_violin() + labs(x = "Model", y = "RMSE")
That model must be disgustingly overfit… right? How important is each variable?
broom::tidy(step_bic_mel) %>%
mutate(
abs_t = abs(statistic),
term = as.factor(term),
term = fct_reorder(term, abs_t)
) %>%
ggplot(aes(x = abs_t, y = term)) +
geom_col() +
labs(title = "Relative Importance of Model Terms",
y = "Model Term", x = "|t| statistic")
However, we see a different pattern of importance from the model with just linear terms:
broom::tidy(mel_fit2) %>%
mutate(
abs_t = abs(statistic),
term = as.factor(term),
term = fct_reorder(term, abs_t)
) %>%
ggplot(aes(x = abs_t, y = term)) +
geom_col() +
labs(title = "Relative Importance of Model Terms",
y = "Model Term", x = "|t| statistic")
Time is a significant predictor in many of the above models. Will we still have good models without it? In exchange, we get a lot more data - resolution at the county level instead of the state level.
We create our county-level dataframe:
outcomes_wider_county <- outcomes_county %>%
pivot_wider(names_from = outcome, values_from = age_adjusted_incidence_rate)
#county-level analysis - fewer predictors
ap_uv_county_slim <- ap_uv %>%
filter(state %in% c("NY","OH","PA")) %>%
select(-c(pm25_max_pred, pm25_mean_pred, pm25_pop_pred,
o3_max_pred, o3_mean_pred, o3_pop_pred, i324,
i310, i305, i380, edr)) %>%
group_by(county, state, season) %>%
#Take medians across years for each season
summarize(across(pm25_med_pred:edd, median, na.rm = TRUE)) %>%
pivot_wider(names_from = season, values_from = pm25_med_pred:edd)
county_df <- right_join(outcomes_wider_county, ap_uv_county_slim)
Again, we look at correlations among the predictors:
county_corr <- county_df %>%
select(pm25_med_pred_Fall:edd_Winter) %>%
do(as.data.frame(cor(., method="pearson", use="pairwise.complete.obs"))) %>%
knitr::kable()
county_corr
pm25_med_pred_Fall | pm25_med_pred_Spring | pm25_med_pred_Summer | pm25_med_pred_Winter | o3_med_pred_Fall | o3_med_pred_Spring | o3_med_pred_Summer | o3_med_pred_Winter | edd_Fall | edd_Spring | edd_Summer | edd_Winter | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
pm25_med_pred_Fall | 1.0000000 | 0.9638443 | 0.8616069 | 0.9052818 | 0.2291057 | 0.0146364 | 0.8384443 | -0.6956225 | 0.7684484 | 0.8022451 | 0.8870468 | 0.2349939 |
pm25_med_pred_Spring | 0.9638443 | 1.0000000 | 0.9158741 | 0.9028251 | 0.2809823 | 0.0353524 | 0.8916228 | -0.6912378 | 0.7924249 | 0.8272669 | 0.8918020 | 0.3220468 |
pm25_med_pred_Summer | 0.8616069 | 0.9158741 | 1.0000000 | 0.8265179 | 0.2529572 | 0.0051948 | 0.8895923 | -0.6914356 | 0.7773812 | 0.8104157 | 0.7624534 | 0.4960550 |
pm25_med_pred_Winter | 0.9052818 | 0.9028251 | 0.8265179 | 1.0000000 | -0.0004523 | -0.1934251 | 0.8279097 | -0.7931456 | 0.6843229 | 0.7790970 | 0.7957184 | 0.3151852 |
o3_med_pred_Fall | 0.2291057 | 0.2809823 | 0.2529572 | -0.0004523 | 1.0000000 | 0.8829294 | 0.3808405 | 0.3893814 | 0.3971814 | 0.2426730 | 0.3636476 | 0.0583694 |
o3_med_pred_Spring | 0.0146364 | 0.0353524 | 0.0051948 | -0.1934251 | 0.8829294 | 1.0000000 | 0.1154320 | 0.6116594 | 0.1717104 | 0.0137604 | 0.1016028 | -0.0893092 |
o3_med_pred_Summer | 0.8384443 | 0.8916228 | 0.8895923 | 0.8279097 | 0.3808405 | 0.1154320 | 1.0000000 | -0.6201207 | 0.8356498 | 0.8538449 | 0.8446666 | 0.4939809 |
o3_med_pred_Winter | -0.6956225 | -0.6912378 | -0.6914356 | -0.7931456 | 0.3893814 | 0.6116594 | -0.6201207 | 1.0000000 | -0.5442261 | -0.6571396 | -0.5928370 | -0.3705689 |
edd_Fall | 0.7684484 | 0.7924249 | 0.7773812 | 0.6843229 | 0.3971814 | 0.1717104 | 0.8356498 | -0.5442261 | 1.0000000 | 0.9045582 | 0.8531265 | 0.6652107 |
edd_Spring | 0.8022451 | 0.8272669 | 0.8104157 | 0.7790970 | 0.2426730 | 0.0137604 | 0.8538449 | -0.6571396 | 0.9045582 | 1.0000000 | 0.8444612 | 0.6335905 |
edd_Summer | 0.8870468 | 0.8918020 | 0.7624534 | 0.7957184 | 0.3636476 | 0.1016028 | 0.8446666 | -0.5928370 | 0.8531265 | 0.8444612 | 1.0000000 | 0.3542735 |
edd_Winter | 0.2349939 | 0.3220468 | 0.4960550 | 0.3151852 | 0.0583694 | -0.0893092 | 0.4939809 | -0.3705689 | 0.6652107 | 0.6335905 | 0.3542735 | 1.0000000 |
We average the particulate matter because the seasonal variables are highly correlated.
county_df2 <- county_df %>%
mutate(pm25_med_pred = rowMeans(select(., starts_with("pm25")))) %>%
select(-c(pm25_med_pred_Fall:pm25_med_pred_Winter))
Now we can start the cross-sectional analysis.
county_df2 %>%
ggpairs(columns = c("lung cancer", "o3_med_pred_Summer", "o3_med_pred_Fall", "pm25_med_pred"),
mapping = aes(group = state, color = state),
columnLabels = c("Lung Cancer/100,000","Median summer O3, ppm", "Median fall O3, ppm",
"PM 2.5, ug/m^3"))
Some of these correlations seem quite flat, while others rather strong, and these trends can reverse when we break down by state.
Time for another model. Once more we try to predict lung cancer as a function of air quality variables.
aq_lung_df2 <- county_df2 %>%
select(-c(melanoma, asthma, starts_with("edd"), county, fips))
fit_aq_cty <- lm(`lung cancer` ~ .^2, data = aq_lung_df2)
step_bic_aq_cty <- step(fit_aq_cty, trace = 0, k = log(nobs(fit_aq)), direction = "backward")
summary(step_bic_aq_cty)
##
## Call:
## lm(formula = `lung cancer` ~ state + o3_med_pred_Fall + o3_med_pred_Spring +
## o3_med_pred_Winter + state:o3_med_pred_Fall + state:o3_med_pred_Spring +
## o3_med_pred_Fall:o3_med_pred_Winter + o3_med_pred_Spring:o3_med_pred_Winter,
## data = aq_lung_df2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -29.2067 -6.0740 -0.0349 6.4618 25.7443
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -479.6378 191.3894 -2.506 0.012989 *
## stateOH 4.8048 57.9248 0.083 0.933974
## statePA 157.7313 58.0866 2.715 0.007186 **
## o3_med_pred_Fall -34.9097 15.5973 -2.238 0.026290 *
## o3_med_pred_Spring 36.9504 14.4287 2.561 0.011163 *
## o3_med_pred_Winter 16.6532 7.2771 2.288 0.023137 *
## stateOH:o3_med_pred_Fall 13.1302 2.6875 4.886 2.08e-06 ***
## statePA:o3_med_pred_Fall 7.8972 2.7127 2.911 0.004000 **
## stateOH:o3_med_pred_Spring -9.5845 2.7983 -3.425 0.000743 ***
## statePA:o3_med_pred_Spring -9.3894 2.6316 -3.568 0.000448 ***
## o3_med_pred_Fall:o3_med_pred_Winter 1.0163 0.5109 1.989 0.047983 *
## o3_med_pred_Spring:o3_med_pred_Winter -1.0880 0.4819 -2.258 0.025020 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.595 on 204 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.4063, Adjusted R-squared: 0.3743
## F-statistic: 12.69 on 11 and 204 DF, p-value: < 2.2e-16
\(R^2 = 0.37\) is much more reasonable than what we were seeing before. Interestingly this model doesn’t have any particulate matter terms. How do the diagnostics look?
lung_fit_cty <- aq_lung_df2 %>%
modelr::add_residuals(step_bic_aq_cty) %>%
modelr::add_predictions(step_bic_aq_cty)
lung_fit_cty %>%
ggplot(aes(x = pred, y = resid)) + geom_point() + labs(x = "Predicted value", y = "Residual")
We have some heteroscedasticity, but otherwise beautiful residuals. How about normality?
plot(step_bic_aq_cty, which = 2)
Not perfect, but a lot better than the state-level model. High-leverage points?
plot(step_bic_aq_cty, which = 5)
No high-leverage points.
Are we overfit? We have 217 rows. Use Monte Carlo cross-validation against simpler models.
set.seed(15)
cv_df <- crossv_mc(aq_lung_df2, 1000)
cv_df <- cv_df %>%
mutate(
#PM only
pm = map(train, ~lm(`lung cancer` ~ pm25_med_pred, data = .x)),
#O3 only
o3 = map(train, ~lm(`lung cancer` ~ o3_med_pred_Fall +
o3_med_pred_Winter +
o3_med_pred_Spring +
o3_med_pred_Summer, data = .x)),
#O3 and PM
o3_pm = map(train, ~lm(`lung cancer` ~ o3_med_pred_Fall +
o3_med_pred_Winter +
o3_med_pred_Spring +
o3_med_pred_Summer +
pm25_med_pred, data = .x)),
#O3, PM, and State
o3_pm_state = map(train, ~lm(`lung cancer` ~ ., data = .x)),
#BIC model
bic_fit = map(train, ~step_bic_aq_cty, data = .x)) %>%
mutate(
rmse_pm25 = map2_dbl(pm, test, ~rmse(model = .x, data = .y)),
rmse_o3 = map2_dbl(o3, test, ~rmse(model = .x, data = .y)),
rmse_o3_pm25 = map2_dbl(o3_pm, test, ~rmse(model = .x, data = .y)),
rmse_o3_pm25_state = map2_dbl(o3_pm_state, test, ~rmse(model = .x, data = .y)),
rmse_bic_fit = map2_dbl(bic_fit, test, ~rmse(model = .x, data = .y)))
cv_df %>%
select(starts_with("rmse")) %>%
pivot_longer(
everything(),
names_to = "model",
values_to = "rmse",
names_prefix = "rmse_") %>%
mutate(model = fct_inorder(model)) %>%
ggplot(aes(x = model, y = rmse, fill = model)) + geom_violin() + labs(x = "Model", y = "RMSE")
Once again the BIC model wins. But we actually shouldn’t be upset about it this time. Our model has 11 terms and 217 rows—it’s not like we only have 45 rows for that many predictors. Maybe it really isn’t overfit! How important are the predictors relative to one another?
broom::tidy(step_bic_aq_cty) %>%
mutate(
abs_t = abs(statistic),
term = as.factor(term),
term = fct_reorder(term, abs_t)
) %>%
ggplot(aes(x = abs_t, y = term)) +
geom_col() +
labs(title = "Relative Importance of Model Terms",
y = "Model Term", x = "|t| statistic")
The strongest terms in this model are the state-by-ozone interactions. I wonder how much policy and industry differ between the two states.
In the following analysis, we will only use those rows for which we have complete data (191), as there is some missingness in the response variable.
asth_df <- county_df2 %>%
filter(!is.na(asthma))
asth_df %>%
ggpairs(columns = c("asthma", "o3_med_pred_Summer", "o3_med_pred_Fall", "pm25_med_pred"),
mapping = aes(group = state, color = state),
columnLabels = c("Asthma/100,000","Median summer O3, ppm", "Median fall O3, ppm",
"PM 2.5, ug/m^3"))
None of these exposures seem very strongly correlated with the outcome, though summer ozone correlates pretty strongly with particulate matter. Also, we may have some outliers.
aq_asth_df <- asth_df %>%
select(-c(melanoma, `lung cancer`, starts_with("edd"), county, fips))
fit_asth_cty <- lm(asthma ~ .^2, data = aq_asth_df)
step_bic_asth_cty <- step(fit_asth_cty, trace = 0, k = log(nobs(fit_aq)), direction = "backward")
summary(step_bic_asth_cty)
##
## Call:
## lm(formula = asthma ~ state + o3_med_pred_Fall + o3_med_pred_Spring +
## state:o3_med_pred_Fall + o3_med_pred_Fall:o3_med_pred_Spring,
## data = aq_asth_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -115.202 -12.168 -1.760 8.629 244.506
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3915.2387 613.0669 6.386 1.34e-09 ***
## stateOH 62.9317 138.5573 0.454 0.65022
## statePA 391.3805 146.8512 2.665 0.00838 **
## o3_med_pred_Fall -115.7822 21.1637 -5.471 1.44e-07 ***
## o3_med_pred_Spring -86.7224 15.5627 -5.572 8.77e-08 ***
## stateOH:o3_med_pred_Fall -2.6666 4.4104 -0.605 0.54617
## statePA:o3_med_pred_Fall -12.6007 4.6982 -2.682 0.00798 **
## o3_med_pred_Fall:o3_med_pred_Spring 2.6020 0.5183 5.020 1.21e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 30.13 on 185 degrees of freedom
## Multiple R-squared: 0.597, Adjusted R-squared: 0.5817
## F-statistic: 39.15 on 7 and 185 DF, p-value: < 2.2e-16
Strong \(R^2\) value but not absurd. Weird that fall ozone is protective (and strongly so). How do the residuals look?
asth_cty <- aq_asth_df %>%
modelr::add_residuals(step_bic_asth_cty) %>%
modelr::add_predictions(step_bic_asth_cty)
asth_cty %>%
ggplot(aes(x = pred, y = resid)) + geom_point() + labs(x = "Predicted value", y = "Residual")
Holy outliers, Batman!
plot(step_bic_asth_cty, which = 5)
We should filter out those high-leverage points and refit.
aq_asth_df2 <- aq_asth_df %>%
slice(-c(3,31,185))
fit_asth_cty <- lm(asthma ~ .^2, data = aq_asth_df2)
step_bic_asth_cty <- step(fit_asth_cty, trace = 0, k = log(nobs(fit_aq)), direction = "backward")
summary(step_bic_asth_cty)
##
## Call:
## lm(formula = asthma ~ state + o3_med_pred_Fall + o3_med_pred_Spring +
## o3_med_pred_Winter + state:o3_med_pred_Fall + state:o3_med_pred_Winter,
## data = aq_asth_df2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -44.875 -11.021 -0.954 8.533 63.721
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 523.0156 56.8882 9.194 < 2e-16 ***
## stateOH 42.6557 76.5167 0.557 0.577899
## statePA -82.5309 94.0678 -0.877 0.381463
## o3_med_pred_Fall -0.6742 2.6691 -0.253 0.800855
## o3_med_pred_Spring -6.7508 2.2294 -3.028 0.002823 **
## o3_med_pred_Winter -4.5638 1.6210 -2.815 0.005414 **
## stateOH:o3_med_pred_Fall 11.9977 3.5559 3.374 0.000907 ***
## statePA:o3_med_pred_Fall -4.6986 3.8175 -1.231 0.220011
## stateOH:o3_med_pred_Winter -16.8004 4.3672 -3.847 0.000166 ***
## statePA:o3_med_pred_Winter 7.2681 2.4625 2.951 0.003584 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.25 on 180 degrees of freedom
## Multiple R-squared: 0.6123, Adjusted R-squared: 0.5929
## F-statistic: 31.59 on 9 and 180 DF, p-value: < 2.2e-16
What does it look like now?
asth_cty <- aq_asth_df2 %>%
modelr::add_residuals(step_bic_asth_cty) %>%
modelr::add_predictions(step_bic_asth_cty)
asth_cty %>%
ggplot(aes(x = pred, y = resid)) + geom_point() + labs(x = "Predicted value", y = "Residual")
plot(step_bic_asth_cty, which = 2)
Outliers are gone… but we have serious abnormality. Tried to transform by taking the square root of the response, but it wasn’t better.
Sometimes a smaller model is better. Is this one of those times?
set.seed(15)
cv_df <- crossv_mc(aq_asth_df2, 1000)
cv_df <- cv_df %>%
mutate(
#PM only
pm = map(train, ~lm(asthma ~ pm25_med_pred, data = .x)),
#O3 only
o3 = map(train, ~lm(asthma ~ o3_med_pred_Fall +
o3_med_pred_Winter +
o3_med_pred_Spring +
o3_med_pred_Summer, data = .x)),
#O3 and PM
o3_pm = map(train, ~lm(asthma ~ o3_med_pred_Fall +
o3_med_pred_Winter +
o3_med_pred_Spring +
o3_med_pred_Summer +
pm25_med_pred, data = .x)),
#O3, PM, and State
o3_pm_state = map(train, ~lm(asthma ~ ., data = .x)),
#BIC model
bic_fit = map(train, ~step_bic_asth_cty, data = .x)) %>%
mutate(
rmse_pm25 = map2_dbl(pm, test, ~rmse(model = .x, data = .y)),
rmse_o3 = map2_dbl(o3, test, ~rmse(model = .x, data = .y)),
rmse_o3_pm25 = map2_dbl(o3_pm, test, ~rmse(model = .x, data = .y)),
rmse_o3_pm25_state = map2_dbl(o3_pm_state, test, ~rmse(model = .x, data = .y)),
rmse_bic_fit = map2_dbl(bic_fit, test, ~rmse(model = .x, data = .y)))
cv_df %>%
select(starts_with("rmse")) %>%
pivot_longer(
everything(),
names_to = "model",
values_to = "rmse",
names_prefix = "rmse_") %>%
mutate(model = fct_inorder(model)) %>%
ggplot(aes(x = model, y = rmse, fill = model)) + geom_violin() + labs(x = "Model", y = "RMSE")
This is pretty weird. What terms are the most important, according to the BIC model?
broom::tidy(step_bic_asth_cty) %>%
mutate(
abs_t = abs(statistic),
term = as.factor(term),
term = fct_reorder(term, abs_t)
) %>%
ggplot(aes(x = abs_t, y = term)) +
geom_col() +
labs(title = "Relative Importance of Model Terms",
y = "Model Term", x = "|t| statistic")
State interactions with ozone again? This might be something to research.
county_df2 %>%
ggpairs(columns = c("melanoma", "edd_Summer", "edd_Fall", "edd_Winter", "edd_Spring"),
mapping = aes(group = state, color = state),
columnLabels = c("Melanoma/100,000","Median summer EDD", "Median fall EDD",
"Median winter EDD", "Median spring EDD"))
No season has a strong correlation with melanoma, but the seasons are moderately to strongly correlated with one another, spring and fall especially.
mel_cty <- county_df2 %>%
select(state, melanoma, starts_with("edd"))
fit_mel_cty <- lm(melanoma ~ .^2 +
I(edd_Spring^3) +
I(edd_Summer^3) +
I(edd_Fall^3) +
I(edd_Winter^3), data = mel_cty)
step_bic_mel_cty <- step(fit_mel_cty, trace = 0, k = log(nobs(fit_aq)), direction = "backward")
summary(step_bic_mel_cty)
##
## Call:
## lm(formula = melanoma ~ state + edd_Spring + edd_Summer + edd_Winter +
## state:edd_Winter + edd_Spring:edd_Winter + edd_Summer:edd_Winter,
## data = mel_cty)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.7539 -3.3520 0.1517 3.4794 16.4805
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.642e+01 9.886e+01 0.874 0.383052
## stateOH -2.945e+01 1.061e+01 -2.775 0.006047 **
## statePA -4.457e+01 7.531e+00 -5.919 1.37e-08 ***
## edd_Spring 8.822e-02 2.279e-02 3.871 0.000146 ***
## edd_Summer -6.128e-02 2.947e-02 -2.079 0.038860 *
## edd_Winter -2.670e-01 2.167e-01 -1.232 0.219272
## stateOH:edd_Winter 5.758e-02 2.258e-02 2.550 0.011523 *
## statePA:edd_Winter 9.511e-02 1.552e-02 6.129 4.56e-09 ***
## edd_Spring:edd_Winter -1.800e-04 4.913e-05 -3.663 0.000318 ***
## edd_Summer:edd_Winter 1.600e-04 6.446e-05 2.482 0.013879 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.903 on 202 degrees of freedom
## (5 observations deleted due to missingness)
## Multiple R-squared: 0.267, Adjusted R-squared: 0.2343
## F-statistic: 8.175 on 9 and 202 DF, p-value: 2.477e-10
Once again we have a relatively modest \(R^2\) (0.23). How do the residuals look?
mel_fit_cty <- mel_cty %>%
modelr::add_residuals(step_bic_mel_cty) %>%
modelr::add_predictions(step_bic_mel_cty)
mel_fit_cty %>%
ggplot(aes(x = pred, y = resid)) + geom_point() + labs(x = "Predicted value", y = "Residual")
We have some heteroscedasticity but doesn’t look all that bad. Is it normal?
plot(step_bic_mel_cty, which = 2)
Not ideal, but mostly ok. High-leverage points?
plot(step_bic_mel_cty, which = 5)
No high-leverage points (Cook’s D >0.5).
set.seed(15)
cv_df <- crossv_mc(mel_cty, 1000)
cv_df <- cv_df %>%
mutate(
#Summer EDD only
summer_edd = map(train, ~lm(melanoma ~ edd_Summer, data = .x)),
#Summer and fall
summer_fall = map(train, ~lm(melanoma ~ edd_Summer + edd_Fall, data = .x)),
#Four seasons
four_seasons = map(train, ~lm(melanoma ~ edd_Fall +
edd_Winter +
edd_Spring +
edd_Summer, data = .x)),
#Four seasons and state
seasons_state = map(train, ~lm(melanoma ~ ., data = .x)),
#BIC model
bic_fit = map(train, ~step_bic_mel_cty, data = .x)) %>%
mutate(
rmse_summer_edd = map2_dbl(summer_edd, test, ~rmse(model = .x, data = .y)),
rmse_summer_fall = map2_dbl(summer_fall, test, ~rmse(model = .x, data = .y)),
rmse_four_seasons = map2_dbl(four_seasons, test, ~rmse(model = .x, data = .y)),
rmse_seasons_state = map2_dbl(seasons_state, test, ~rmse(model = .x, data = .y)),
rmse_bic_fit = map2_dbl(bic_fit, test, ~rmse(model = .x, data = .y)))
cv_df %>%
select(starts_with("rmse")) %>%
pivot_longer(
everything(),
names_to = "model",
values_to = "rmse",
names_prefix = "rmse_") %>%
mutate(model = fct_inorder(model)) %>%
ggplot(aes(x = model, y = rmse, fill = model)) + geom_violin() + labs(x = "Model", y = "RMSE")
It looks like the interaction terms matter. Just how important are they?
broom::tidy(step_bic_mel_cty) %>%
mutate(
abs_t = abs(statistic),
term = as.factor(term),
term = fct_reorder(term, abs_t)
) %>%
ggplot(aes(x = abs_t, y = term)) +
geom_col() +
labs(title = "Relative Importance of Model Terms",
y = "Model Term", x = "|t| statistic")
That’s unintuitive. One might expect summer EDD would have the biggest effect on melanoma, but in this model the winter and spring EDD terms are the most important.