6 Predictions from Logistic Regression Models

6.1 Generating predicted probabilities

# Logistic regression for transmission type (am: 0 = automatic, 1 = manual)
# with weight and horsepower as predictors.
fit <- glm(am ~ wt + hp, data = mtcars, family = binomial)
# Three hypothetical cars of increasing weight, horsepower fixed at 110.
new_cars <- data.frame(wt = c(2.2, 3.2, 3.8), hp = c(110, 110, 110))
# type = "response" returns probabilities on the 0-1 scale instead of the
# default log-odds ("link") scale.
predict(fit, newdata = new_cars, type = "response")
           1            2            3 
0.9937615708 0.0468551348 0.0003846502 

You can also visualise how predicted probability changes with a predictor (holding other predictors constant).

# Refit the model, then profile the predicted probability of a manual
# transmission across the observed weight range, holding horsepower fixed
# at its sample mean.
fit <- glm(am ~ wt + hp, data = mtcars, family = binomial)
# 50 evenly spaced weights spanning the data.
wt_seq <- seq(from = min(mtcars$wt), to = max(mtcars$wt), length.out = 50)
grid <- data.frame(wt = wt_seq, hp = mean(mtcars$hp))
# Probabilities (0-1 scale) at each grid point.
grid$prob <- predict(fit, newdata = grid, type = "response")
plot(grid$wt, grid$prob, type = "l", lwd = 2,
     xlab = "Weight", ylab = "Predicted probability")

6.2 Classification thresholds

If you turn probabilities into classes (0/1), you need a threshold. Changing the threshold trades sensitivity against specificity.

# NOTE(review): assumes `prob` holds fitted probabilities from the model,
# e.g. prob <- predict(fit, type = "response"), defined earlier in the
# chapter — confirm against the preceding section.
threshold <- 0.5
# Classify as manual (1) when the predicted probability reaches the threshold;
# fixing the factor levels guarantees a full 2x2 table even if a class is
# never predicted.
class_hat <- factor(ifelse(prob >= threshold, 1, 0), levels = c(0, 1))
actual <- factor(mtcars$am, levels = c(0, 1))
conf_mat <- table(Predicted = class_hat, Actual = actual)
conf_mat
         Actual
Predicted  0  1
        0 18  1
        1  1 12
# Pull the four cells out of the confusion matrix by dimname
# (rows = Predicted, columns = Actual).
true_pos <- conf_mat["1", "1"]
true_neg <- conf_mat["0", "0"]
false_pos <- conf_mat["1", "0"]
false_neg <- conf_mat["0", "1"]

# Accuracy: overall proportion of correct classifications.
accuracy <- (true_pos + true_neg) / sum(conf_mat)
# Sensitivity (true positive rate): share of actual 1s predicted as 1.
sensitivity <- true_pos / (true_pos + false_neg)
# Specificity (true negative rate): share of actual 0s predicted as 0.
specificity <- true_neg / (true_neg + false_pos)

c(accuracy = accuracy, sensitivity = sensitivity, specificity = specificity)
   accuracy sensitivity specificity 
  0.9375000   0.9230769   0.9473684 

6.3 ROC curves and AUC

# Sweep thresholds from 1 down to 0 so the ROC points run from (0, 0)
# towards (1, 1).
roc_thresholds <- seq(1, 0, by = -0.02)
actual <- factor(mtcars$am, levels = c(0, 1))
# Recode factor levels "0"/"1" back to integers 0/1 (as.integer on a factor
# returns the level index 1/2, hence the - 1).
actual_num <- as.integer(actual) - 1

# True positive rate at each threshold.
# NOTE(review): `prob` (fitted probabilities) must be defined earlier in the
# chapter — confirm against the preceding section.
tpr <- sapply(roc_thresholds, function(t) {
  pred <- ifelse(prob >= t, 1, 0)
  tp <- sum(pred == 1 & actual_num == 1)
  fn <- sum(pred == 0 & actual_num == 1)
  tp / (tp + fn)
})

# False positive rate at each threshold.
fpr <- sapply(roc_thresholds, function(t) {
  pred <- ifelse(prob >= t, 1, 0)
  fp <- sum(pred == 1 & actual_num == 0)
  tn <- sum(pred == 0 & actual_num == 0)
  fp / (fp + tn)
})

# Sort by FPR so the curve is drawn (and integrated) left to right.
roc_df <- data.frame(fpr = fpr, tpr = tpr)
roc_df <- roc_df[order(roc_df$fpr), ]

plot(roc_df$fpr, roc_df$tpr, type = "l", lwd = 2,
     xlab = "False positive rate", ylab = "True positive rate")
# Diagonal reference: the ROC curve of random guessing (AUC = 0.5).
abline(0, 1, lty = 3, col = "grey60")

# Area under the curve via the trapezoidal rule: each trapezoid has width
# diff(fpr) and mean height of the two adjacent tpr values.
auc <- sum(diff(roc_df$fpr) * (head(roc_df$tpr, -1) + tail(roc_df$tpr, -1)) / 2)
auc
[1] 0.9878543
Key-point 6.1

AUC near 0.5 indicates random guessing, while values closer to 1 indicate stronger discrimination.

Exercise 6.1

Using a threshold of 0.6, compute the accuracy and store it as accuracy_06.

threshold <- 0.6
class_hat <- ifelse(prob >= threshold, 1, 0)
accuracy_06 <- mean(class_hat == actual_num)
accuracy_06