Lecture 23: Logistic Regression in R#
# Load necessary libraries
library(tidyr)
library(dplyr)
library(mlogit)
library(ggplot2)
options(repr.plot.width = 12, repr.plot.height = 8)
# 2024 ITUS Individual Data (model)
url <- "https://raw.githubusercontent.com/anmpahwa/CE5540/refs/heads/main/resources/ITUS_HHD_BD.csv"
data <- read.csv(url) # Loading Data
str(data)
'data.frame': 139487 obs. of 45 variables:
$ Unique_HH_ID : chr "2024-30010-1-241-17-13-11-2-2420-4-1" "2024-30010-1-241-17-13-11-2-2420-4-2" "2024-30010-1-241-17-13-11-2-2420-4-3" "2024-30010-1-241-17-13-11-2-2420-4-4" ...
$ q1 : int 0 0 0 0 0 0 0 0 0 0 ...
$ q2 : int 1 1 1 1 1 1 1 1 1 1 ...
$ q3 : int 0 0 0 0 0 0 0 0 0 0 ...
$ q4 : int 0 0 0 0 0 0 0 0 0 0 ...
$ weekday : int 1 1 0 0 1 0 1 1 1 1 ...
$ weekend : int 0 0 1 1 0 1 0 0 0 0 ...
$ rural : int 1 1 1 1 1 1 1 1 1 1 ...
$ urban : int 0 0 0 0 0 0 0 0 0 0 ...
$ north : int 0 0 0 0 0 0 0 0 0 0 ...
$ west : int 1 1 1 1 1 1 1 1 1 1 ...
$ central : int 0 0 0 0 0 0 0 0 0 0 ...
$ east : int 0 0 0 0 0 0 0 0 0 0 ...
$ north.east : int 0 0 0 0 0 0 0 0 0 0 ...
$ south : int 0 0 0 0 0 0 0 0 0 0 ...
$ tierI : int 0 0 0 0 0 0 0 0 0 0 ...
$ tierII : int 1 1 1 1 1 1 1 1 1 1 ...
$ tierIII : int 0 0 0 0 0 0 0 0 0 0 ...
$ gender_ratio : num 0 0.5 0.4 0.556 0.5 0.5 0.5 0.429 0.5 0.333 ...
$ average_age : num 45 34.5 29.2 15 21.8 65.5 20.2 26 62.5 19.3 ...
$ not_married : int 1 0 0 0 0 0 0 0 0 0 ...
$ married : int 0 1 1 1 1 1 1 1 1 1 ...
$ not_literate : int 0 0 0 0 0 0 0 0 0 0 ...
$ primary : int 0 0 0 1 1 0 1 0 1 0 ...
$ secondary : int 0 0 1 0 0 0 0 0 0 1 ...
$ graduate_._above : int 1 1 0 0 0 1 0 1 0 0 ...
$ employment_ratio : num 1 0.25 0.4 0.222 0.333 0 0.333 0.286 0.5 0.333 ...
$ family_structure : num 0 0.25 0.4 0.778 0.5 0 0.667 0.286 0 0.333 ...
$ household_size : int 1 4 5 9 6 2 6 7 2 3 ...
$ disadvantaged : int 1 1 1 1 1 1 1 1 1 1 ...
$ not_disadvantaged: int 0 0 0 0 0 0 0 0 0 0 ...
$ no_land : int 0 0 0 0 0 0 0 0 0 0 ...
$ land_possessed : int 1 1 1 1 1 1 1 1 1 1 ...
$ low_income : int 1 0 0 0 0 0 0 0 1 1 ...
$ medium_income : int 0 0 1 1 1 0 1 1 0 0 ...
$ high_income : int 0 1 0 0 0 1 0 0 0 0 ...
$ no_dwelling : int 0 0 0 0 0 0 0 0 0 0 ...
$ kutcha : int 0 0 0 0 0 0 0 0 0 0 ...
$ pucca : int 1 1 1 1 1 1 1 1 1 1 ...
$ none : int 1 1 1 1 1 1 1 1 1 1 ...
$ instore : int 0 0 0 0 0 0 0 0 0 0 ...
$ online : int 0 0 0 0 0 0 0 0 0 0 ...
$ both : int 0 0 0 0 0 0 0 0 0 0 ...
$ choice : chr "none" "none" "none" "none" ...
$ weight : int 208857 208857 208857 208857 208857 208857 208857 208857 208857 208857 ...
# Counts
counts <- data %>%
summarise(
None = sum(none, na.rm = TRUE),
InStore = sum(instore, na.rm = TRUE),
Online = sum(online, na.rm = TRUE),
Both = sum(both, na.rm = TRUE)
) %>%
pivot_longer(cols = everything(), names_to = "channel", values_to = "count") %>%
mutate(share = (count / sum(count)))
print(counts)
# A tibble: 4 × 3
channel count share
<chr> <int> <dbl>
1 None 100417 0.720
2 InStore 34717 0.249
3 Online 3318 0.0238
4 Both 1035 0.00742
# Temporal: Quarter Counts
quarter_counts <- data %>%
summarise(
None_Q1 = sum(none * q1, na.rm = TRUE),
InStore_Q1 = sum(instore * q1, na.rm = TRUE),
Online_Q1 = sum(online * q1, na.rm = TRUE),
Both_Q1 = sum(both * q1, na.rm = TRUE),
None_Q2 = sum(none * q2, na.rm = TRUE),
InStore_Q2 = sum(instore * q2, na.rm = TRUE),
Online_Q2 = sum(online * q2, na.rm = TRUE),
Both_Q2 = sum(both * q2, na.rm = TRUE),
None_Q3 = sum(none * q3, na.rm = TRUE),
InStore_Q3 = sum(instore * q3, na.rm = TRUE),
Online_Q3 = sum(online * q3, na.rm = TRUE),
Both_Q3 = sum(both * q3, na.rm = TRUE),
None_Q4 = sum(none * q4, na.rm = TRUE),
InStore_Q4 = sum(instore * q4, na.rm = TRUE),
Online_Q4 = sum(online * q4, na.rm = TRUE),
Both_Q4 = sum(both * q4, na.rm = TRUE)
) %>%
pivot_longer(cols = everything(), names_to = "channel_quarter", values_to = "count") %>%
separate(channel_quarter, into = c("channel", "quarter"), sep = "_") %>%
group_by(quarter) %>%
mutate(share = (count / sum(count))) %>%
ungroup() %>%
mutate(channel = factor(channel, levels = c("None", "InStore", "Online", "Both")))
# Tabulate
table <- quarter_counts %>%
ungroup() %>%
select(channel, quarter, share) %>%
pivot_wider(names_from = quarter, values_from = share)
print(table)
# A tibble: 4 × 5
channel Q1 Q2 Q3 Q4
<fct> <dbl> <dbl> <dbl> <dbl>
1 None 0.735 0.718 0.711 0.715
2 InStore 0.221 0.245 0.265 0.265
3 Online 0.0352 0.0280 0.0176 0.0144
4 Both 0.00870 0.00901 0.00640 0.00558
# Spatial: Sector Counts
sector_counts <- data %>%
summarise(
None_R = sum(none * rural, na.rm = TRUE),
InStore_R = sum(instore * rural, na.rm = TRUE),
Online_R = sum(online * rural, na.rm = TRUE),
Both_R = sum(both * rural, na.rm = TRUE),
None_U = sum(none * urban, na.rm = TRUE),
InStore_U = sum(instore * urban, na.rm = TRUE),
Online_U = sum(online * urban, na.rm = TRUE),
Both_U = sum(both * urban, na.rm = TRUE)
) %>%
pivot_longer(cols = everything(), names_to = "channel_sector", values_to = "count") %>%
separate(channel_sector, into = c("channel", "sector"), sep = "_") %>%
group_by(sector) %>%
mutate(share = (count / sum(count))) %>%
ungroup() %>%
mutate(channel = factor(channel, levels = c("None", "InStore", "Online", "Both")))
# Tabulate
table <- sector_counts %>%
ungroup() %>%
select(channel, sector, share) %>%
pivot_wider(names_from = sector, values_from = share)
print(table)
# A tibble: 4 × 3
channel R U
<fct> <dbl> <dbl>
1 None 0.742 0.687
2 InStore 0.233 0.273
3 Online 0.0195 0.0301
4 Both 0.00532 0.0105
# Prepare dataset
long_data <- data %>%
select(
Unique_HH_ID,
q1, q2, q3, q4,
weekday, weekend,
urban, rural,
north, west, south, east, central, north.east,
tierI, tierII, tierIII,
gender_ratio,
average_age,
not_married, married,
not_literate, primary, secondary, graduate_._above,
employment_ratio,
family_structure,
household_size,
disadvantaged, not_disadvantaged,
no_land, land_possessed,
low_income, medium_income, high_income,
no_dwelling, kutcha, pucca,
none, instore, online, both,
weight
) %>%
pivot_longer(
cols = c(none, instore, online, both),
names_to = "alt", values_to = "bin"
) %>%
mutate(bin = bin == 1)
model_data <- mlogit.data(
long_data,
choice = "bin",
shape = "long",
chid.var = "Unique_HH_ID",
alt.var = "alt"
)
# Esitmated Model
model <- mlogit(
formula = bin ~ 1 | q1 + q2 + q3 +
weekend +
urban +
north + west + south + east + north.east +
tierI + tierII +
gender_ratio +
average_age +
married +
primary + secondary + graduate_._above +
employment_ratio +
family_structure + household_size +
disadvantaged +
medium_income + high_income,
data = model_data,
ref = "none",
weights = model_data$weight
)
summary(model)
Call:
mlogit(formula = bin ~ 1 | q1 + q2 + q3 + weekend + urban + north +
west + south + east + north.east + tierI + tierII + gender_ratio +
average_age + married + primary + secondary + graduate_._above +
employment_ratio + family_structure + household_size + disadvantaged +
medium_income + high_income, data = model_data, weights = model_data$weight,
reflevel = "none", method = "nr")
Frequencies of alternatives:choice
none both instore online
0.719902 0.007420 0.248891 0.023787
nr method
39 iterations, 0h:2m:12s
g'(-H)^-1g = 6.71E-07
gradient close to zero
Coefficients :
Estimate Std. Error z-value Pr(>|z|)
(Intercept):both -7.50010194 0.40566244 -18.4885 < 2.2e-16 ***
(Intercept):instore -2.44871154 0.06557938 -37.3397 < 2.2e-16 ***
(Intercept):online -4.62260233 0.18586035 -24.8714 < 2.2e-16 ***
q1:both 0.27248423 0.09340207 2.9173 0.0035305 **
q1:instore -0.26262944 0.01824741 -14.3927 < 2.2e-16 ***
q1:online 0.82530056 0.05361621 15.3927 < 2.2e-16 ***
q2:both 0.38607963 0.09233283 4.1814 2.897e-05 ***
q2:instore -0.10903952 0.01785131 -6.1082 1.008e-09 ***
q2:online 0.64192999 0.05537164 11.5931 < 2.2e-16 ***
q3:both 0.08402648 0.09890217 0.8496 0.3955521
q3:instore -0.00061729 0.01757590 -0.0351 0.9719832
q3:online 0.13720431 0.06124325 2.2403 0.0250703 *
weekend:both 0.32385880 0.06735832 4.8080 1.524e-06 ***
weekend:instore 0.09871909 0.01402290 7.0398 1.924e-12 ***
weekend:online 0.11430230 0.03878328 2.9472 0.0032066 **
urban:both 0.61346000 0.07081621 8.6627 < 2.2e-16 ***
urban:instore 0.26287820 0.01439634 18.2601 < 2.2e-16 ***
urban:online 0.53330188 0.03967068 13.4432 < 2.2e-16 ***
north:both 0.37967249 0.12151451 3.1245 0.0017811 **
north:instore 0.13740852 0.02836522 4.8443 1.271e-06 ***
north:online -0.04910374 0.07312652 -0.6715 0.5019083
west:both -0.46043596 0.11209716 -4.1075 4.000e-05 ***
west:instore -0.28636864 0.02286011 -12.5270 < 2.2e-16 ***
west:online -0.40476136 0.05674728 -7.1327 9.841e-13 ***
south:both -0.08761718 0.10943684 -0.8006 0.4233525
south:instore 0.26198411 0.02167232 12.0884 < 2.2e-16 ***
south:online -0.96127596 0.07048298 -13.6384 < 2.2e-16 ***
east:both 0.54895109 0.10110520 5.4295 5.651e-08 ***
east:instore 0.76183793 0.02051790 37.1304 < 2.2e-16 ***
east:online 0.48005484 0.05040068 9.5248 < 2.2e-16 ***
north.east:both -0.19979986 0.14519082 -1.3761 0.1687848
north.east:instore 0.58275749 0.02650438 21.9872 < 2.2e-16 ***
north.east:online -0.09881252 0.07850447 -1.2587 0.2081436
tierI:both 0.02482473 0.09190630 0.2701 0.7870763
tierI:instore -0.11747671 0.01890382 -6.2144 5.151e-10 ***
tierI:online -0.29655713 0.05363646 -5.5290 3.220e-08 ***
tierII:both -0.31198596 0.08331729 -3.7446 0.0001807 ***
tierII:instore -0.18795556 0.01646939 -11.4124 < 2.2e-16 ***
tierII:online -0.25949984 0.04404134 -5.8922 3.811e-09 ***
gender_ratio:both 0.17345435 0.17556556 0.9880 0.3231650
gender_ratio:instore -0.08092477 0.03178969 -2.5456 0.0109081 *
gender_ratio:online -0.11939769 0.09190333 -1.2992 0.1938869
average_age:both 0.00416919 0.00379901 1.0974 0.2724485
average_age:instore 0.00245859 0.00071798 3.4243 0.0006164 ***
average_age:online -0.00040112 0.00202380 -0.1982 0.8428865
married:both 0.49919548 0.13375063 3.7323 0.0001898 ***
married:instore 0.51545379 0.02297252 22.4378 < 2.2e-16 ***
married:online 0.46806199 0.06808541 6.8746 6.215e-12 ***
primary:both 0.55330411 0.29164178 1.8972 0.0578009 .
primary:instore 0.41688167 0.03753774 11.1057 < 2.2e-16 ***
primary:online 0.19092222 0.11192127 1.7059 0.0880338 .
secondary:both 0.95235315 0.28746412 3.3129 0.0009232 ***
secondary:instore 0.39958238 0.03771279 10.5954 < 2.2e-16 ***
secondary:online 0.31308608 0.11116137 2.8165 0.0048550 **
graduate_._above:both 1.08344586 0.28993670 3.7368 0.0001864 ***
graduate_._above:instore 0.44132739 0.03892622 11.3375 < 2.2e-16 ***
graduate_._above:online 0.36924450 0.11372743 3.2467 0.0011673 **
employment_ratio:both -0.35945562 0.15265827 -2.3546 0.0185405 *
employment_ratio:instore -0.20944491 0.02827095 -7.4085 1.277e-13 ***
employment_ratio:online -0.64836950 0.08304906 -7.8071 5.773e-15 ***
family_structure:both -1.57971163 0.24653151 -6.4077 1.477e-10 ***
family_structure:instore -0.97345855 0.04749003 -20.4982 < 2.2e-16 ***
family_structure:online -1.12979774 0.13142917 -8.5962 < 2.2e-16 ***
household_size:both 0.22466450 0.01773154 12.6703 < 2.2e-16 ***
household_size:instore 0.11700111 0.00458290 25.5299 < 2.2e-16 ***
household_size:online 0.12111446 0.01148006 10.5500 < 2.2e-16 ***
disadvantaged:both 0.03097353 0.07273313 0.4259 0.6702158
disadvantaged:instore 0.05510470 0.01520849 3.6233 0.0002909 ***
disadvantaged:online 0.06328005 0.04106580 1.5409 0.1233308
medium_income:both 0.38778625 0.10306170 3.7627 0.0001681 ***
medium_income:instore 0.18555088 0.01724786 10.7579 < 2.2e-16 ***
medium_income:online 0.21740527 0.04948799 4.3931 1.118e-05 ***
high_income:both 0.69838947 0.10638131 6.5650 5.205e-11 ***
high_income:instore 0.29568510 0.01940481 15.2377 < 2.2e-16 ***
high_income:online 0.35946690 0.05480154 6.5594 5.401e-11 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Log-Likelihood: -92780
McFadden R^2: 0.060581
Likelihood ratio test : chisq = 11966 (p.value = < 2.22e-16)
# Model Statistics
I <- nrow(data)
J <- length(unique(data$choice))
K <- length(coef(model))
W <- long_data$weight
Y <- long_data$bin
Z <- data %>%
summarise(
none = sum(none * weight, na.rm = TRUE) / sum(weight) * nrow(data),
instore = sum(instore * weight, na.rm = TRUE) / sum(weight) * nrow(data),
online = sum(online * weight, na.rm = TRUE) / sum(weight) * nrow(data),
both = sum(both * weight, na.rm = TRUE) / sum(weight) * nrow(data)
) %>%
pivot_longer(cols = everything(), names_to = "channel", values_to = "n") %>%
mutate(p = (n / sum(n)))
P <- Z$p[model_data$alt]
LL_EL <- sum(Y * log(1/J))
LL_MS <- sum(Y * log(P))
LL_MNL <- as.numeric(logLik(model))
R2_EL <- 1 - (LL_MNL / LL_EL)
R2_MS <- 1 - (LL_MNL / LL_MS)
AR2_EL <- 1 - ((LL_MNL - K) / LL_EL)
AR2_MS <- 1 - ((LL_MNL - K) / LL_MS)
cat("\n--- Log-likelihoods ---\n")
cat(sprintf("EL : %0.3f\n", LL_EL))
cat(sprintf("MS : %0.3f\n", LL_MS))
cat(sprintf("MNL : %0.3f\n", LL_MNL))
cat("\n--- McFadden R^2 ---\n")
cat(sprintf("R2 vs EL : %0.4f\n", R2_EL))
cat(sprintf("R2 vs MS : %0.4f\n", R2_MS))
cat("\n--- Adjusted McFadden R^2 ---\n")
cat(sprintf("Adj R2 vs EL : %0.4f (K = %d)\n", AR2_EL, K))
cat(sprintf("Adj R2 vs MS : %0.4f (K = %d)\n", AR2_MS, K))
--- Log-likelihoods ---
EL : -193370.042
MS : -98773.799
MNL : -92780.128
--- McFadden R^2 ---
R2 vs EL : 0.5202
R2 vs MS : 0.0607
--- Adjusted McFadden R^2 ---
Adj R2 vs EL : 0.5198 (K = 75)
Adj R2 vs MS : 0.0599 (K = 75)