Lecture 23: Logistic Regression in R

Lecture 23: Logistic Regression in R#

# Load necessary libraries
library(tidyr)
library(dplyr)
library(mlogit)
library(ggplot2)
options(repr.plot.width = 12, repr.plot.height = 8)

# 2024 ITUS Individual Data (model)
url  <- "https://raw.githubusercontent.com/anmpahwa/CE5540/refs/heads/main/resources/ITUS_HHD_BD.csv"
data <- read.csv(url) # Loading Data
str(data)

'data.frame':	139487 obs. of  45 variables:
 $ Unique_HH_ID     : chr  "2024-30010-1-241-17-13-11-2-2420-4-1" "2024-30010-1-241-17-13-11-2-2420-4-2" "2024-30010-1-241-17-13-11-2-2420-4-3" "2024-30010-1-241-17-13-11-2-2420-4-4" ...
 $ q1               : int  0 0 0 0 0 0 0 0 0 0 ...
 $ q2               : int  1 1 1 1 1 1 1 1 1 1 ...
 $ q3               : int  0 0 0 0 0 0 0 0 0 0 ...
 $ q4               : int  0 0 0 0 0 0 0 0 0 0 ...
 $ weekday          : int  1 1 0 0 1 0 1 1 1 1 ...
 $ weekend          : int  0 0 1 1 0 1 0 0 0 0 ...
 $ rural            : int  1 1 1 1 1 1 1 1 1 1 ...
 $ urban            : int  0 0 0 0 0 0 0 0 0 0 ...
 $ north            : int  0 0 0 0 0 0 0 0 0 0 ...
 $ west             : int  1 1 1 1 1 1 1 1 1 1 ...
 $ central          : int  0 0 0 0 0 0 0 0 0 0 ...
 $ east             : int  0 0 0 0 0 0 0 0 0 0 ...
 $ north.east       : int  0 0 0 0 0 0 0 0 0 0 ...
 $ south            : int  0 0 0 0 0 0 0 0 0 0 ...
 $ tierI            : int  0 0 0 0 0 0 0 0 0 0 ...
 $ tierII           : int  1 1 1 1 1 1 1 1 1 1 ...
 $ tierIII          : int  0 0 0 0 0 0 0 0 0 0 ...
 $ gender_ratio     : num  0 0.5 0.4 0.556 0.5 0.5 0.5 0.429 0.5 0.333 ...
 $ average_age      : num  45 34.5 29.2 15 21.8 65.5 20.2 26 62.5 19.3 ...
 $ not_married      : int  1 0 0 0 0 0 0 0 0 0 ...
 $ married          : int  0 1 1 1 1 1 1 1 1 1 ...
 $ not_literate     : int  0 0 0 0 0 0 0 0 0 0 ...
 $ primary          : int  0 0 0 1 1 0 1 0 1 0 ...
 $ secondary        : int  0 0 1 0 0 0 0 0 0 1 ...
 $ graduate_._above : int  1 1 0 0 0 1 0 1 0 0 ...
 $ employment_ratio : num  1 0.25 0.4 0.222 0.333 0 0.333 0.286 0.5 0.333 ...
 $ family_structure : num  0 0.25 0.4 0.778 0.5 0 0.667 0.286 0 0.333 ...
 $ household_size   : int  1 4 5 9 6 2 6 7 2 3 ...
 $ disadvantaged    : int  1 1 1 1 1 1 1 1 1 1 ...
 $ not_disadvantaged: int  0 0 0 0 0 0 0 0 0 0 ...
 $ no_land          : int  0 0 0 0 0 0 0 0 0 0 ...
 $ land_possessed   : int  1 1 1 1 1 1 1 1 1 1 ...
 $ low_income       : int  1 0 0 0 0 0 0 0 1 1 ...
 $ medium_income    : int  0 0 1 1 1 0 1 1 0 0 ...
 $ high_income      : int  0 1 0 0 0 1 0 0 0 0 ...
 $ no_dwelling      : int  0 0 0 0 0 0 0 0 0 0 ...
 $ kutcha           : int  0 0 0 0 0 0 0 0 0 0 ...
 $ pucca            : int  1 1 1 1 1 1 1 1 1 1 ...
 $ none             : int  1 1 1 1 1 1 1 1 1 1 ...
 $ instore          : int  0 0 0 0 0 0 0 0 0 0 ...
 $ online           : int  0 0 0 0 0 0 0 0 0 0 ...
 $ both             : int  0 0 0 0 0 0 0 0 0 0 ...
 $ choice           : chr  "none" "none" "none" "none" ...
 $ weight           : int  208857 208857 208857 208857 208857 208857 208857 208857 208857 208857 ...

# Counts
counts <- data %>%
  summarise(
    None = sum(none, na.rm = TRUE),
    InStore = sum(instore, na.rm = TRUE),
    Online = sum(online, na.rm = TRUE),
    Both = sum(both, na.rm = TRUE)
  ) %>%
  pivot_longer(cols = everything(), names_to = "channel", values_to = "count") %>%
  mutate(share = (count / sum(count)))
print(counts)

# A tibble: 4 × 3
  channel  count   share
  <chr>    <int>   <dbl>
1 None    100417 0.720  
2 InStore  34717 0.249  
3 Online    3318 0.0238 
4 Both      1035 0.00742

# Temporal: Quarter Counts
quarter_counts <- data %>%
  summarise(
    None_Q1 = sum(none * q1, na.rm = TRUE),
    InStore_Q1 = sum(instore * q1, na.rm = TRUE),
    Online_Q1 = sum(online * q1, na.rm = TRUE),
    Both_Q1 = sum(both * q1, na.rm = TRUE),

    None_Q2 = sum(none * q2, na.rm = TRUE),
    InStore_Q2 = sum(instore * q2, na.rm = TRUE),
    Online_Q2 = sum(online * q2, na.rm = TRUE),
    Both_Q2 = sum(both * q2, na.rm = TRUE),

    None_Q3 = sum(none * q3, na.rm = TRUE),
    InStore_Q3 = sum(instore * q3, na.rm = TRUE),
    Online_Q3 = sum(online * q3, na.rm = TRUE),
    Both_Q3 = sum(both * q3, na.rm = TRUE),

    None_Q4 = sum(none * q4, na.rm = TRUE),
    InStore_Q4 = sum(instore * q4, na.rm = TRUE),
    Online_Q4 = sum(online * q4, na.rm = TRUE),
    Both_Q4 = sum(both * q4, na.rm = TRUE)
  ) %>%
  pivot_longer(cols = everything(), names_to = "channel_quarter", values_to = "count") %>%
  separate(channel_quarter, into = c("channel", "quarter"), sep = "_") %>%
  group_by(quarter) %>%
  mutate(share = (count / sum(count))) %>%
  ungroup() %>%
  mutate(channel = factor(channel, levels = c("None", "InStore", "Online", "Both")))

# Tabulate
table <- quarter_counts %>%
  ungroup() %>%
  select(channel, quarter, share) %>%
  pivot_wider(names_from = quarter, values_from = share)
print(table)

# A tibble: 4 × 5
  channel      Q1      Q2      Q3      Q4
  <fct>     <dbl>   <dbl>   <dbl>   <dbl>
1 None    0.735   0.718   0.711   0.715  
2 InStore 0.221   0.245   0.265   0.265  
3 Online  0.0352  0.0280  0.0176  0.0144 
4 Both    0.00870 0.00901 0.00640 0.00558

# Spatial: Sector Counts
sector_counts <- data %>%
  summarise(
    None_R = sum(none * rural, na.rm = TRUE),
    InStore_R = sum(instore * rural, na.rm = TRUE),
    Online_R = sum(online * rural, na.rm = TRUE),
    Both_R = sum(both * rural, na.rm = TRUE),

    None_U = sum(none * urban, na.rm = TRUE),
    InStore_U = sum(instore * urban, na.rm = TRUE),
    Online_U = sum(online * urban, na.rm = TRUE),
    Both_U = sum(both * urban, na.rm = TRUE)
  ) %>%
  pivot_longer(cols = everything(), names_to = "channel_sector", values_to = "count") %>%
  separate(channel_sector, into = c("channel", "sector"), sep = "_") %>%
  group_by(sector) %>%
  mutate(share = (count / sum(count))) %>%
  ungroup() %>%
  mutate(channel = factor(channel, levels = c("None", "InStore", "Online", "Both")))

# Tabulate
table <- sector_counts %>%
  ungroup() %>%
  select(channel, sector, share) %>%
  pivot_wider(names_from = sector, values_from = share)
print(table)

# A tibble: 4 × 3
  channel       R      U
  <fct>     <dbl>  <dbl>
1 None    0.742   0.687 
2 InStore 0.233   0.273 
3 Online  0.0195  0.0301
4 Both    0.00532 0.0105

# Prepare dataset
long_data <- data %>%
  select(
    Unique_HH_ID,
    q1, q2, q3, q4,
    weekday, weekend,
    urban, rural, 
    north, west, south, east, central, north.east,
    tierI, tierII, tierIII,
    gender_ratio,
    average_age,
    not_married, married,
    not_literate, primary, secondary, graduate_._above,
    employment_ratio,
    family_structure, 
    household_size,
    disadvantaged, not_disadvantaged,
    no_land, land_possessed,
    low_income, medium_income, high_income,
    no_dwelling, kutcha, pucca,
    none, instore, online, both,
    weight
  ) %>%
  pivot_longer(
    cols = c(none, instore, online, both),
    names_to = "alt", values_to = "bin"
  ) %>%
  mutate(bin = bin == 1)

model_data <- mlogit.data(
  long_data,
  choice = "bin",
  shape = "long",
  chid.var = "Unique_HH_ID",
  alt.var = "alt"
)

# Esitmated Model
model <- mlogit(
    formula = bin ~ 1 | q1 + q2 + q3 +
                        weekend +
                        urban +
                        north + west + south + east + north.east +
                        tierI + tierII +
                        gender_ratio + 
                        average_age +
                        married +
                        primary + secondary + graduate_._above +
                        employment_ratio + 
                        family_structure + household_size +
                        disadvantaged +
                        medium_income + high_income,
    data = model_data,
    ref = "none",
    weights = model_data$weight
    )
summary(model)

Call:
mlogit(formula = bin ~ 1 | q1 + q2 + q3 + weekend + urban + north + 
    west + south + east + north.east + tierI + tierII + gender_ratio + 
    average_age + married + primary + secondary + graduate_._above + 
    employment_ratio + family_structure + household_size + disadvantaged + 
    medium_income + high_income, data = model_data, weights = model_data$weight, 
    reflevel = "none", method = "nr")

Frequencies of alternatives:choice
    none     both  instore   online 
0.719902 0.007420 0.248891 0.023787 

nr method
39 iterations, 0h:2m:12s 
g'(-H)^-1g = 6.71E-07 
gradient close to zero 

Coefficients :
                            Estimate  Std. Error  z-value  Pr(>|z|)    
(Intercept):both         -7.50010194  0.40566244 -18.4885 < 2.2e-16 ***
(Intercept):instore      -2.44871154  0.06557938 -37.3397 < 2.2e-16 ***
(Intercept):online       -4.62260233  0.18586035 -24.8714 < 2.2e-16 ***
q1:both                   0.27248423  0.09340207   2.9173 0.0035305 ** 
q1:instore               -0.26262944  0.01824741 -14.3927 < 2.2e-16 ***
q1:online                 0.82530056  0.05361621  15.3927 < 2.2e-16 ***
q2:both                   0.38607963  0.09233283   4.1814 2.897e-05 ***
q2:instore               -0.10903952  0.01785131  -6.1082 1.008e-09 ***
q2:online                 0.64192999  0.05537164  11.5931 < 2.2e-16 ***
q3:both                   0.08402648  0.09890217   0.8496 0.3955521    
q3:instore               -0.00061729  0.01757590  -0.0351 0.9719832    
q3:online                 0.13720431  0.06124325   2.2403 0.0250703 *  
weekend:both              0.32385880  0.06735832   4.8080 1.524e-06 ***
weekend:instore           0.09871909  0.01402290   7.0398 1.924e-12 ***
weekend:online            0.11430230  0.03878328   2.9472 0.0032066 ** 
urban:both                0.61346000  0.07081621   8.6627 < 2.2e-16 ***
urban:instore             0.26287820  0.01439634  18.2601 < 2.2e-16 ***
urban:online              0.53330188  0.03967068  13.4432 < 2.2e-16 ***
north:both                0.37967249  0.12151451   3.1245 0.0017811 ** 
north:instore             0.13740852  0.02836522   4.8443 1.271e-06 ***
north:online             -0.04910374  0.07312652  -0.6715 0.5019083    
west:both                -0.46043596  0.11209716  -4.1075 4.000e-05 ***
west:instore             -0.28636864  0.02286011 -12.5270 < 2.2e-16 ***
west:online              -0.40476136  0.05674728  -7.1327 9.841e-13 ***
south:both               -0.08761718  0.10943684  -0.8006 0.4233525    
south:instore             0.26198411  0.02167232  12.0884 < 2.2e-16 ***
south:online             -0.96127596  0.07048298 -13.6384 < 2.2e-16 ***
east:both                 0.54895109  0.10110520   5.4295 5.651e-08 ***
east:instore              0.76183793  0.02051790  37.1304 < 2.2e-16 ***
east:online               0.48005484  0.05040068   9.5248 < 2.2e-16 ***
north.east:both          -0.19979986  0.14519082  -1.3761 0.1687848    
north.east:instore        0.58275749  0.02650438  21.9872 < 2.2e-16 ***
north.east:online        -0.09881252  0.07850447  -1.2587 0.2081436    
tierI:both                0.02482473  0.09190630   0.2701 0.7870763    
tierI:instore            -0.11747671  0.01890382  -6.2144 5.151e-10 ***
tierI:online             -0.29655713  0.05363646  -5.5290 3.220e-08 ***
tierII:both              -0.31198596  0.08331729  -3.7446 0.0001807 ***
tierII:instore           -0.18795556  0.01646939 -11.4124 < 2.2e-16 ***
tierII:online            -0.25949984  0.04404134  -5.8922 3.811e-09 ***
gender_ratio:both         0.17345435  0.17556556   0.9880 0.3231650    
gender_ratio:instore     -0.08092477  0.03178969  -2.5456 0.0109081 *  
gender_ratio:online      -0.11939769  0.09190333  -1.2992 0.1938869    
average_age:both          0.00416919  0.00379901   1.0974 0.2724485    
average_age:instore       0.00245859  0.00071798   3.4243 0.0006164 ***
average_age:online       -0.00040112  0.00202380  -0.1982 0.8428865    
married:both              0.49919548  0.13375063   3.7323 0.0001898 ***
married:instore           0.51545379  0.02297252  22.4378 < 2.2e-16 ***
married:online            0.46806199  0.06808541   6.8746 6.215e-12 ***
primary:both              0.55330411  0.29164178   1.8972 0.0578009 .  
primary:instore           0.41688167  0.03753774  11.1057 < 2.2e-16 ***
primary:online            0.19092222  0.11192127   1.7059 0.0880338 .  
secondary:both            0.95235315  0.28746412   3.3129 0.0009232 ***
secondary:instore         0.39958238  0.03771279  10.5954 < 2.2e-16 ***
secondary:online          0.31308608  0.11116137   2.8165 0.0048550 ** 
graduate_._above:both     1.08344586  0.28993670   3.7368 0.0001864 ***
graduate_._above:instore  0.44132739  0.03892622  11.3375 < 2.2e-16 ***
graduate_._above:online   0.36924450  0.11372743   3.2467 0.0011673 ** 
employment_ratio:both    -0.35945562  0.15265827  -2.3546 0.0185405 *  
employment_ratio:instore -0.20944491  0.02827095  -7.4085 1.277e-13 ***
employment_ratio:online  -0.64836950  0.08304906  -7.8071 5.773e-15 ***
family_structure:both    -1.57971163  0.24653151  -6.4077 1.477e-10 ***
family_structure:instore -0.97345855  0.04749003 -20.4982 < 2.2e-16 ***
family_structure:online  -1.12979774  0.13142917  -8.5962 < 2.2e-16 ***
household_size:both       0.22466450  0.01773154  12.6703 < 2.2e-16 ***
household_size:instore    0.11700111  0.00458290  25.5299 < 2.2e-16 ***
household_size:online     0.12111446  0.01148006  10.5500 < 2.2e-16 ***
disadvantaged:both        0.03097353  0.07273313   0.4259 0.6702158    
disadvantaged:instore     0.05510470  0.01520849   3.6233 0.0002909 ***
disadvantaged:online      0.06328005  0.04106580   1.5409 0.1233308    
medium_income:both        0.38778625  0.10306170   3.7627 0.0001681 ***
medium_income:instore     0.18555088  0.01724786  10.7579 < 2.2e-16 ***
medium_income:online      0.21740527  0.04948799   4.3931 1.118e-05 ***
high_income:both          0.69838947  0.10638131   6.5650 5.205e-11 ***
high_income:instore       0.29568510  0.01940481  15.2377 < 2.2e-16 ***
high_income:online        0.35946690  0.05480154   6.5594 5.401e-11 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Log-Likelihood: -92780
McFadden R^2:  0.060581 
Likelihood ratio test : chisq = 11966 (p.value = < 2.22e-16)

# Model Statistics
I      <- nrow(data)
J      <- length(unique(data$choice))
K      <- length(coef(model))
W      <- long_data$weight
Y      <- long_data$bin
Z      <- data %>%
            summarise(
              none = sum(none * weight, na.rm = TRUE) / sum(weight) * nrow(data),
              instore = sum(instore * weight, na.rm = TRUE) / sum(weight) * nrow(data),
              online = sum(online * weight, na.rm = TRUE) / sum(weight) * nrow(data),
              both = sum(both * weight, na.rm = TRUE) / sum(weight) * nrow(data)
            ) %>%
            pivot_longer(cols = everything(), names_to = "channel", values_to = "n") %>%
            mutate(p = (n / sum(n)))
P      <- Z$p[model_data$alt] 
LL_EL  <- sum(Y * log(1/J))
LL_MS  <- sum(Y * log(P))
LL_MNL <- as.numeric(logLik(model))
R2_EL  <- 1 - (LL_MNL / LL_EL)
R2_MS  <- 1 - (LL_MNL / LL_MS)
AR2_EL <- 1 - ((LL_MNL - K) / LL_EL)
AR2_MS <- 1 - ((LL_MNL - K) / LL_MS)

cat("\n--- Log-likelihoods ---\n")
cat(sprintf("EL  : %0.3f\n", LL_EL))
cat(sprintf("MS  : %0.3f\n", LL_MS))
cat(sprintf("MNL : %0.3f\n", LL_MNL))
cat("\n--- McFadden R^2 ---\n")
cat(sprintf("R2 vs EL : %0.4f\n", R2_EL))
cat(sprintf("R2 vs MS : %0.4f\n", R2_MS))
cat("\n--- Adjusted McFadden R^2 ---\n")
cat(sprintf("Adj R2 vs EL : %0.4f (K = %d)\n", AR2_EL, K))
cat(sprintf("Adj R2 vs MS : %0.4f (K = %d)\n", AR2_MS, K))

--- Log-likelihoods ---
EL  : -193370.042
MS  : -98773.799
MNL : -92780.128

--- McFadden R^2 ---
R2 vs EL : 0.5202
R2 vs MS : 0.0607

--- Adjusted McFadden R^2 ---
Adj R2 vs EL : 0.5198 (K = 75)
Adj R2 vs MS : 0.0599 (K = 75)