How to fit discrete choice models, sometimes called conditional multinomial logistic regression models, are illustrated below. Special cases of these models include the baseline multinomial logistic regression model.
Two methods are illustrated, one of which uses glm and the other is mlogit. The latter has two depreciated functions (mlogit.data and mFormula), so code that previously worked will no longer work unless you use archived version of mlogit. However, the commands below use the non-depreciated functions and package that replace these. The mlogit.data is replace by dfidx package and mFormula is replaced by Formula.
Three data sets are used in this document:
The libraries needed
library(mlogit)
library(dfidx) # for indexing data (replaces mlogit.data)
library(Formula) # for createing formaula (resplace mFormula)
This is a hypothetical data set from SAS book on using logistic regression. A person chooses one of eight types of chocolates, Attributes of the chocolates are type of chocolate (milk/dark), whether is contain nuts (nuts/no nuts), and the texture (hard/soft).
choc <- read.table("chocolates_data.txt",header=TRUE)
head(choc, n=10)
## subj choice dark soft nuts
## 1 1 0 0 0 0
## 2 1 0 0 0 1
## 3 1 0 0 1 0
## 4 1 0 0 1 1
## 5 1 1 1 0 0
## 6 1 0 1 0 1
## 7 1 0 1 1 0
## 8 1 0 1 1 1
## 9 2 0 0 0 0
## 10 2 0 0 0 1
The model can be fit simply using
summary( glm.choc <- glm(choice ~ dark + soft + nuts, data=choc, family=poisson) )
##
## Call:
## glm(formula = choice ~ dark + soft + nuts, family = poisson,
## data = choc)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.0040 -0.5020 -0.3286 -0.1529 2.3468
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.9188 0.8628 -3.383 0.000717 ***
## dark 1.3863 0.7906 1.754 0.079509 .
## soft -2.1972 1.0541 -2.084 0.037115 *
## nuts 0.8473 0.6901 1.228 0.219502
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 41.589 on 79 degrees of freedom
## Residual deviance: 28.727 on 76 degrees of freedom
## AIC: 56.727
##
## Number of Fisher Scoring iterations: 6
We want the probabilities for each type
beta <- glm.choc$coefficients
#For the probabilities:
pi0 <- exp(beta[1]+ beta[2]*choc$dark[1:8] +beta[3]*choc$soft[1:8] + beta[4]*choc$nuts[1:8])
(pi.hat <- pi0/sum(pi0))
## [1] 0.054 0.126 0.006 0.014 0.216 0.504 0.024 0.056
# For easier of knowing what's what
probs <- cbind(choc[1:8,],pi.hat)
(pi.hat <- probs[order(probs$pi.hat),])
## subj choice dark soft nuts pi.hat
## 3 1 0 0 1 0 0.006
## 4 1 0 0 1 1 0.014
## 7 1 0 1 1 0 0.024
## 1 1 0 0 0 0 0.054
## 8 1 0 1 1 1 0.056
## 2 1 0 0 0 1 0.126
## 5 1 1 1 0 0 0.216
## 6 1 0 1 0 1 0.504
In this example, chooses have the same choice probabilities.
Need to create a logical variable where “TRUE” for chocolate picked “FALSE” otherwise
choc$pick <- choc$choice==1
Create strings for attributes of chocolates
choc$dark.s <- ifelse(choc$dark==1, "Dk","Mk")
choc$soft.s <- ifelse(choc$soft==1, "Sft","Hrd")
choc$nuts.s <- ifelse (choc$nuts==1, "Nut","Nnts")
Concatenate strings to create a variables for each attribute
choc$alt<- paste(choc$dark.s,choc$soft.s,choc$nuts.s,sep="_")
head(choc$alt, n=10)
## [1] "Mk_Hrd_Nnts" "Mk_Hrd_Nut" "Mk_Sft_Nnts" "Mk_Sft_Nut" "Dk_Hrd_Nnts"
## [6] "Dk_Hrd_Nut" "Dk_Sft_Nnts" "Dk_Sft_Nut" "Mk_Hrd_Nnts" "Mk_Hrd_Nut"
This is what replaced mlogit.data
c <- dfidx(choc, shape = "long", idx=list("subj", "alt"), choice="pick")
head(c, n=10)
## ~~~~~~~
## first 10 observations out of 80
## ~~~~~~~
## choice dark soft nuts pick dark.s soft.s nuts.s idx
## 1 1 1 0 0 TRUE Dk Hrd Nnts 1:Nnts
## 2 0 1 0 1 FALSE Dk Hrd Nut 1:_Nut
## 3 0 1 1 0 FALSE Dk Sft Nnts 1:Nnts
## 4 0 1 1 1 FALSE Dk Sft Nut 1:_Nut
## 5 0 0 0 0 FALSE Mk Hrd Nnts 1:Nnts
## 6 0 0 0 1 FALSE Mk Hrd Nut 1:_Nut
## 7 0 0 1 0 FALSE Mk Sft Nnts 1:Nnts
## 8 0 0 1 1 FALSE Mk Sft Nut 1:_Nut
## 9 0 1 0 0 FALSE Dk Hrd Nnts 2:Nnts
## 10 1 1 0 1 TRUE Dk Hrd Nut 2:_Nut
##
## ~~~ indexes ~~~~
## subj alt
## 1 1 Dk_Hrd_Nnts
## 2 1 Dk_Hrd_Nut
## 3 1 Dk_Sft_Nnts
## 4 1 Dk_Sft_Nut
## 5 1 Mk_Hrd_Nnts
## 6 1 Mk_Hrd_Nut
## 7 1 Mk_Sft_Nnts
## 8 1 Mk_Sft_Nut
## 9 2 Dk_Hrd_Nnts
## 10 2 Dk_Hrd_Nut
## indexes: 1, 2
The basic syntax of mlogit formulas is
(logical variable for choice ~ generic alternative specific | individual specific | alternative specific )
f <- Formula(pick ~ dark + soft + nuts | 0 | 0)
model <- mlogit(f,c)
summary(model)
##
## Call:
## mlogit(formula = pick ~ dark + soft + nuts | 0 | 0, data = c,
## method = "nr")
##
## Frequencies of alternatives:choice
## Dk_Hrd_Nnts Dk_Hrd_Nut Dk_Sft_Nnts Dk_Sft_Nut Mk_Hrd_Nnts Mk_Hrd_Nut
## 0.2 0.5 0.1 0.0 0.0 0.2
## Mk_Sft_Nnts Mk_Sft_Nut
## 0.0 0.0
##
## nr method
## 5 iterations, 0h:0m:0s
## g'(-H)^-1g = 1.98E-05
## successive function values within tolerance limits
##
## Coefficients :
## Estimate Std. Error z-value Pr(>|z|)
## dark 1.38629 0.79057 1.7535 0.07951 .
## soft -2.19722 1.05409 -2.0845 0.03712 *
## nuts 0.84730 0.69007 1.2279 0.21950
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Log-Likelihood: -14.363
What I am interested in are
probs <- model$probabilities[1,]
probs <- sort(probs)
as.matrix(probs,nrow=8,ncol=1)
## [,1]
## Mk_Sft_Nnts 0.006
## Mk_Sft_Nut 0.014
## Dk_Sft_Nnts 0.024
## Mk_Hrd_Nnts 0.054
## Dk_Sft_Nut 0.056
## Mk_Hrd_Nut 0.126
## Dk_Hrd_Nnts 0.216
## Dk_Hrd_Nut 0.504
Exactly the same as found using glm.
Objects in this example has different brands of the same product and different prices.
brands <- read.table("brands_price_data.txt",header=TRUE)
head(brands)
## combo brand price choice
## 1 1 1 5.99 12
## 2 1 2 5.99 0
## 3 1 3 5.99 0
## 4 1 4 5.99 0
## 5 1 5 4.99 0
## 6 1 1 5.99 0
Need to do a bit of data manipulation
brands$combo <- as.factor(brands$combo)
brands$brand <- as.factor(brands$brand)
I changed the reference category
brands <- within(brands, brand <- relevel(brand, ref = "5"))
head(brands)
## combo brand price choice
## 1 1 1 5.99 12
## 2 1 2 5.99 0
## 3 1 3 5.99 0
## 4 1 4 5.99 0
## 5 1 5 4.99 0
## 6 1 1 5.99 0
The variable combo is a nuisance variable that will be ignored but needed in the glm.
summary( mdc1 <- glm(choice ~ combo + brand + price, data=brands, family=poisson) )
##
## Call:
## glm(formula = choice ~ combo + brand + price, family = poisson,
## data = brands)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.938 -3.098 -2.336 -1.913 11.357
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.04227 0.29102 0.145 0.884525
## combo2 0.12461 0.14558 0.856 0.392029
## combo3 0.14800 0.14736 1.004 0.315217
## combo4 0.07990 0.14309 0.558 0.576554
## combo5 0.16654 0.14902 1.118 0.263749
## combo6 0.09722 0.14391 0.676 0.499315
## combo7 0.11997 0.14525 0.826 0.408825
## combo8 0.26164 0.16100 1.625 0.104145
## brand1 0.66727 0.12305 5.423 5.87e-08 ***
## brand2 0.38503 0.12962 2.970 0.002974 **
## brand3 -0.15954 0.14725 -1.084 0.278582
## brand4 0.98964 0.11720 8.444 < 2e-16 ***
## price 0.14966 0.04406 3.397 0.000682 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 2932.4 on 199 degrees of freedom
## Residual deviance: 2782.5 on 187 degrees of freedom
## AIC: 2991.3
##
## Number of Fisher Scoring iterations: 7
anova(mdc1,test="LRT")
## Analysis of Deviance Table
##
## Model: poisson, link: log
##
## Response: choice
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 199 2932.4
## combo 7 0.00 192 2932.4 1.0000000
## brand 4 138.25 188 2794.1 < 2.2e-16 ***
## price 1 11.64 187 2782.5 0.0006456 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary( mdc2 <- glm(choice ~ combo + brand + price + brand*price, data=brands, family=poisson) )
##
## Call:
## glm(formula = choice ~ combo + brand + price + brand * price,
## family = poisson, data = brands)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.936 -3.119 -2.331 -1.921 11.347
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.117592 0.423029 0.278 0.7810
## combo2 0.110910 0.150876 0.735 0.4623
## combo3 0.156987 0.152905 1.027 0.3046
## combo4 0.094163 0.146633 0.642 0.5208
## combo5 0.149419 0.154257 0.969 0.3327
## combo6 0.087054 0.147997 0.588 0.5564
## combo7 0.132019 0.149901 0.881 0.3785
## combo8 0.259622 0.161194 1.611 0.1073
## brand1 0.662822 0.573727 1.155 0.2480
## brand2 0.050244 0.615066 0.082 0.9349
## brand3 -0.139973 0.709326 -0.197 0.8436
## brand4 0.990327 0.117206 8.449 <2e-16 ***
## price 0.134778 0.075044 1.796 0.0725 .
## brand1:price 0.001092 0.111453 0.010 0.9922
## brand2:price 0.065965 0.118677 0.556 0.5783
## brand3:price -0.003517 0.137077 -0.026 0.9795
## brand4:price NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 2932.4 on 199 degrees of freedom
## Residual deviance: 2782.1 on 184 degrees of freedom
## AIC: 2996.9
##
## Number of Fisher Scoring iterations: 7
anova(mdc2,test="LRT")
## Analysis of Deviance Table
##
## Model: poisson, link: log
##
## Response: choice
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 199 2932.4
## combo 7 0.000 192 2932.4 1.0000000
## brand 4 138.247 188 2794.1 < 2.2e-16 ***
## price 1 11.640 187 2782.5 0.0006456 ***
## brand:price 3 0.402 184 2782.1 0.9397837
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Test between the two models
anova(mdc1,mdc2)
## Analysis of Deviance Table
##
## Model 1: choice ~ combo + brand + price
## Model 2: choice ~ combo + brand + price + brand * price
## Resid. Df Resid. Dev Df Deviance
## 1 187 2782.5
## 2 184 2782.1 3 0.40222
beta <- glm.choc$coefficients
#For the probabilities:
pi0 <- exp(beta[1]+ beta[2]*choc$dark[1:8] +beta[3]*choc$soft[1:8] + beta[4]*choc$nuts[1:8])
(pi.hat <- pi0/sum(pi0))
## [1] 0.054 0.126 0.006 0.014 0.216 0.504 0.024 0.056
# For easier of knowing what's what
probs <- cbind(choc[1:8,],pi.hat)
(pi.hat <- probs[order(probs$pi.hat),])
## subj choice dark soft nuts pick dark.s soft.s nuts.s alt pi.hat
## 3 1 0 0 1 0 FALSE Mk Sft Nnts Mk_Sft_Nnts 0.006
## 4 1 0 0 1 1 FALSE Mk Sft Nut Mk_Sft_Nut 0.014
## 7 1 0 1 1 0 FALSE Dk Sft Nnts Dk_Sft_Nnts 0.024
## 1 1 0 0 0 0 FALSE Mk Hrd Nnts Mk_Hrd_Nnts 0.054
## 8 1 0 1 1 1 FALSE Dk Sft Nut Dk_Sft_Nut 0.056
## 2 1 0 0 0 1 FALSE Mk Hrd Nut Mk_Hrd_Nut 0.126
## 5 1 1 1 0 0 TRUE Dk Hrd Nnts Dk_Hrd_Nnts 0.216
## 6 1 0 1 0 1 FALSE Dk Hrd Nut Dk_Hrd_Nut 0.504
Use the data that’s in individual (long) format
longer <- read.table("D:\\Dropbox\\edps 589\\8 Multicategory_logit\\brands_mdc_data.txt",header=TRUE)
Choice to be logical
longer$choice <- longer$Y==1
longer$brand <- "brand5"
longer$brand[longer$brand1==1] <- "brand1"
longer$brand[longer$brand2==1] <- "brand2"
longer$brand[longer$brand3==1] <- "brand3"
longer$brand[longer$brand4==1] <- "brand4"
Create alternatives’ descriptions in terms of price and brand
longer$alternative <- paste(longer$brand,longer$price,sep="_")
#Set up data for mlogit
long.m <- dfidx(longer, shape="long", idx=list("case","alternative"), choice="choice")
fm1 <- Formula(choice ~ brand1 + brand2 + brand3 + brand4 + price | 0 | 0 )
summary(model.1<- mlogit(fm1,long.m) )
##
## Call:
## mlogit(formula = choice ~ brand1 + brand2 + brand3 + brand4 +
## price | 0 | 0, data = long.m, method = "nr")
##
## Frequencies of alternatives:choice
## brand1_3.99 brand1_5.99 brand2_3.99 brand2_5.99 brand3_3.99 brand3_5.99
## 0.11000 0.13500 0.07750 0.10750 0.04750 0.06000
## brand4_3.99 brand4_5.99 brand5_4.99
## 0.15375 0.18375 0.12500
##
## nr method
## 4 iterations, 0h:0m:0s
## g'(-H)^-1g = 0.00096
## successive function values within tolerance limits
##
## Coefficients :
## Estimate Std. Error z-value Pr(>|z|)
## brand1 0.66727 0.12305 5.4228 5.868e-08 ***
## brand2 0.38503 0.12962 2.9704 0.0029737 **
## brand3 -0.15955 0.14725 -1.0835 0.2785739
## brand4 0.98964 0.11720 8.4439 < 2.2e-16 ***
## price 0.14966 0.04406 3.3967 0.0006819 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Log-Likelihood: -1212.6
Estimated odds ratios
(exp.beta <- exp(model.1$coefficients[1:5]) )
## brand1 brand2 brand3 brand4 price
## 1.9489041 1.4696599 0.8525302 2.6902574 1.1614421
### Model with Interaction between brand an price
fm2 <- Formula(choice ~ brand1 + brand2 + brand3 + brand4 + price + brand1*price + brand2*price + brand3+price + brand4*price | 0 | 0 )
summary(model.2<- mlogit(fm2,long.m) )
##
## Call:
## mlogit(formula = choice ~ brand1 + brand2 + brand3 + brand4 +
## price + brand1 * price + brand2 * price + brand3 + price +
## brand4 * price | 0 | 0, data = long.m, method = "nr")
##
## Frequencies of alternatives:choice
## brand1_3.99 brand1_5.99 brand2_3.99 brand2_5.99 brand3_3.99 brand3_5.99
## 0.11000 0.13500 0.07750 0.10750 0.04750 0.06000
## brand4_3.99 brand4_5.99 brand5_4.99
## 0.15375 0.18375 0.12500
##
## nr method
## 4 iterations, 0h:0m:0s
## g'(-H)^-1g = 0.00202
## successive function values within tolerance limits
##
## Coefficients :
## Estimate Std. Error z-value Pr(>|z|)
## brand1 0.6452583 0.7192037 0.8972 0.3696
## brand2 0.0326760 0.7526581 0.0434 0.9654
## brand3 -0.1575240 0.1476061 -1.0672 0.2859
## brand4 0.9727619 0.6957522 1.3981 0.1621
## price 0.1312580 0.1148715 1.1427 0.2532
## brand1:price 0.0046113 0.1413352 0.0326 0.9740
## brand2:price 0.0694847 0.1471121 0.4723 0.6367
## brand4:price 0.0035195 0.1370774 0.0257 0.9795
##
## Log-Likelihood: -1212.4
# Test interaction:
(lr <- 2*(as.numeric(model.2$logLik) - as.numeric(model.1$logLik)))
## [1] 0.4022192
1-pchisq(lr, 3)
## [1] 0.9397837
(beta <- model.1$coefficients)
## brand1 brand2 brand3 brand4 price
## 0.6672672 0.3850310 -0.1595466 0.9896369 0.1496624
## attr(,"names.sup.coef")
## character(0)
## attr(,"fixed")
## brand1 brand2 brand3 brand4 price
## FALSE FALSE FALSE FALSE FALSE
## attr(,"sup")
## character(0)
The data taken from Powers & xie and is a classic example of choice of modes of transportation. This example includes bother attributes of types of transportation as well as chooser’s attributes. The Variables used are:
Only mlogit is used in this example
trans <- read.table("transportation_data.txt",header=TRUE)
n <- length(unique(trans$id))
trans$alt <- rep(c("Bus","Train","Car"),n)
# set up data for mlogit
tm <- dfidx(trans, shape="long", idx=list("id","alt"), choice="mode")
Recall:
(logical variable for choice ~ generic alternative specific | individual specific | alternative specific )
Attributes of modes of transportation
fm.a <- Formula(mode ~ ttme + invc + invt + gc | 0 | 0)
# Fit the model
summary(model.a <- mlogit(fm.a,tm))
##
## Call:
## mlogit(formula = mode ~ ttme + invc + invt + gc | 0 | 0, data = tm,
## method = "nr")
##
## Frequencies of alternatives:choice
## Bus Car Train
## 0.41447 0.38816 0.19737
##
## nr method
## 6 iterations, 0h:0m:0s
## g'(-H)^-1g = 9.58E-05
## successive function values within tolerance limits
##
## Coefficients :
## Estimate Std. Error z-value Pr(>|z|)
## ttme -0.0022412 0.0071429 -0.3138 0.753696
## invc -0.4351106 0.1327788 -3.2770 0.001049 **
## invt -0.0772361 0.0193517 -3.9912 6.574e-05 ***
## gc 0.4311522 0.1331899 3.2371 0.001207 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Log-Likelihood: -96.349
This is reported in lecture notes:
fm.b <- Formula(mode ~ ttme + invc + invt + gc | hinc | 0 )
summary(model.b <- mlogit(fm.b,tm))
##
## Call:
## mlogit(formula = mode ~ ttme + invc + invt + gc | hinc | 0, data = tm,
## method = "nr")
##
## Frequencies of alternatives:choice
## Bus Car Train
## 0.41447 0.38816 0.19737
##
## nr method
## 6 iterations, 0h:0m:0s
## g'(-H)^-1g = 4.79E-07
## gradient close to zero
##
## Coefficients :
## Estimate Std. Error z-value Pr(>|z|)
## (Intercept):Car -6.101273 1.032773 -5.9077 3.470e-09 ***
## (Intercept):Train -2.100773 0.737429 -2.8488 0.0043887 **
## ttme -0.073743 0.016973 -4.3447 1.395e-05 ***
## invc -0.608922 0.154700 -3.9362 8.280e-05 ***
## invt -0.094937 0.022480 -4.2232 2.408e-05 ***
## gc 0.571263 0.151962 3.7592 0.0001704 ***
## hinc:Car 0.047658 0.017891 2.6638 0.0077267 **
## hinc:Train 0.030172 0.021380 1.4112 0.1581790
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Log-Likelihood: -62.658
## McFadden R^2: 0.60839
## Likelihood ratio test : chisq = 194.69 (p.value = < 2.22e-16)
knitr::purl(“D:/Dropbox/edps 589/8 Multicategory_logit/R_discrete_choice_models.Rmd”, output = “D:/Dropbox/edps 589/8 Multicategory_logit/R_discrete_choice_models.R”, documentation=2)