How to fit discrete choice models, sometimes called conditional multinomial logistic regression models, are illustrated below. Special cases of these models include the baseline multinomial logistic regression model.

Two methods are illustrated, one of which uses glm and the other is mlogit. The latter has two depreciated functions (mlogit.data and mFormula), so code that previously worked will no longer work unless you use archived version of mlogit. However, the commands below use the non-depreciated functions and package that replace these. The mlogit.data is replace by dfidx package and mFormula is replaced by Formula.

Three data sets are used in this document:

  1. Chocolates (from SAS)
  2. Brands and prices (from SAS)
  3. Modes of Transportion (from Powers & Xie)

Set up

The libraries needed

library(mlogit)
library(dfidx)    # for indexing data (replaces mlogit.data)
library(Formula)  # for createing formaula (resplace mFormula)

Chocolate Data Set

This is a hypothetical data set from SAS book on using logistic regression. A person chooses one of eight types of chocolates, Attributes of the chocolates are type of chocolate (milk/dark), whether is contain nuts (nuts/no nuts), and the texture (hard/soft).

choc <- read.table("chocolates_data.txt",header=TRUE)
head(choc, n=10)
##    subj choice dark soft nuts
## 1     1      0    0    0    0
## 2     1      0    0    0    1
## 3     1      0    0    1    0
## 4     1      0    0    1    1
## 5     1      1    1    0    0
## 6     1      0    1    0    1
## 7     1      0    1    1    0
## 8     1      0    1    1    1
## 9     2      0    0    0    0
## 10    2      0    0    0    1

a. glm (AS a Poisson regression)

The model can be fit simply using

summary( glm.choc <- glm(choice ~ dark + soft + nuts, data=choc, family=poisson) )
## 
## Call:
## glm(formula = choice ~ dark + soft + nuts, family = poisson, 
##     data = choc)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0040  -0.5020  -0.3286  -0.1529   2.3468  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -2.9188     0.8628  -3.383 0.000717 ***
## dark          1.3863     0.7906   1.754 0.079509 .  
## soft         -2.1972     1.0541  -2.084 0.037115 *  
## nuts          0.8473     0.6901   1.228 0.219502    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 41.589  on 79  degrees of freedom
## Residual deviance: 28.727  on 76  degrees of freedom
## AIC: 56.727
## 
## Number of Fisher Scoring iterations: 6

We want the probabilities for each type

beta <- glm.choc$coefficients

#For the probabilities:      
pi0 <- exp(beta[1]+ beta[2]*choc$dark[1:8] +beta[3]*choc$soft[1:8] + beta[4]*choc$nuts[1:8])
(pi.hat <- pi0/sum(pi0))
## [1] 0.054 0.126 0.006 0.014 0.216 0.504 0.024 0.056
# For easier of knowing what's what
probs <- cbind(choc[1:8,],pi.hat)
(pi.hat <- probs[order(probs$pi.hat),])
##   subj choice dark soft nuts pi.hat
## 3    1      0    0    1    0  0.006
## 4    1      0    0    1    1  0.014
## 7    1      0    1    1    0  0.024
## 1    1      0    0    0    0  0.054
## 8    1      0    1    1    1  0.056
## 2    1      0    0    0    1  0.126
## 5    1      1    1    0    0  0.216
## 6    1      0    1    0    1  0.504

In this example, chooses have the same choice probabilities.

b. mlogit

Need to create a logical variable where “TRUE” for chocolate picked “FALSE” otherwise

choc$pick <-  choc$choice==1

Create strings for attributes of chocolates

choc$dark.s <- ifelse(choc$dark==1, "Dk","Mk")
choc$soft.s <- ifelse(choc$soft==1, "Sft","Hrd")
choc$nuts.s <- ifelse (choc$nuts==1, "Nut","Nnts")

Concatenate strings to create a variables for each attribute

choc$alt<- paste(choc$dark.s,choc$soft.s,choc$nuts.s,sep="_")
head(choc$alt, n=10)
##  [1] "Mk_Hrd_Nnts" "Mk_Hrd_Nut"  "Mk_Sft_Nnts" "Mk_Sft_Nut"  "Dk_Hrd_Nnts"
##  [6] "Dk_Hrd_Nut"  "Dk_Sft_Nnts" "Dk_Sft_Nut"  "Mk_Hrd_Nnts" "Mk_Hrd_Nut"

This is what replaced mlogit.data

c <- dfidx(choc, shape = "long", idx=list("subj", "alt"), choice="pick")
head(c, n=10)
## ~~~~~~~
##  first 10 observations out of 80 
## ~~~~~~~
##    choice dark soft nuts  pick dark.s soft.s nuts.s    idx
## 1       1    1    0    0  TRUE     Dk    Hrd   Nnts 1:Nnts
## 2       0    1    0    1 FALSE     Dk    Hrd    Nut 1:_Nut
## 3       0    1    1    0 FALSE     Dk    Sft   Nnts 1:Nnts
## 4       0    1    1    1 FALSE     Dk    Sft    Nut 1:_Nut
## 5       0    0    0    0 FALSE     Mk    Hrd   Nnts 1:Nnts
## 6       0    0    0    1 FALSE     Mk    Hrd    Nut 1:_Nut
## 7       0    0    1    0 FALSE     Mk    Sft   Nnts 1:Nnts
## 8       0    0    1    1 FALSE     Mk    Sft    Nut 1:_Nut
## 9       0    1    0    0 FALSE     Dk    Hrd   Nnts 2:Nnts
## 10      1    1    0    1  TRUE     Dk    Hrd    Nut 2:_Nut
## 
## ~~~ indexes ~~~~
##    subj         alt
## 1     1 Dk_Hrd_Nnts
## 2     1  Dk_Hrd_Nut
## 3     1 Dk_Sft_Nnts
## 4     1  Dk_Sft_Nut
## 5     1 Mk_Hrd_Nnts
## 6     1  Mk_Hrd_Nut
## 7     1 Mk_Sft_Nnts
## 8     1  Mk_Sft_Nut
## 9     2 Dk_Hrd_Nnts
## 10    2  Dk_Hrd_Nut
## indexes:  1, 2

The basic syntax of mlogit formulas is

(logical variable for choice ~ generic alternative specific | individual specific | alternative specific )

f <- Formula(pick ~ dark + soft + nuts | 0 | 0) 

model <- mlogit(f,c)
summary(model)
## 
## Call:
## mlogit(formula = pick ~ dark + soft + nuts | 0 | 0, data = c, 
##     method = "nr")
## 
## Frequencies of alternatives:choice
## Dk_Hrd_Nnts  Dk_Hrd_Nut Dk_Sft_Nnts  Dk_Sft_Nut Mk_Hrd_Nnts  Mk_Hrd_Nut 
##         0.2         0.5         0.1         0.0         0.0         0.2 
## Mk_Sft_Nnts  Mk_Sft_Nut 
##         0.0         0.0 
## 
## nr method
## 5 iterations, 0h:0m:0s 
## g'(-H)^-1g = 1.98E-05 
## successive function values within tolerance limits 
## 
## Coefficients :
##      Estimate Std. Error z-value Pr(>|z|)  
## dark  1.38629    0.79057  1.7535  0.07951 .
## soft -2.19722    1.05409 -2.0845  0.03712 *
## nuts  0.84730    0.69007  1.2279  0.21950  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Log-Likelihood: -14.363

What I am interested in are

probs <- model$probabilities[1,]
probs <- sort(probs)

as.matrix(probs,nrow=8,ncol=1)
##              [,1]
## Mk_Sft_Nnts 0.006
## Mk_Sft_Nut  0.014
## Dk_Sft_Nnts 0.024
## Mk_Hrd_Nnts 0.054
## Dk_Sft_Nut  0.056
## Mk_Hrd_Nut  0.126
## Dk_Hrd_Nnts 0.216
## Dk_Hrd_Nut  0.504

Exactly the same as found using glm.

Brands and Prices

Objects in this example has different brands of the same product and different prices.

brands <- read.table("brands_price_data.txt",header=TRUE)
head(brands)
##   combo brand price choice
## 1     1     1  5.99     12
## 2     1     2  5.99      0
## 3     1     3  5.99      0
## 4     1     4  5.99      0
## 5     1     5  4.99      0
## 6     1     1  5.99      0

a. glm

Need to do a bit of data manipulation

brands$combo <- as.factor(brands$combo)
brands$brand <- as.factor(brands$brand)

I changed the reference category

brands <- within(brands, brand <- relevel(brand, ref = "5"))
head(brands)
##   combo brand price choice
## 1     1     1  5.99     12
## 2     1     2  5.99      0
## 3     1     3  5.99      0
## 4     1     4  5.99      0
## 5     1     5  4.99      0
## 6     1     1  5.99      0

The variable combo is a nuisance variable that will be ignored but needed in the glm.

Simple Model: Additive Effects

summary( mdc1 <- glm(choice ~ combo + brand + price, data=brands, family=poisson) )
## 
## Call:
## glm(formula = choice ~ combo + brand + price, family = poisson, 
##     data = brands)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -3.938  -3.098  -2.336  -1.913  11.357  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.04227    0.29102   0.145 0.884525    
## combo2       0.12461    0.14558   0.856 0.392029    
## combo3       0.14800    0.14736   1.004 0.315217    
## combo4       0.07990    0.14309   0.558 0.576554    
## combo5       0.16654    0.14902   1.118 0.263749    
## combo6       0.09722    0.14391   0.676 0.499315    
## combo7       0.11997    0.14525   0.826 0.408825    
## combo8       0.26164    0.16100   1.625 0.104145    
## brand1       0.66727    0.12305   5.423 5.87e-08 ***
## brand2       0.38503    0.12962   2.970 0.002974 ** 
## brand3      -0.15954    0.14725  -1.084 0.278582    
## brand4       0.98964    0.11720   8.444  < 2e-16 ***
## price        0.14966    0.04406   3.397 0.000682 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 2932.4  on 199  degrees of freedom
## Residual deviance: 2782.5  on 187  degrees of freedom
## AIC: 2991.3
## 
## Number of Fisher Scoring iterations: 7
anova(mdc1,test="LRT")
## Analysis of Deviance Table
## 
## Model: poisson, link: log
## 
## Response: choice
## 
## Terms added sequentially (first to last)
## 
## 
##       Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                    199     2932.4              
## combo  7     0.00       192     2932.4 1.0000000    
## brand  4   138.25       188     2794.1 < 2.2e-16 ***
## price  1    11.64       187     2782.5 0.0006456 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Interaction between Brand and Price

summary( mdc2 <- glm(choice ~ combo + brand + price + brand*price, data=brands, family=poisson) )
## 
## Call:
## glm(formula = choice ~ combo + brand + price + brand * price, 
##     family = poisson, data = brands)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -3.936  -3.119  -2.331  -1.921  11.347  
## 
## Coefficients: (1 not defined because of singularities)
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   0.117592   0.423029   0.278   0.7810    
## combo2        0.110910   0.150876   0.735   0.4623    
## combo3        0.156987   0.152905   1.027   0.3046    
## combo4        0.094163   0.146633   0.642   0.5208    
## combo5        0.149419   0.154257   0.969   0.3327    
## combo6        0.087054   0.147997   0.588   0.5564    
## combo7        0.132019   0.149901   0.881   0.3785    
## combo8        0.259622   0.161194   1.611   0.1073    
## brand1        0.662822   0.573727   1.155   0.2480    
## brand2        0.050244   0.615066   0.082   0.9349    
## brand3       -0.139973   0.709326  -0.197   0.8436    
## brand4        0.990327   0.117206   8.449   <2e-16 ***
## price         0.134778   0.075044   1.796   0.0725 .  
## brand1:price  0.001092   0.111453   0.010   0.9922    
## brand2:price  0.065965   0.118677   0.556   0.5783    
## brand3:price -0.003517   0.137077  -0.026   0.9795    
## brand4:price        NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 2932.4  on 199  degrees of freedom
## Residual deviance: 2782.1  on 184  degrees of freedom
## AIC: 2996.9
## 
## Number of Fisher Scoring iterations: 7
anova(mdc2,test="LRT")
## Analysis of Deviance Table
## 
## Model: poisson, link: log
## 
## Response: choice
## 
## Terms added sequentially (first to last)
## 
## 
##             Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                          199     2932.4              
## combo        7    0.000       192     2932.4 1.0000000    
## brand        4  138.247       188     2794.1 < 2.2e-16 ***
## price        1   11.640       187     2782.5 0.0006456 ***
## brand:price  3    0.402       184     2782.1 0.9397837    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Test between the two models
anova(mdc1,mdc2)
## Analysis of Deviance Table
## 
## Model 1: choice ~ combo + brand + price
## Model 2: choice ~ combo + brand + price + brand * price
##   Resid. Df Resid. Dev Df Deviance
## 1       187     2782.5            
## 2       184     2782.1  3  0.40222

The probabilities from final model (simple)

beta <- glm.choc$coefficients
#For the probabilities:      
pi0 <- exp(beta[1]+ beta[2]*choc$dark[1:8] +beta[3]*choc$soft[1:8] + beta[4]*choc$nuts[1:8])
(pi.hat <- pi0/sum(pi0))
## [1] 0.054 0.126 0.006 0.014 0.216 0.504 0.024 0.056
# For easier of knowing what's what
probs <- cbind(choc[1:8,],pi.hat)
(pi.hat <- probs[order(probs$pi.hat),])
##   subj choice dark soft nuts  pick dark.s soft.s nuts.s         alt pi.hat
## 3    1      0    0    1    0 FALSE     Mk    Sft   Nnts Mk_Sft_Nnts  0.006
## 4    1      0    0    1    1 FALSE     Mk    Sft    Nut  Mk_Sft_Nut  0.014
## 7    1      0    1    1    0 FALSE     Dk    Sft   Nnts Dk_Sft_Nnts  0.024
## 1    1      0    0    0    0 FALSE     Mk    Hrd   Nnts Mk_Hrd_Nnts  0.054
## 8    1      0    1    1    1 FALSE     Dk    Sft    Nut  Dk_Sft_Nut  0.056
## 2    1      0    0    0    1 FALSE     Mk    Hrd    Nut  Mk_Hrd_Nut  0.126
## 5    1      1    1    0    0  TRUE     Dk    Hrd   Nnts Dk_Hrd_Nnts  0.216
## 6    1      0    1    0    1 FALSE     Dk    Hrd    Nut  Dk_Hrd_Nut  0.504

b. mlogit

Use the data that’s in individual (long) format

longer <- read.table("D:\\Dropbox\\edps 589\\8 Multicategory_logit\\brands_mdc_data.txt",header=TRUE)

Choice to be logical

longer$choice <- longer$Y==1

longer$brand <- "brand5"
longer$brand[longer$brand1==1] <- "brand1"
longer$brand[longer$brand2==1] <- "brand2"
longer$brand[longer$brand3==1] <- "brand3"
longer$brand[longer$brand4==1] <- "brand4"

Create alternatives’ descriptions in terms of price and brand

longer$alternative <- paste(longer$brand,longer$price,sep="_")

#Set up data for mlogit
long.m <- dfidx(longer, shape="long", idx=list("case","alternative"), choice="choice")

Simpel model: additive effects

fm1 <- Formula(choice ~ brand1 + brand2 + brand3 + brand4 + price | 0 | 0 ) 

summary(model.1<- mlogit(fm1,long.m) )
## 
## Call:
## mlogit(formula = choice ~ brand1 + brand2 + brand3 + brand4 + 
##     price | 0 | 0, data = long.m, method = "nr")
## 
## Frequencies of alternatives:choice
## brand1_3.99 brand1_5.99 brand2_3.99 brand2_5.99 brand3_3.99 brand3_5.99 
##     0.11000     0.13500     0.07750     0.10750     0.04750     0.06000 
## brand4_3.99 brand4_5.99 brand5_4.99 
##     0.15375     0.18375     0.12500 
## 
## nr method
## 4 iterations, 0h:0m:0s 
## g'(-H)^-1g = 0.00096 
## successive function values within tolerance limits 
## 
## Coefficients :
##        Estimate Std. Error z-value  Pr(>|z|)    
## brand1  0.66727    0.12305  5.4228 5.868e-08 ***
## brand2  0.38503    0.12962  2.9704 0.0029737 ** 
## brand3 -0.15955    0.14725 -1.0835 0.2785739    
## brand4  0.98964    0.11720  8.4439 < 2.2e-16 ***
## price   0.14966    0.04406  3.3967 0.0006819 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Log-Likelihood: -1212.6

Estimated odds ratios

(exp.beta <- exp(model.1$coefficients[1:5]) )
##    brand1    brand2    brand3    brand4     price 
## 1.9489041 1.4696599 0.8525302 2.6902574 1.1614421
### Model with Interaction between brand an price
fm2 <- Formula(choice ~ brand1 + brand2 + brand3 + brand4 + price + brand1*price + brand2*price + brand3+price + brand4*price | 0 | 0 ) 

summary(model.2<- mlogit(fm2,long.m) )
## 
## Call:
## mlogit(formula = choice ~ brand1 + brand2 + brand3 + brand4 + 
##     price + brand1 * price + brand2 * price + brand3 + price + 
##     brand4 * price | 0 | 0, data = long.m, method = "nr")
## 
## Frequencies of alternatives:choice
## brand1_3.99 brand1_5.99 brand2_3.99 brand2_5.99 brand3_3.99 brand3_5.99 
##     0.11000     0.13500     0.07750     0.10750     0.04750     0.06000 
## brand4_3.99 brand4_5.99 brand5_4.99 
##     0.15375     0.18375     0.12500 
## 
## nr method
## 4 iterations, 0h:0m:0s 
## g'(-H)^-1g = 0.00202 
## successive function values within tolerance limits 
## 
## Coefficients :
##                Estimate Std. Error z-value Pr(>|z|)
## brand1        0.6452583  0.7192037  0.8972   0.3696
## brand2        0.0326760  0.7526581  0.0434   0.9654
## brand3       -0.1575240  0.1476061 -1.0672   0.2859
## brand4        0.9727619  0.6957522  1.3981   0.1621
## price         0.1312580  0.1148715  1.1427   0.2532
## brand1:price  0.0046113  0.1413352  0.0326   0.9740
## brand2:price  0.0694847  0.1471121  0.4723   0.6367
## brand4:price  0.0035195  0.1370774  0.0257   0.9795
## 
## Log-Likelihood: -1212.4
# Test interaction:
(lr <- 2*(as.numeric(model.2$logLik) - as.numeric(model.1$logLik)))
## [1] 0.4022192
1-pchisq(lr, 3)
## [1] 0.9397837

Using the simpler model (final model)

(beta <- model.1$coefficients)
##     brand1     brand2     brand3     brand4      price 
##  0.6672672  0.3850310 -0.1595466  0.9896369  0.1496624 
## attr(,"names.sup.coef")
## character(0)
## attr(,"fixed")
## brand1 brand2 brand3 brand4  price 
##  FALSE  FALSE  FALSE  FALSE  FALSE 
## attr(,"sup")
## character(0)

Modes of Transportation

The data taken from Powers & xie and is a classic example of choice of modes of transportation. This example includes bother attributes of types of transportation as well as chooser’s attributes. The Variables used are:

  1. mode= Mode of transportation chosen, 1st=train, 2nd=bus, 3rd=car
  2. ttime= Time in terminal
  3. invc= Time in vehicle
  4. gc = Generalized cost
  5. hinc = Household income
  6. id = person id

Only mlogit is used in this example

trans <- read.table("transportation_data.txt",header=TRUE)

n <- length(unique(trans$id))
trans$alt <- rep(c("Bus","Train","Car"),n) 

# set up data for mlogit
tm <- dfidx(trans, shape="long", idx=list("id","alt"), choice="mode")

Model with alternative specific (i.e., Attributes of modes of transportation )

Recall:

(logical variable for choice ~ generic alternative specific | individual specific | alternative specific )

Attributes of modes of transportation

fm.a <- Formula(mode ~ ttme + invc + invt + gc | 0  | 0) 

# Fit the model
summary(model.a <- mlogit(fm.a,tm))
## 
## Call:
## mlogit(formula = mode ~ ttme + invc + invt + gc | 0 | 0, data = tm, 
##     method = "nr")
## 
## Frequencies of alternatives:choice
##     Bus     Car   Train 
## 0.41447 0.38816 0.19737 
## 
## nr method
## 6 iterations, 0h:0m:0s 
## g'(-H)^-1g = 9.58E-05 
## successive function values within tolerance limits 
## 
## Coefficients :
##        Estimate Std. Error z-value  Pr(>|z|)    
## ttme -0.0022412  0.0071429 -0.3138  0.753696    
## invc -0.4351106  0.1327788 -3.2770  0.001049 ** 
## invt -0.0772361  0.0193517 -3.9912 6.574e-05 ***
## gc    0.4311522  0.1331899  3.2371  0.001207 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Log-Likelihood: -96.349

Attributes of modes and individual speicfic

This is reported in lecture notes:

fm.b <- Formula(mode ~ ttme  + invc + invt + gc | hinc  | 0 ) 
summary(model.b <- mlogit(fm.b,tm))
## 
## Call:
## mlogit(formula = mode ~ ttme + invc + invt + gc | hinc | 0, data = tm, 
##     method = "nr")
## 
## Frequencies of alternatives:choice
##     Bus     Car   Train 
## 0.41447 0.38816 0.19737 
## 
## nr method
## 6 iterations, 0h:0m:0s 
## g'(-H)^-1g = 4.79E-07 
## gradient close to zero 
## 
## Coefficients :
##                    Estimate Std. Error z-value  Pr(>|z|)    
## (Intercept):Car   -6.101273   1.032773 -5.9077 3.470e-09 ***
## (Intercept):Train -2.100773   0.737429 -2.8488 0.0043887 ** 
## ttme              -0.073743   0.016973 -4.3447 1.395e-05 ***
## invc              -0.608922   0.154700 -3.9362 8.280e-05 ***
## invt              -0.094937   0.022480 -4.2232 2.408e-05 ***
## gc                 0.571263   0.151962  3.7592 0.0001704 ***
## hinc:Car           0.047658   0.017891  2.6638 0.0077267 ** 
## hinc:Train         0.030172   0.021380  1.4112 0.1581790    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Log-Likelihood: -62.658
## McFadden R^2:  0.60839 
## Likelihood ratio test : chisq = 194.69 (p.value = < 2.22e-16)

knitr::purl(“D:/Dropbox/edps 589/8 Multicategory_logit/R_discrete_choice_models.Rmd”, output = “D:/Dropbox/edps 589/8 Multicategory_logit/R_discrete_choice_models.R”, documentation=2)