Code in Chapter 01: Data, Models and Software
Example of a Conceptual Model (adapted from Fig. 15.1 in Greed, 2000)
Example of a Conceptual Model (adapted from Fig. 1.2 in Ortúzar and Willumsen, 2011)
# Function `print()` displays the argument on the screen.
# `print()` is a generic method in `R` which means that many types
# of objects can be printed, including a string as in the example
# below:
print("Hello, Discrete Choice Analysis!")
## [1] "Hello, Discrete Choice Analysis!"
# Function `rm()` removes objects from the _environment_
# that is objects currently in memory. The argument list = ls()
# removes all objects currently in memory
rm(list = ls())
library(discrtr) # A companion package for the book Introduction to Discrete Choice Analysis with `R`
library(dplyr) # A Grammar of Data Manipulation
library(ggplot2) # Create Elegant Data Visualisations Using the Grammar of Graphics
library(mlogit) # Multinomial Logit Models
library(readr) # Read Rectangular Text Data
library(stargazer) # Well-Formatted Regression and Summary Statistics Tables
# Read a csv file data and name the object
<- read_csv(system.file("extdata",
mc_mode_choice "mc_commute.csv",
package = "discrtr"),
show_col_types = FALSE)
# `head()` displays the first few rows of a data object
# Indexing of the object in this example is used to display
# only columns 1 through 4
head(mc_mode_choice[,1:4])
## # A tibble: 6 × 4
## RespondentID choice avcycle avwalk
## <dbl> <dbl> <dbl> <dbl>
## 1 566872636 3 0 1
## 2 566873140 3 0 1
## 3 566874266 3 0 0
## 4 566874842 2 1 1
## 5 566881170 2 1 1
## 6 566907438 2 0 1
# `stargazer()` takes as an input a data frame
stargazer(as.data.frame(mc_mode_choice[,1:5]),
# change the type to text, html, or latex depending on the desired output
type = "latex",
header = FALSE, # do not print package version info in the output
title = "Example of a table with summary statistics", # Title of table
omit.summary.stat = c("N",
"median"), # summary statistics to omit from output
font.size = "small") # font size can be changed
# Indexing allows us to choose parts of a data object
# In this example, we are extracting the first row of
# column `choice` in table `mc_mode_choice` and then
# The fourth row of the same column
$choice[1] - mc_mode_choice$choice[4] mc_mode_choice
## [1] 1
# Function `factor()` is used to convert a variable (which could be character or numeric)
# into a factor, that is, a label or category; when we want a factor to be ordered (treated
# as an ordinal variable) we specify argument ordered = TRUE. Non-ordinal variables by default
# are displayed alphabetically, but changing their order when specifying the labels changes
# the order they are displayed _without necessarily making them ordinal_
$choice <- factor(mc_mode_choice$choice,
mc_mode_choicelabels = c("Cycle",
"Walk",
"HSR",
"Car"))
summary(mc_mode_choice$choice)
## Cycle Walk HSR Car
## 48 711 336 281
$choice[1] - mc_mode_choice$choice[4] mc_mode_choice
## Warning in Ops.factor(mc_mode_choice$choice[1], mc_mode_choice$choice[4]): '-'
## not meaningful for factors
## [1] NA
summary(mc_mode_choice$timecycle)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.29 3.79 5.83 34014.86 100000.00 100000.00
# Find the class of an object
class(mc_mode_choice$choice)
## [1] "factor"
class(mc_mode_choice$timecycle)
## [1] "numeric"
2, 2] mc_mode_choice[
## # A tibble: 1 × 1
## choice
## <fct>
## 1 HSR
$choice[2] mc_mode_choice
## [1] HSR
## Levels: Cycle Walk HSR Car
"choice"]][2] mc_mode_choice[[
## [1] HSR
## Levels: Cycle Walk HSR Car
2:5, 7:8] mc_mode_choice[
## # A tibble: 4 × 2
## timecycle timewalk
## <dbl> <dbl>
## 1 3.73 12.8
## 2 100000 100000
## 3 5.83 20
## 4 5.83 20
<- mc_mode_choice$timecycle[mc_mode_choice$timecycle != 100000] time.Cycle.clean
class(time.Cycle.clean)
## [1] "numeric"
summary(time.Cycle.clean)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.2914 2.9141 4.3711 4.8957 5.8282 45.0000
<- mc_mode_choice %>% # Pipe data frame `mc_mode_choice`
time.Active.clean select(c("timecycle", # Select columns from the data frame that was piped
"timewalk")) %>%
filter(timecycle != 100000 & timewalk != 100000) # Filter observations that are _not_ 100000
<- mc_mode_choice[mc_mode_choice$timecycle != 100000 &
time.Active.clean.the.hard.way $timewalk != 100000, 7:8] mc_mode_choice
summary(time.Active.clean)
## timecycle timewalk
## Min. : 0.2914 Min. : 1.00
## 1st Qu.: 2.9141 1st Qu.:10.00
## Median : 4.3711 Median :15.00
## Mean : 4.5852 Mean :16.10
## 3rd Qu.: 5.8282 3rd Qu.:20.00
## Max. :17.4845 Max. :62.11
summary(time.Active.clean.the.hard.way)
## timecycle timewalk
## Min. : 0.2914 Min. : 1.00
## 1st Qu.: 2.9141 1st Qu.:10.00
## Median : 4.3711 Median :15.00
## Mean : 4.5852 Mean :16.10
## 3rd Qu.: 5.8282 3rd Qu.:20.00
## Max. :17.4845 Max. :62.11
summary(time.Active.clean)
## timecycle timewalk
## Min. : 0.2914 Min. : 1.00
## 1st Qu.: 2.9141 1st Qu.:10.00
## Median : 4.3711 Median :15.00
## Mean : 4.5852 Mean :16.10
## 3rd Qu.: 5.8282 3rd Qu.:20.00
## Max. :17.4845 Max. :62.11
ggplot(data = time.Active.clean) +
geom_area(aes(x = timecycle),
stat = "bin",
binwidth = 5,
fill = "blue",
color = "blue",
alpha = 0.6) +
geom_area(aes(x = timewalk),
stat = "bin",
binwidth = 5,
fill = "yellow",
color = "yellow",
alpha = 0.6)
# Initialize a `ggplot` object that will use table `time.Active.clean`
# as an input, and name it `p`
<- ggplot(data = time.Active.clean) p
# By typing the name of a ggplot object, the default
# behavior is to render it
p
+
p # Add a geometric object of type area to the plot
# Map the variable `timecycle` to the x-axis. Notice
# that the y-axis is a calculated statistic, the count
# of cases (returned by stat =bin), so we do not need
# to specify it
geom_area(aes(x = timecycle),
stat = "bin",
# The bindwidth controls the size of the bins
# needed to count the number of cases at levels
# of the variable mapped to the x-axis
binwidth = 5)
+
p geom_area(aes(x = timecycle),
stat = "bin",
binwidth = 5,
# fill controls the color of the polygon
fill = "blue",
# color controls the color of the perimeter
# of the polygon or of lines more generally
color = "black",
alpha = 0.6)
ggplot(data = time.Active.clean) +
geom_area(aes(x = timecycle),
stat = "bin",
binwidth = 5,
fill = "blue",
color = "black",
alpha = 0.6) +
# We can plot a second geometric element to the x-axis
# using a different variable from the same table
geom_area(aes(x = timewalk),
stat = "bin",
binwidth = 5,
fill = "yellow",
color = "black",
alpha = 0.6)
ggplot(data = time.Active.clean) +
geom_area(aes(x = timecycle),
stat = "bin",
binwidth = 5,
fill = "blue",
color = "black",
alpha = 0.6) +
geom_area(aes(x = timewalk),
stat = "bin",
binwidth = 5,
fill = "yellow",
color = "black",
alpha = 0.6) +
xlab("Time (in minutes)")
# The pipe operator `%>%` takes an object and passes it on
# to the next function where it is used as the first argument
%>%
mc_mode_choice # `select()` retrieves columns from a data frame
select(c("choice", "side_den")) %>%
summary()
## choice side_den
## Cycle: 48 Min. : 0.00
## Walk :711 1st Qu.:18.19
## HSR :336 Median :22.63
## Car :281 Mean :24.18
## 3rd Qu.:35.70
## Max. :59.41
# Pipe the table to `ggplot()` where it is assumed to be the
# first argument of the function, i.e., data
%>%
mc_mode_choice # Map `choice` to the x-axis and `side_den` to the y-axis
ggplot(aes(x = choice,
y = side_den)) +
# Add a geometric object of type boxplot
geom_boxplot()
library(mlogit)
data("Mode")