In this post we will discuss how to manually implement a Difference-in-Differences (DiD) estimator in R, using simulated data.

# reproducible random numbers
set.seed(123)

# untreated and treated independent variable
# for period 0
xutr  <- rnorm(1000, mean=5)
xtr   <- rnorm(1000, mean=1)

# create a data.frame with the dep. var., indep. var., time and id vars
# for period 0
dfutr <- data.frame(time = 0, id=   1:1000, y=xutr+15+rnorm(1000), x=xutr)
dftr  <- data.frame(time = 0, id=1001:2000, y=xtr +9+rnorm(1000), x=xtr )
df0    <- rbind(dfutr, dftr)

# repeat for period 1, different means
xutr  <- rnorm(1000, mean=10)
xtr   <- rnorm(1000, mean=2 )

# repeat for period 1
dfutr <- data.frame(time = 1, id=   1:1000, y=xutr+20+rnorm(1000), x=xutr)
dftr  <- data.frame(time = 1, id=1001:2000, y=xtr +18+rnorm(1000), x=xtr )
df1    <- rbind(dfutr, dftr)

# repeat for period 2, different means again
xutr  <- rnorm(1000, mean=15)
xtr   <- rnorm(1000, mean=3 )

# repeat for period 2, now with different effect on dep. var for treated (id=1001:2000)
dfutr <- data.frame(time = 2, id=   1:1000, y=xutr+25+rnorm(1000), x=xutr)
dftr  <- data.frame(time = 2, id=1001:2000, y=xtr +22+rnorm(1000), x=xtr )
df2    <- rbind(dfutr, dftr)

Let have a look if the data came out the way we wanted it to.

# summarise data
summary(df0)
##       time         id               y                x         
##  Min.   :0   Min.   :   1.0   Min.   : 5.615   Min.   :-2.048  
##  1st Qu.:0   1st Qu.: 500.8   1st Qu.:10.052   1st Qu.: 1.055  
##  Median :0   Median :1000.5   Median :14.800   Median : 3.009  
##  Mean   :0   Mean   :1000.5   Mean   :15.015   Mean   : 3.029  
##  3rd Qu.:0   3rd Qu.:1500.2   3rd Qu.:19.957   3rd Qu.: 5.008  
##  Max.   :0   Max.   :2000.0   Max.   :24.548   Max.   : 8.241
summary(df1)
##       time         id               y               x         
##  Min.   :1   Min.   :   1.0   Min.   :15.96   Min.   :-1.262  
##  1st Qu.:1   1st Qu.: 500.8   1st Qu.:20.03   1st Qu.: 2.030  
##  Median :1   Median :1000.5   Median :24.57   Median : 6.289  
##  Mean   :1   Mean   :1000.5   Mean   :24.99   Mean   : 6.001  
##  3rd Qu.:1   3rd Qu.:1500.2   3rd Qu.:29.94   3rd Qu.: 9.967  
##  Max.   :1   Max.   :2000.0   Max.   :34.11   Max.   :13.446
summary(df2)
##       time         id               y               x         
##  Min.   :2   Min.   :   1.0   Min.   :20.57   Min.   :-0.402  
##  1st Qu.:2   1st Qu.: 500.8   1st Qu.:24.91   1st Qu.: 2.912  
##  Median :2   Median :1000.5   Median :32.11   Median : 8.800  
##  Mean   :2   Mean   :1000.5   Mean   :32.51   Mean   : 8.986  
##  3rd Qu.:2   3rd Qu.:1500.2   3rd Qu.:40.03   3rd Qu.:15.006  
##  Max.   :2   Max.   :2000.0   Max.   :44.14   Max.   :18.848

We now combine the observations into a single data.frame and add some dummy variables required for the estimation.

# combine data
df <- rbind(df0, df1, df2)

# create a dummy for post-treatment
df$post        <- ifelse(df$time  >= 2, TRUE, FALSE)
df$treated     <- ifelse(df$id <= 1000, TRUE, FALSE)
df$interaction <- df$post*df$treated

The below plot illustrates what the common trend assumption does.

library(ggplot2)
library(dplyr)

df %>%
  group_by(time, treated) %>%
  summarise(y=mean(y)) %>%
  ggplot(aes(x=time)) +
    geom_line(aes(y=y, colour=treated)) +
    geom_abline(intercept=10, slope=10, colour="red", linetype=2) # assumed common trend

plot of chunk plot

Compute the DiD estimator.

lm(y ~ x + post + interaction, data=df)
##
## Call:
## lm(formula = y ~ x + post + interaction, data = df)
##
## Coefficients:
## (Intercept)            x     postTRUE  interaction  
##      12.116        1.746        7.706       -6.005

Updated: