ECON 413
Regression Analysis
Erol Taymaz
Department of Economics
Middle East Technical University
set.seed(123)
dat <- data.frame(x1 = rnorm(100), x2 = rnorm(100), e = rnorm(100))
# Make some random observations
dat$x1[c(1, 2)] <- NA
dat$x2[c(3, 4)] <- NA
# Make an outlier
dat$e[5] <- 15
dat$y <- dat$x1 + 2*dat$x2 + dat$e
## 'data.frame': 100 obs. of 4 variables:
## $ x1: num NA NA 1.5587 0.0705 0.1293 ...
## $ x2: num -0.71 0.257 NA NA -0.952 ...
## $ e : num 2.199 1.312 -0.265 0.543 15 ...
## $ y : num NA NA NA NA 13.2 ...
## x1 x2 e y
## Min. :-2.3092 Min. :-2.0532 Min. :-1.75653 Min. :-5.19556
## 1st Qu.:-0.4865 1st Qu.:-0.8335 1st Qu.:-0.53131 1st Qu.:-1.60513
## Median : 0.0906 Median :-0.2063 Median : 0.05345 Median : 0.01854
## Mean : 0.1003 Mean :-0.1037 Mean : 0.27461 Mean : 0.12979
## 3rd Qu.: 0.6982 3rd Qu.: 0.5005 3rd Qu.: 0.84617 3rd Qu.: 1.56870
## Max. : 2.1873 Max. : 3.2410 Max. :15.00000 Max. :13.22605
## NA's :2 NA's :2 NA's :4
# Make the oulier missing
dat$y[dat$y > 10] <- NA
# Check it
ggplot(dat, aes(x=y, y=x1)) + geom_point()
## e x1 x2 y
## 95 1 1 1 1 0
## 1 1 1 1 0 1
## 2 1 1 0 0 2
## 2 1 0 1 0 2
## 0 2 2 5 9
## [1] "lm"
## List of 13
## $ coefficients : Named num [1:3] 0.103 0.892 2.03
## ..- attr(*, "names")= chr [1:3] "(Intercept)" "x1" "x2"
## $ residuals : Named num [1:95] -0.392 -0.818 -0.784 1.485 -0.233 ...
## ..- attr(*, "names")= chr [1:95] "6" "7" "8" "9" ...
## $ effects : Named num [1:95] 0.0786 -6.9742 19.3705 1.4744 -0.3262 ...
## ..- attr(*, "names")= chr [1:95] "(Intercept)" "x1" "x2" "" ...
## $ rank : int 3
## $ fitted.values: Named num [1:95] 1.54 -1.08 -4.41 -1.28 1.57 ...
## ..- attr(*, "names")= chr [1:95] "6" "7" "8" "9" ...
## $ assign : int [1:3] 0 1 2
## $ qr :List of 5
## ..$ qr : num [1:95, 1:3] -9.747 0.103 0.103 0.103 0.103 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:95] "6" "7" "8" "9" ...
## .. .. ..$ : chr [1:3] "(Intercept)" "x1" "x2"
## .. ..- attr(*, "assign")= int [1:3] 0 1 2
## ..$ qraux: num [1:3] 1.1 1.03 1.19
## ..$ pivot: int [1:3] 1 2 3
## ..$ tol : num 1e-07
## ..$ rank : int 3
## ..- attr(*, "class")= chr "qr"
## $ df.residual : int 92
## $ na.action : 'omit' Named int [1:5] 1 2 3 4 5
## ..- attr(*, "names")= chr [1:5] "1" "2" "3" "4" ...
## $ xlevels : Named list()
## $ call : language lm(formula = y ~ x1 + x2, data = dat)
## $ terms :Classes 'terms', 'formula' language y ~ x1 + x2
## .. ..- attr(*, "variables")= language list(y, x1, x2)
## .. ..- attr(*, "factors")= int [1:3, 1:2] 0 1 0 0 0 1
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:3] "y" "x1" "x2"
## .. .. .. ..$ : chr [1:2] "x1" "x2"
## .. ..- attr(*, "term.labels")= chr [1:2] "x1" "x2"
## .. ..- attr(*, "order")= int [1:2] 1 1
## .. ..- attr(*, "intercept")= int 1
## .. ..- attr(*, "response")= int 1
## .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. ..- attr(*, "predvars")= language list(y, x1, x2)
## .. ..- attr(*, "dataClasses")= Named chr [1:3] "numeric" "numeric" "numeric"
## .. .. ..- attr(*, "names")= chr [1:3] "y" "x1" "x2"
## $ model :'data.frame': 95 obs. of 3 variables:
## ..$ y : num [1:95] 1.149 -1.897 -5.196 0.204 1.338 ...
## ..$ x1: num [1:95] 1.715 0.461 -1.265 -0.687 -0.446 ...
## ..$ x2: num [1:95] -0.045 -0.785 -1.668 -0.38 0.919 ...
## ..- attr(*, "terms")=Classes 'terms', 'formula' language y ~ x1 + x2
## .. .. ..- attr(*, "variables")= language list(y, x1, x2)
## .. .. ..- attr(*, "factors")= int [1:3, 1:2] 0 1 0 0 0 1
## .. .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. .. ..$ : chr [1:3] "y" "x1" "x2"
## .. .. .. .. ..$ : chr [1:2] "x1" "x2"
## .. .. ..- attr(*, "term.labels")= chr [1:2] "x1" "x2"
## .. .. ..- attr(*, "order")= int [1:2] 1 1
## .. .. ..- attr(*, "intercept")= int 1
## .. .. ..- attr(*, "response")= int 1
## .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. .. ..- attr(*, "predvars")= language list(y, x1, x2)
## .. .. ..- attr(*, "dataClasses")= Named chr [1:3] "numeric" "numeric" "numeric"
## .. .. .. ..- attr(*, "names")= chr [1:3] "y" "x1" "x2"
## ..- attr(*, "na.action")= 'omit' Named int [1:5] 1 2 3 4 5
## .. ..- attr(*, "names")= chr [1:5] "1" "2" "3" "4" ...
## - attr(*, "class")= chr "lm"
## (Intercept) x1 x2
## 0.1032738 0.8917218 2.0303056
# Residuals and fitted values
resid_1 <- model_1$residuals
fitted_1 <- model_1$fitted.values
res_1 <- data.frame(resid = resid_1, fit = fitted_1)
# Residual distribution
ggplot(res_1, aes(x=resid)) + geom_density()
ggplot(res_1, aes(x=resid)) + geom_density() +
stat_function(fun=dnorm, args(list(mean=1, sd=0)), color="red")
##
## Call:
## lm(formula = y ~ x1 + x2, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.8394 -0.6558 -0.1030 0.6220 2.0864
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.10327 0.09757 1.058 0.293
## x1 0.89172 0.10572 8.435 4.46e-13 ***
## x2 2.03031 0.09886 20.537 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9432 on 92 degrees of freedom
## (5 observations deleted due to missingness)
## Multiple R-squared: 0.8382, Adjusted R-squared: 0.8346
## F-statistic: 238.2 on 2 and 92 DF, p-value: < 2.2e-16
## [1] "summary.lm"
# Many models
model_2 <- lm(y ~ x1, data = dat)
model_3 <- lm(y ~ x2, data = dat)
model_4 <- lm(y ~ x1 + x2 - 1, data = dat)
stargazer(model_1, model_2, model_3, model_4, type="html")
Dependent variable: | ||||
y | ||||
(1) | (2) | (3) | (4) | |
x1 | 0.892*** | 0.781*** | 0.901*** | |
(0.106) | (0.248) | (0.105) | ||
x2 | 2.030*** | 1.988*** | 2.021*** | |
(0.099) | (0.131) | (0.099) | ||
Constant | 0.103 | -0.074 | 0.175 | |
(0.098) | (0.228) | (0.129) | ||
Observations | 95 | 95 | 95 | 95 |
R2 | 0.838 | 0.096 | 0.713 | 0.836 |
Adjusted R2 | 0.835 | 0.086 | 0.710 | 0.833 |
Residual Std. Error | 0.943 (df = 92) | 2.217 (df = 93) | 1.249 (df = 93) | 0.944 (df = 93) |
F Statistic | 238.231*** (df = 2; 92) | 9.897*** (df = 1; 93) | 231.047*** (df = 1; 93) | 237.367*** (df = 2; 93) |
Note: | p<0.1; p<0.05; p<0.01 |
set.seed(123)
dat <- data.frame(x1 = rnorm(90), x2 = rep(c("A", "B", "C"), each = 30), e=rnorm(90))
dat$y <- ifelse(dat$x2 == "A", 0, ifelse(dat$x2 == "B", 1, 2)) + dat$x1 + dat$e
model_f <- lm(y ~ x1 + x2, data = dat)
summary(model_f)
##
## Call:
## lm(formula = y ~ x1 + x2, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.8829 -0.6312 -0.1114 0.5872 3.0967
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.09328 0.18456 -0.505 0.614546
## x1 1.01282 0.12070 8.391 8.53e-13 ***
## x2B 0.90742 0.26230 3.459 0.000845 ***
## x2C 2.24669 0.26103 8.607 3.11e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.01 on 86 degrees of freedom
## Multiple R-squared: 0.6347, Adjusted R-squared: 0.622
## F-statistic: 49.81 on 3 and 86 DF, p-value: < 2.2e-16