ECON 413
Visualization
Erol Taymaz
Department of Economics
Middle East Technical University
“The use of computer-generated, interactive, visual representations of data to amplify cognition.”
[Card, Mackinlay, & Shneiderman 1999]
Refresh our memory: Everything in R is an object
a <- c(1:5)
a
## [1] 1 2 3 4 5
sum(a)
## [1] 15
sum
## function (..., na.rm = FALSE) .Primitive("sum")
b <- plot(a)
b
## NULL
a <- rnorm(100)
b <- a + rnorm(100)
model_1 <- lm(a ~ b)
model_1
##
## Call:
## lm(formula = a ~ b)
##
## Coefficients:
## (Intercept) b
## -0.05349 0.52413
summary(a)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.34140 -0.95159 -0.02138 -0.09988 0.74250 2.49122
summary(model_1)
##
## Call:
## lm(formula = a ~ b)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.77994 -0.59063 0.09254 0.48858 1.57312
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.05349 0.07036 -0.76 0.449
## b 0.52413 0.04580 11.44 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7025 on 98 degrees of freedom
## Multiple R-squared: 0.572, Adjusted R-squared: 0.5676
## F-statistic: 131 on 1 and 98 DF, p-value: < 2.2e-16
plot(a)
plot(a, b)
plot(model_1)
Analyze the data (uncover relations)
Do everything by using a script, R file(s)
library(mice)
library(ggplot2)
library(GGally)
library(ggthemes)
library(haven)
library(data.table)
dt <- read.csv("https://raw.githubusercontent.com/etaymaz/econ413/master/Data/003_data.csv")
set.seed(123)
dt <- data.frame(id = c(1:1000), exper = rnorm(1000, 0, 1),
gender = c(1:2), educ = sample(5, 1000, replace = TRUE))
dt$b[runif(100)<0.1] <- NA
dt$d[runif(100)<0.1] <- NA
dt <- within(dt, wage <- gender + (gender-1)*educ +
(exper * (gender-1) * (educ/5)) + rnorm(1000, 0, 1))
class(dt)
## [1] "data.frame"
names(dt)
## [1] "id" "exper" "gender" "educ" "wage"
dim(dt)
## [1] 1000 5
str(dt)
## 'data.frame': 1000 obs. of 5 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ exper : num -0.87 1.503 0.642 NA -0.712 ...
## $ gender: int 1 2 1 2 1 2 1 2 1 2 ...
## $ educ : int NA 3 2 NA 3 5 5 2 5 5 ...
## $ wage : num NA 6.5486 0.0909 NA -1.4579 ...
head(dt)
## id exper gender educ wage
## 1 1 -0.8699983 1 NA NA
## 2 2 1.5026771 2 3 6.54857412
## 3 3 0.6415719 1 2 0.09093394
## 4 4 NA 2 NA NA
## 5 5 -0.7122940 1 3 -1.45789677
## 6 6 2.0612542 2 5 8.11485313
tail(dt)
## id exper gender educ wage
## 995 995 -0.19416947 1 4 -0.3775560
## 996 996 NA 2 1 NA
## 997 997 -0.51001423 1 1 -0.3303003
## 998 998 -0.19601343 2 5 6.3701062
## 999 999 -0.01743695 1 5 1.1119297
## 1000 1000 NA 2 3 NA
table(dt$educ)
##
## 1 2 3 4 5
## 178 179 177 184 182
table(dt$gender, dt$educ)
##
## 1 2 3 4 5
## 1 90 85 97 100 88
## 2 88 94 80 84 94
xtabs( ~ gender + educ, data = dt)
## educ
## gender 1 2 3 4 5
## 1 90 85 97 100 88
## 2 88 94 80 84 94
md.pattern(dt)
## id gender educ exper wage
## 780 1 1 1 1 1 0
## 120 1 1 1 0 0 2
## 70 1 1 0 1 0 2
## 30 1 1 0 0 0 3
## 0 0 100 150 220 470
summary(dt)
## id exper gender educ
## Min. : 1.0 Min. :-2.64866 Min. :1.0 Min. :1.000
## 1st Qu.: 250.8 1st Qu.:-0.60161 1st Qu.:1.0 1st Qu.:2.000
## Median : 500.5 Median : 0.04126 Median :1.5 Median :3.000
## Mean : 500.5 Mean : 0.04952 Mean :1.5 Mean :3.014
## 3rd Qu.: 750.2 3rd Qu.: 0.69551 3rd Qu.:2.0 3rd Qu.:4.000
## Max. :1000.0 Max. : 3.38499 Max. :2.0 Max. :5.000
## NA's :150 NA's :100
## wage
## Min. :-1.8845
## 1st Qu.: 0.9648
## Median : 2.1786
## Mean : 2.8972
## 3rd Qu.: 4.7516
## Max. :10.8053
## NA's :220
summary(subset(dt, gender == 1))
## id exper gender educ
## Min. : 1.0 Min. :-2.64866 Min. :1 Min. :1.000
## 1st Qu.:250.5 1st Qu.:-0.58780 1st Qu.:1 1st Qu.:2.000
## Median :500.0 Median : 0.05770 Median :1 Median :3.000
## Mean :500.0 Mean : 0.05215 Mean :1 Mean :3.024
## 3rd Qu.:749.5 3rd Qu.: 0.66337 3rd Qu.:1 3rd Qu.:4.000
## Max. :999.0 Max. : 3.38499 Max. :1 Max. :5.000
## NA's :50 NA's :40
## wage
## Min. :-1.8845
## 1st Qu.: 0.3509
## Median : 1.0158
## Mean : 0.9996
## 3rd Qu.: 1.6446
## Max. : 3.6328
## NA's :90
summary(subset(dt, gender == 2))
## id exper gender educ
## Min. : 2.0 Min. :-2.57297 Min. :2 Min. :1.000
## 1st Qu.: 251.5 1st Qu.:-0.61461 1st Qu.:2 1st Qu.:2.000
## Median : 501.0 Median : 0.03077 Median :2 Median :3.000
## Mean : 501.0 Mean : 0.04656 Mean :2 Mean :3.005
## 3rd Qu.: 750.5 3rd Qu.: 0.74073 3rd Qu.:2 3rd Qu.:4.000
## Max. :1000.0 Max. : 3.11620 Max. :2 Max. :5.000
## NA's :100 NA's :60
## wage
## Min. : 0.4935
## 1st Qu.: 3.6629
## Median : 4.8601
## Mean : 4.9998
## 3rd Qu.: 6.4615
## Max. :10.8053
## NA's :130
# Correlations - variables 2-4
cor(dt[, 2:5])
## exper gender educ wage
## exper 1 NA NA NA
## gender NA 1 NA NA
## educ NA NA 1 NA
## wage NA NA NA 1
cor(dt[, 2:5], use = "complete.obs")
## exper gender educ wage
## exper 1.00000000 -0.0104530077 0.0236341481 0.09952051
## gender -0.01045301 1.0000000000 0.0009589141 0.80603867
## educ 0.02363415 0.0009589141 1.0000000000 0.27049172
## wage 0.09952051 0.8060386732 0.2704917170 1.00000000
cor(dt[, 2:5], use = "pairwise.complete.obs")
## exper gender educ wage
## exper 1.000000000 -0.002892561 0.023634148 0.09952051
## gender -0.002892561 1.000000000 -0.006840444 0.80603867
## educ 0.023634148 -0.006840444 1.000000000 0.27049172
## wage 0.099520506 0.806038673 0.270491717 1.00000000
cor(dt[, 2:5], use = "pairwise.complete.obs", method = "kendall")
## exper gender educ wage
## exper 1.000000000 0.006545028 0.018319032 0.04909647
## gender 0.006545028 1.000000000 -0.005881101 0.67278394
## educ 0.018319032 -0.005881101 1.000000000 0.14863020
## wage 0.049096475 0.672783940 0.148630199 1.00000000
cor(dt[, 2:5], use = "pairwise.complete.obs", method = "spearman")
## exper gender educ wage
## exper 1.000000000 0.008011278 0.025838438 0.07262175
## gender 0.008011278 1.000000000 -0.006575184 0.82346099
## educ 0.025838438 -0.006575184 1.000000000 0.19371484
## wage 0.072621750 0.823460990 0.193714836 1.00000000
library(GGally)
ggpairs(dt)
model1 <- lm(wage ~ gender + educ + exper, data = dt)
summary(model1)
##
## Call:
## lm(formula = wage ~ gender + educ + exper, data = dt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4503 -0.8866 0.0170 0.8475 4.2938
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.42967 0.17362 -25.513 < 2e-16 ***
## gender 4.00421 0.09210 43.476 < 2e-16 ***
## educ 0.47069 0.03268 14.401 < 2e-16 ***
## exper 0.26214 0.04788 5.475 5.91e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.284 on 776 degrees of freedom
## (220 observations deleted due to missingness)
## Multiple R-squared: 0.7328, Adjusted R-squared: 0.7317
## F-statistic: 709.3 on 3 and 776 DF, p-value: < 2.2e-16
str(model1)
## List of 13
## $ coefficients : Named num [1:4] -4.43 4.004 0.471 0.262
## ..- attr(*, "names")= chr [1:4] "(Intercept)" "gender" "educ" "exper"
## $ residuals : Named num [1:780] 1.164 -0.593 -2.258 1.642 1.748 ...
## ..- attr(*, "names")= chr [1:780] "2" "3" "5" "6" ...
## $ effects : Named num [1:780] -80.91 55.79 -18.67 -7.03 1.87 ...
## ..- attr(*, "names")= chr [1:780] "(Intercept)" "gender" "educ" "exper" ...
## $ rank : int 4
## $ fitted.values: Named num [1:780] 5.385 0.684 0.8 6.473 1.87 ...
## ..- attr(*, "names")= chr [1:780] "2" "3" "5" "6" ...
## $ assign : int [1:4] 0 1 2 3
## $ qr :List of 5
## ..$ qr : num [1:780, 1:4] -27.9285 0.0358 0.0358 0.0358 0.0358 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:780] "2" "3" "5" "6" ...
## .. .. ..$ : chr [1:4] "(Intercept)" "gender" "educ" "exper"
## .. ..- attr(*, "assign")= int [1:4] 0 1 2 3
## ..$ qraux: num [1:4] 1.04 1.04 1 1.07
## ..$ pivot: int [1:4] 1 2 3 4
## ..$ tol : num 1e-07
## ..$ rank : int 4
## ..- attr(*, "class")= chr "qr"
## $ df.residual : int 776
## $ na.action : 'omit' Named int [1:220] 1 4 10 20 27 28 32 39 40 42 ...
## ..- attr(*, "names")= chr [1:220] "1" "4" "10" "20" ...
## $ xlevels : Named list()
## $ call : language lm(formula = wage ~ gender + educ + exper, data = dt)
## $ terms :Classes 'terms', 'formula' language wage ~ gender + educ + exper
## .. ..- attr(*, "variables")= language list(wage, gender, educ, exper)
## .. ..- attr(*, "factors")= int [1:4, 1:3] 0 1 0 0 0 0 1 0 0 0 ...
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:4] "wage" "gender" "educ" "exper"
## .. .. .. ..$ : chr [1:3] "gender" "educ" "exper"
## .. ..- attr(*, "term.labels")= chr [1:3] "gender" "educ" "exper"
## .. ..- attr(*, "order")= int [1:3] 1 1 1
## .. ..- attr(*, "intercept")= int 1
## .. ..- attr(*, "response")= int 1
## .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. ..- attr(*, "predvars")= language list(wage, gender, educ, exper)
## .. ..- attr(*, "dataClasses")= Named chr [1:4] "numeric" "numeric" "numeric" "numeric"
## .. .. ..- attr(*, "names")= chr [1:4] "wage" "gender" "educ" "exper"
## $ model :'data.frame': 780 obs. of 4 variables:
## ..$ wage : num [1:780] 6.5486 0.0909 -1.4579 8.1149 3.6185 ...
## ..$ gender: int [1:780] 2 1 1 2 1 2 1 1 2 1 ...
## ..$ educ : int [1:780] 3 2 3 5 5 2 5 4 3 2 ...
## ..$ exper : num [1:780] 1.503 0.642 -0.712 2.061 -0.22 ...
## ..- attr(*, "terms")=Classes 'terms', 'formula' language wage ~ gender + educ + exper
## .. .. ..- attr(*, "variables")= language list(wage, gender, educ, exper)
## .. .. ..- attr(*, "factors")= int [1:4, 1:3] 0 1 0 0 0 0 1 0 0 0 ...
## .. .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. .. ..$ : chr [1:4] "wage" "gender" "educ" "exper"
## .. .. .. .. ..$ : chr [1:3] "gender" "educ" "exper"
## .. .. ..- attr(*, "term.labels")= chr [1:3] "gender" "educ" "exper"
## .. .. ..- attr(*, "order")= int [1:3] 1 1 1
## .. .. ..- attr(*, "intercept")= int 1
## .. .. ..- attr(*, "response")= int 1
## .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. .. ..- attr(*, "predvars")= language list(wage, gender, educ, exper)
## .. .. ..- attr(*, "dataClasses")= Named chr [1:4] "numeric" "numeric" "numeric" "numeric"
## .. .. .. ..- attr(*, "names")= chr [1:4] "wage" "gender" "educ" "exper"
## ..- attr(*, "na.action")= 'omit' Named int [1:220] 1 4 10 20 27 28 32 39 40 42 ...
## .. ..- attr(*, "names")= chr [1:220] "1" "4" "10" "20" ...
## - attr(*, "class")= chr "lm"
ggplot(dt, aes(x = exper)) + geom_histogram()
ggplot(dt, aes(x = exper)) + geom_density()
ggplot(dt, aes(x = exper)) + geom_density() +
stat_function(fun = dnorm, args = list(mean = 0, sd = 1), col="blue")
ggplot(dt, aes(x = educ)) + geom_histogram()
ggplot(dt, aes(x = educ, y = wage)) + geom_point()
ggplot(dt, aes(x = educ, y = wage)) + geom_point(size=5)
ggplot(dt, aes(x = educ, y = wage)) + geom_point(size=5, alpha=0.1)
ggplot(dt, aes(x = educ, y = wage, color = as.factor(gender))) + geom_point(size=5, alpha=0.1)
# Bivariate plots
ggplot(dt, aes(x = exper, y = wage)) + geom_point()
ggplot(dt, aes(x = exper, y = wage)) + geom_point() +
geom_smooth(formula = y ~ x)
ggplot(dt, aes(x = exper, y = wage)) + geom_point() +
geom_smooth(formula = y ~ x, se = FALSE, method = "lm")
ggplot(dt, aes(x = exper, y = wage)) + geom_point() + facet_wrap(~ gender) +
geom_smooth(formula = y ~ x, se = FALSE, method = "lm")
ggplot(dt, aes(x = exper, y = wage)) + geom_point() + facet_wrap(gender ~ educ, nrow = 2) +
geom_smooth(formula = y ~ x, se = FALSE, method = "lm")
dt$educ <- as.factor(dt$educ)
model2 <- lm(wage ~ educ + exper + educ*exper, data = subset(dt, gender == 1))
summary(model2)
##
## Call:
## lm(formula = wage ~ educ + exper + educ * exper, data = subset(dt,
## gender == 1))
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.73114 -0.61771 -0.00812 0.62456 2.42288
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.07431 0.10991 9.774 <2e-16 ***
## educ2 0.03364 0.15597 0.216 0.8293
## educ3 -0.28991 0.15232 -1.903 0.0577 .
## educ4 -0.22512 0.15384 -1.463 0.1441
## educ5 0.13026 0.15806 0.824 0.4103
## exper 0.07463 0.11116 0.671 0.5024
## educ2:exper -0.17990 0.14946 -1.204 0.2294
## educ3:exper 0.14172 0.15716 0.902 0.3677
## educ4:exper -0.02000 0.16431 -0.122 0.9032
## educ5:exper -0.03407 0.17713 -0.192 0.8476
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9827 on 400 degrees of freedom
## (90 observations deleted due to missingness)
## Multiple R-squared: 0.03855, Adjusted R-squared: 0.01692
## F-statistic: 1.782 on 9 and 400 DF, p-value: 0.06979
model3 <- lm(wage ~ educ + exper + educ*exper, data = subset(dt, gender == 2))
summary(model3)
##
## Call:
## lm(formula = wage ~ educ + exper + educ * exper, data = subset(dt,
## gender == 2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.86437 -0.77670 -0.01112 0.75068 2.91145
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.00383 0.12360 24.303 < 2e-16 ***
## educ2 1.05102 0.17146 6.130 2.31e-09 ***
## educ3 2.03922 0.17726 11.504 < 2e-16 ***
## educ4 2.96237 0.17822 16.622 < 2e-16 ***
## educ5 3.87530 0.17191 22.543 < 2e-16 ***
## exper 0.05794 0.11777 0.492 0.623058
## educ2:exper 0.23215 0.17072 1.360 0.174750
## educ3:exper 0.61271 0.16890 3.628 0.000327 ***
## educ4:exper 0.86182 0.18209 4.733 3.19e-06 ***
## educ5:exper 1.04711 0.18331 5.712 2.34e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.055 on 360 degrees of freedom
## (130 observations deleted due to missingness)
## Multiple R-squared: 0.6858, Adjusted R-squared: 0.6779
## F-statistic: 87.31 on 9 and 360 DF, p-value: < 2.2e-16
Source: William Playfair’s Time Series of Exports and Imports of Denmark and Norway, Public Domain, https://commons.wikimedia.org/w/index.php?curid=12674977
Source: Michael Sandberg’s Data Visualization Blog, https://datavizblog.com/2013/05/26/dataviz-history-charles-minards-flow-map-of-napoleons-russian-campaign-of-1812-part-5/
Choose the graphical tool
Source: http://curleylab.psych.columbia.edu/netviz/netviz1.html
Design the components
Source: Stephanie Evergreen and Ann K. Emery, Data Visualization Checklist, http://AnnKEmery.com/blog
Bar chart vs pie chart
Source: Stephanie Evergreen and Ann K. Emery, Data Visualization Checklist, http://AnnKEmery.com/blog
Source: Angie Ficek Data Visualization Techniques: How to Tell Your Story to Make it Stick, http://www.cehd.umn.edu/olpd/mesi/spring/2015/ficek-datavis.pdf
geoms
geom’s define chart type. Variables should be map to aesthetics.
scales
Scales control the mapping between data and aesthetics.
Coordinate systems
Coordinate systems adjust the mapping from coordinates to the 2d plane of the computer screen.
Faceting
Facets display subsets of the dataset in different panels.
For more information, see ggplot2 web site: http://docs.ggplot2.org/current/
# Read the data
dt <- read.csv("https://raw.githubusercontent.com/etaymaz/econ413/master/Data/003_data.csv")
# Make it data table
dt <- data.table(dt)
# Scatter chart with 2 variables
ggplot(dt, aes(x = exper, y = wage)) + geom_point()
# Scatter chart with 3 variables - continuous color
ggplot(dt, aes(x = exper, y = wage, color = -educ)) + geom_point()
# Scatter chart with 3 variables - discrete color
ggplot(dt, aes(x = exper, y = wage, color = as.factor(educ))) + geom_point()
# Scatter chart with 4 variables
ggplot(dt, aes(x = exper, y = wage, color = as.factor(educ), shape = as.factor(gender))) + geom_point()
# Scatter chart with 4 variables and large shapes
ggplot(dt, aes(x = exper, y = wage, color = as.factor(educ), shape = as.factor(gender))) +
geom_point(size=3)
# Scatter chart with 5 variables
ggplot(dt, aes(x = exper, y = wage, color = as.factor(educ), shape = as.factor(gender), alpha = educ)) +
geom_point(size=3)
# Scatter chart with 4 variables and large and transparent shapes
ggplot(dt, aes(x = exper, y = wage, color = as.factor(educ), shape = as.factor(gender))) +
geom_point(size=3, alpha = 0.5)
# Calculate mean values by gender and education
dtm <- dt[, list(mwage = mean(wage, na.rm = T), mexper = mean(exper, na.rm = T)), by = list(gender, educ)]
# Line chart with 3 variables
ggplot(dtm, aes(x = educ, y = mwage, color = as.factor(gender))) + geom_line()
# Line chart with 3 variables - thick line
ggplot(dtm, aes(x = educ, y = mwage, color = as.factor(gender))) + geom_line(size = 2)
# Line chart with 3 variables with points
ggplot(dtm, aes(x = educ, y = mwage, color = as.factor(gender))) + geom_line() + geom_point()
# Line chart with 3 variables with points - point size by mean experience
ggplot(dtm, aes(x = educ, y = mwage, color = as.factor(gender))) + geom_line() + geom_point(aes(size = mexper))
# Bar chart with 1 variable
ggplot(dt, aes(x = educ)) + geom_bar()
# Bar chart with 2 variables
ggplot(dt, aes(x = educ, weight = wage)) + geom_bar()
# Bar chart with 3 variables
ggplot(dt, aes(x = educ, weight = wage, fill = as.factor(educ))) + geom_bar()
# Bar chart with 3 variables
ggplot(dt, aes(x = educ, weight = wage, fill = as.factor(gender))) + geom_bar()
# Download the data
pwt <- read_stata("http://www.rug.nl/ggdc/docs/pwt90.dta")
# Make the data a data.table object
pwt <- data.table(pwt)
# Class
class(pwt)
## [1] "data.table" "data.frame"
# Variable names
names(pwt)
## [1] "countrycode" "country" "currency_unit" "year"
## [5] "rgdpe" "rgdpo" "pop" "emp"
## [9] "avh" "hc" "ccon" "cda"
## [13] "cgdpe" "cgdpo" "ck" "ctfp"
## [17] "cwtfp" "rgdpna" "rconna" "rdana"
## [21] "rkna" "rtfpna" "rwtfpna" "labsh"
## [25] "delta" "xr" "pl_con" "pl_da"
## [29] "pl_gdpo" "i_cig" "i_xm" "i_xr"
## [33] "i_outlier" "cor_exp" "statcap" "csh_c"
## [37] "csh_i" "csh_g" "csh_x" "csh_m"
## [41] "csh_r" "pl_c" "pl_i" "pl_g"
## [45] "pl_x" "pl_m" "pl_k"
# Dimension
dim(pwt)
## [1] 11830 47
# Select a subset of variables
pwts <- pwt[, list(countrycode, year, rgdpo, pop, emp, avh, hc, rgdpna, rkna)]
# Structure
str(pwts)
## Classes 'data.table' and 'data.frame': 11830 obs. of 9 variables:
## $ countrycode: chr "ABW" "ABW" "ABW" "ABW" ...
## ..- attr(*, "label")= chr "3-letter ISO country code"
## ..- attr(*, "format.stata")= chr "%9s"
## $ year : num 1950 1951 1952 1953 1954 ...
## ..- attr(*, "label")= chr "Year"
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ rgdpo : num NA NA NA NA NA NA NA NA NA NA ...
## ..- attr(*, "label")= chr "Output-side real GDP at chained PPPs (in mil. 2011US$)"
## ..- attr(*, "format.stata")= chr "%14.3g"
## $ pop : num NA NA NA NA NA NA NA NA NA NA ...
## ..- attr(*, "label")= chr "Population (in millions)"
## ..- attr(*, "format.stata")= chr "%10.0g"
## $ emp : num NA NA NA NA NA NA NA NA NA NA ...
## ..- attr(*, "label")= chr "Number of persons engaged (in millions)"
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ avh : num NA NA NA NA NA NA NA NA NA NA ...
## ..- attr(*, "label")= chr "Average annual hours worked by persons engaged (source: The Conference Board)"
## ..- attr(*, "format.stata")= chr "%10.0gc"
## $ hc : num NA NA NA NA NA NA NA NA NA NA ...
## ..- attr(*, "label")= chr "Human capital index, see note hc"
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ rgdpna : num NA NA NA NA NA NA NA NA NA NA ...
## ..- attr(*, "label")= chr "Real GDP at constant 2011 national prices (in mil. 2011US$)"
## ..- attr(*, "format.stata")= chr "%14.3g"
## $ rkna : num NA NA NA NA NA NA NA NA NA NA ...
## ..- attr(*, "label")= chr "Capital stock at constant 2011 national prices (in mil. 2011US$)"
## ..- attr(*, "format.stata")= chr "%14.3g"
## - attr(*, ".internal.selfref")=<externalptr>
# Summary statistics
summary(pwts)
## countrycode year rgdpo pop
## Length:11830 Min. :1950 Min. : 1 Min. : 0.0044
## Class :character 1st Qu.:1966 1st Qu.: 6045 1st Qu.: 1.5934
## Mode :character Median :1982 Median : 24874 Median : 5.9857
## Mean :1982 Mean : 250855 Mean : 30.0906
## 3rd Qu.:1998 3rd Qu.: 132475 3rd Qu.: 19.5332
## Max. :2014 Max. :17135952 Max. :1369.4357
## NA's :2391 NA's :2391
## emp avh hc rgdpna
## Min. : 0.001 Min. :1363 Min. :1.007 Min. : 8
## 1st Qu.: 0.942 1st Qu.:1816 1st Qu.:1.408 1st Qu.: 6277
## Median : 2.976 Median :1979 Median :1.917 Median : 29303
## Mean : 14.219 Mean :1996 Mean :2.033 Mean : 277098
## 3rd Qu.: 8.404 3rd Qu.:2177 3rd Qu.:2.609 3rd Qu.: 167537
## Max. :798.368 Max. :3042 Max. :3.734 Max. :17150538
## NA's :3586 NA's :8511 NA's :3963 NA's :2391
## rkna
## Min. : 31
## 1st Qu.: 16958
## Median : 85975
## Mean : 899559
## 3rd Qu.: 454699
## Max. :67590072
## NA's :2421
# Missing observations
md.pattern(pwts)
## countrycode year rgdpo pop rgdpna rkna emp hc avh
## 3289 1 1 1 1 1 1 1 1 1 0
## 4006 1 1 1 1 1 1 1 1 0 1
## 30 1 1 1 1 1 1 1 0 1 1
## 919 1 1 1 1 1 1 1 0 0 2
## 572 1 1 1 1 1 1 0 1 0 2
## 593 1 1 1 1 1 1 0 0 0 3
## 30 1 1 1 1 1 0 0 0 0 4
## 2391 1 1 0 0 0 0 0 0 0 7
## 0 0 2391 2391 2391 2421 3586 3963 8511 25654
# Generate GDP per capita variable - 000 USD
pwts[, gdpc := (rgdpo / pop) / 1000]
# Check the data
summary(pwts)
## countrycode year rgdpo pop
## Length:11830 Min. :1950 Min. : 1 Min. : 0.0044
## Class :character 1st Qu.:1966 1st Qu.: 6045 1st Qu.: 1.5934
## Mode :character Median :1982 Median : 24874 Median : 5.9857
## Mean :1982 Mean : 250855 Mean : 30.0906
## 3rd Qu.:1998 3rd Qu.: 132475 3rd Qu.: 19.5332
## Max. :2014 Max. :17135952 Max. :1369.4357
## NA's :2391 NA's :2391
## emp avh hc rgdpna
## Min. : 0.001 Min. :1363 Min. :1.007 Min. : 8
## 1st Qu.: 0.942 1st Qu.:1816 1st Qu.:1.408 1st Qu.: 6277
## Median : 2.976 Median :1979 Median :1.917 Median : 29303
## Mean : 14.219 Mean :1996 Mean :2.033 Mean : 277098
## 3rd Qu.: 8.404 3rd Qu.:2177 3rd Qu.:2.609 3rd Qu.: 167537
## Max. :798.368 Max. :3042 Max. :3.734 Max. :17150538
## NA's :3586 NA's :8511 NA's :3963 NA's :2391
## rkna gdpc
## Min. : 31 Min. : 0.133
## 1st Qu.: 16958 1st Qu.: 2.019
## Median : 85975 Median : 5.410
## Mean : 899559 Mean : 19.478
## 3rd Qu.: 454699 3rd Qu.: 13.703
## Max. :67590072 Max. :4447.840
## NA's :2421 NA's :2391
# Find the outlier
pwts[gdpc > 1000]
## countrycode year rgdpo pop emp avh hc rgdpna
## 1: BMU 1970 118924.79 0.052286 NA NA NA 1634.187
## 2: BMU 1971 214744.45 0.052857 NA NA NA 1690.362
## 3: BMU 1972 168675.36 0.053426 NA NA NA 1721.003
## 4: BMU 1973 163793.48 0.053980 NA NA NA 1746.537
## 5: BMU 1974 148884.08 0.054507 NA NA NA 1761.858
## 6: BMU 1975 158232.27 0.054994 NA NA NA 1828.246
## 7: BMU 1976 194509.84 0.055439 NA NA NA 1991.665
## 8: BMU 1977 182189.16 0.055851 NA NA NA 2042.242
## 9: BMU 1978 179753.22 0.056237 NA NA NA 2083.601
## 10: BMU 1979 180668.77 0.056613 NA NA NA 2220.414
## 11: BMU 1980 154140.38 0.056990 NA NA NA 2304.830
## 12: BMU 1981 150503.89 0.057370 NA NA NA 2223.688
## 13: BMU 1982 141148.67 0.057752 NA NA NA 2225.750
## 14: BMU 1983 146613.58 0.058138 NA NA NA 2245.035
## 15: BMU 1984 155851.80 0.058528 NA NA NA 2199.310
## 16: BMU 1985 119540.04 0.058923 NA NA NA 2292.701
## 17: BMU 1986 169751.81 0.059324 0.0334450 NA NA 2394.340
## 18: BMU 1987 265669.47 0.059730 0.0352440 NA NA 2489.551
## 19: BMU 1988 182990.47 0.060138 0.0364200 NA NA 2522.420
## 20: BMU 1989 96493.66 0.060539 0.0362370 NA NA 2525.573
## 21: BMU 1990 168599.41 0.060931 0.0355730 NA NA 2458.744
## 22: BMU 1991 179122.67 0.061311 0.0346210 NA NA 2465.172
## 23: BMU 1992 146723.64 0.061680 0.0336500 NA NA 2397.979
## 24: BMU 1993 156847.22 0.062036 0.0334270 NA NA 2456.076
## 25: BMU 1995 135433.84 0.062694 0.0341330 NA NA 2629.517
## 26: BMU 1996 113023.91 0.062989 0.0346330 NA NA 2731.399
## 27: BMU 1997 183042.34 0.063257 0.0352960 NA NA 2857.200
## 28: BMU 2000 80290.86 0.064035 0.0374475 NA NA 3161.136
## 29: BMU 2002 134871.80 0.064614 0.0378150 NA NA 3339.322
## countrycode year rgdpo pop emp avh hc rgdpna
## rkna gdpc
## 1: 5616.710 2274.505
## 2: 5655.335 4062.744
## 3: 5728.377 3157.177
## 4: 5789.289 3034.337
## 5: 5849.582 2731.467
## 6: 5927.734 2877.264
## 7: 5968.023 3508.538
## 8: 6053.705 3262.057
## 9: 6163.846 3196.351
## 10: 6307.740 3191.295
## 11: 6537.652 2704.692
## 12: 6706.583 2623.390
## 13: 6959.545 2444.048
## 14: 7217.527 2521.820
## 15: 7533.817 2662.859
## 16: 7784.550 2028.750
## 17: 7934.764 2861.436
## 18: 8121.300 4447.840
## 19: 8423.078 3042.843
## 20: 8607.367 1593.909
## 21: 8666.035 2767.055
## 22: 8659.007 2921.542
## 23: 8664.764 2378.788
## 24: 8685.491 2528.326
## 25: 8728.345 2160.236
## 26: 8880.891 1794.344
## 27: 9099.609 2893.630
## 28: 10313.338 1253.859
## 29: 10428.872 2087.346
## rkna gdpc
# Drop the outlier
pwts <- pwts[!countrycode == "BMU"]
# Plot the GDP per capita data for all countries
# Define labels
clabs <- labs(title = "GDP per capita",
subtitle = "All countries",
x = "", y = "Real GDP, PPP, 2011 prices",
caption = "Source: Penn World Tables 9.0")
ggplot(data = pwts, mapping = aes(x = year, y = gdpc, color = countrycode)) +
geom_line() +
clabs
# Remove legends
ggplot(data = pwts, mapping = aes(x = year, y = gdpc, color = countrycode)) +
geom_line() +
theme(legend.position="none") +
clabs
# Makegdpc logarithmic
ggplot(data = pwts, mapping = aes(x = year, y = gdpc, color = countrycode)) +
geom_line() +
theme(legend.position="none") +
scale_y_log10() +
clabs
# Plot for a set of countries
clist <- c("TUR", "USA", "IND", "CHN", "KOR", "ESP")
pwtc <- subset(pwts, countrycode %in% clist)
ggplot(data = pwtc, mapping = aes(x = year, y = gdpc)) +
geom_line() +
scale_y_log10() +
facet_wrap(~ countrycode, nrow = 3, ncol = 3) +
clabs
ggplot(data = pwtc, mapping = aes(x = year, y = gdpc, color = countrycode)) +
geom_line() +
scale_y_log10() +
clabs
# Labels
clabs <- labs(title = "GDP-human capital relation",
subtitle = "All countries",
x = "Human capital index", y = "Real GDP, PPP, 2011 prices",
caption = "Source: Penn World Tables 9.0")
ggplot(data = pwts, mapping = aes(x = hc, y = gdpc, color = -year)) +
geom_point() +
scale_y_log10() +
clabs
# GDP per capita and education - selected years
ylist <- seq(1950, 2010, by = 10)
ggplot(data = subset(pwts, year %in% ylist), mapping = aes(x = hc, y = gdpc)) +
geom_point() +
facet_wrap(~ year) +
scale_y_log10() +
clabs
# Add trend lines
ggplot(data = subset(pwts, year %in% ylist), mapping = aes(x = hc, y = gdpc)) +
geom_point() +
geom_smooth(method = "lm", formula = y ~ x, , se = FALSE) +
facet_wrap(~ year) +
scale_y_log10() +
clabs
# Define labels
clabs <- labs(title = "GDP per capita",
subtitle = "Selected countries",
x = "", y = "Real GDP, PPP, 2011 prices",
caption = "Source: Penn World Tables 9.0")
# Default plot
p1 <- ggplot(data = pwtc, mapping = aes(x = year, y = gdpc, color = countrycode)) +
geom_line() +
scale_y_log10() +
clabs
# Minimal theme
p1 + theme_minimal()
# BW theme
p1 + theme_bw()
# Classic theme
p1 + theme_classic()
# Economist theme
p1 + theme_economist()
# WSJ theme
p1 + theme_wsj()
# Default plot
p1 <- ggplot(data = pwtc, mapping = aes(x = year, y = gdpc, color = countrycode)) +
geom_line() + theme_classic() + clabs
p1
# Add points
p1 + geom_point()
# Add a trend line
p1 + geom_smooth(method = "lm", formula = y ~ x, se = FALSE)
# Add a quadratic trend line
p1 + geom_smooth(method = "lm", formula = y ~ poly(x, 2), se = FALSE)
# Make line thicker
p1 <- p1 + geom_line(size = 1.5)
p1
# Change legend title
p1 <- p1 + scale_color_discrete(name = "Countries")
p1
# Make title larger
p1 <- p1 + theme(plot.title = element_text(size = rel(3)))
p1
# Make subtitle larger
p1 <- p1 + theme(plot.subtitle = element_text(size = rel(2)))
p1
# Make caption left justified and larger
p1 <- p1 + theme(plot.caption = element_text(hjust = 0, size = rel(1.5)))
p1
# Make caption gray
p1 <- p1 + theme(plot.caption = element_text(color = "gray40"))
p1
# Add vertical line for crises years
p1 <- p1 + geom_vline(xintercept = c(1980, 1994, 2001, 2009), size = 4, alpha = 0.25)
p1
# Change legend position
p1 <- p1 + theme(legend.position = c(0.15, 0.75))
p1
# Add text
p1 + annotate("text", x=1980, y=50, label="First crisis")
# Add formula
p1 + annotate("text", x=1965, y=45, parse=TRUE, label="frac(1, sqrt(2 * pi)) * e ^ {-x^2 / 2}")
theme_econ413 <- function (base_size = 12, base_family = "Comic Sans MS") {
theme(rect = element_rect(fill = "#FFFFFF", linetype = 0, colour = NA),
text = element_text(size = base_size, family = base_family),
axis.title.x = element_text(hjust = 0.5, size = rel(0.85)),
axis.title.y = element_text(hjust = 0.5, size = rel(0.85)),
axis.text = element_text(colour = "black"),
axis.line = element_line(colour = "black", size = rel(0.3)),
legend.text = element_text(colour = "black", size= rel(.85)),
panel.grid.major.y = element_line(color = "gray", size = rel(0.3)),
panel.grid.major.x = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.minor.x = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
legend.key = element_rect(fill = "#FFFFFF00"))
}
p1 + theme_econ413()
Hans Rosling’s TED talk on development:
https://www.ted.com/talks/hans_rosling_shows_the_best_stats_you_ve_ever_seen
or