ECON 413
Visualization

Erol Taymaz
Department of Economics
Middle East Technical University

Topics

Data visualization

“The use of computer-generated, interactive, visual representations of data to amplify cognition.”

[Card, Mackinlay, & Shneiderman 1999]

R objects

Refresh our memory: Everything in R is an object

a <- c(1:5)
a
## [1] 1 2 3 4 5
sum(a)
## [1] 15
sum
## function (..., na.rm = FALSE)  .Primitive("sum")
b <- plot(a)

b
## NULL
a <- rnorm(100)
b <- a + rnorm(100)
model_1 <- lm(a ~ b)
model_1
## 
## Call:
## lm(formula = a ~ b)
## 
## Coefficients:
## (Intercept)            b  
##    -0.05349      0.52413
summary(a)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -2.34140 -0.95159 -0.02138 -0.09988  0.74250  2.49122
summary(model_1)
## 
## Call:
## lm(formula = a ~ b)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.77994 -0.59063  0.09254  0.48858  1.57312 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.05349    0.07036   -0.76    0.449    
## b            0.52413    0.04580   11.44   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7025 on 98 degrees of freedom
## Multiple R-squared:  0.572,  Adjusted R-squared:  0.5676 
## F-statistic:   131 on 1 and 98 DF,  p-value: < 2.2e-16
plot(a)

plot(a, b)

plot(model_1)

Data Science Process

Data Science Process

Data handling

Cleaning the data

Exploratory visualization

Example 1

Load the libraries (install them in not installed)

library(mice)
library(ggplot2)
library(GGally)
library(ggthemes)
library(haven)
library(data.table)

Example 1

Read the dataset

dt <- read.csv("https://raw.githubusercontent.com/etaymaz/econ413/master/Data/003_data.csv")
set.seed(123)
dt <- data.frame(id = c(1:1000), exper = rnorm(1000, 0, 1), 
    gender = c(1:2), educ = sample(5, 1000, replace = TRUE))

dt$b[runif(100)<0.1] <- NA
dt$d[runif(100)<0.1] <- NA

dt <- within(dt, wage <- gender + (gender-1)*educ  + 
               (exper * (gender-1) * (educ/5)) + rnorm(1000, 0, 1))

Example 1

View the data

class(dt)
## [1] "data.frame"
names(dt)
## [1] "id"     "exper"  "gender" "educ"   "wage"
dim(dt)
## [1] 1000    5
str(dt)
## 'data.frame':    1000 obs. of  5 variables:
##  $ id    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ exper : num  -0.87 1.503 0.642 NA -0.712 ...
##  $ gender: int  1 2 1 2 1 2 1 2 1 2 ...
##  $ educ  : int  NA 3 2 NA 3 5 5 2 5 5 ...
##  $ wage  : num  NA 6.5486 0.0909 NA -1.4579 ...
head(dt)
##   id      exper gender educ        wage
## 1  1 -0.8699983      1   NA          NA
## 2  2  1.5026771      2    3  6.54857412
## 3  3  0.6415719      1    2  0.09093394
## 4  4         NA      2   NA          NA
## 5  5 -0.7122940      1    3 -1.45789677
## 6  6  2.0612542      2    5  8.11485313
tail(dt)
##        id       exper gender educ       wage
## 995   995 -0.19416947      1    4 -0.3775560
## 996   996          NA      2    1         NA
## 997   997 -0.51001423      1    1 -0.3303003
## 998   998 -0.19601343      2    5  6.3701062
## 999   999 -0.01743695      1    5  1.1119297
## 1000 1000          NA      2    3         NA
table(dt$educ)
## 
##   1   2   3   4   5 
## 178 179 177 184 182
table(dt$gender, dt$educ)
##    
##       1   2   3   4   5
##   1  90  85  97 100  88
##   2  88  94  80  84  94
xtabs( ~ gender + educ, data = dt)
##       educ
## gender   1   2   3   4   5
##      1  90  85  97 100  88
##      2  88  94  80  84  94

Example 1

Missing values

md.pattern(dt)

##     id gender educ exper wage    
## 780  1      1    1     1    1   0
## 120  1      1    1     0    0   2
## 70   1      1    0     1    0   2
## 30   1      1    0     0    0   3
##      0      0  100   150  220 470

Descriptive statistics

summary(dt)
##        id             exper              gender         educ      
##  Min.   :   1.0   Min.   :-2.64866   Min.   :1.0   Min.   :1.000  
##  1st Qu.: 250.8   1st Qu.:-0.60161   1st Qu.:1.0   1st Qu.:2.000  
##  Median : 500.5   Median : 0.04126   Median :1.5   Median :3.000  
##  Mean   : 500.5   Mean   : 0.04952   Mean   :1.5   Mean   :3.014  
##  3rd Qu.: 750.2   3rd Qu.: 0.69551   3rd Qu.:2.0   3rd Qu.:4.000  
##  Max.   :1000.0   Max.   : 3.38499   Max.   :2.0   Max.   :5.000  
##                   NA's   :150                      NA's   :100    
##       wage        
##  Min.   :-1.8845  
##  1st Qu.: 0.9648  
##  Median : 2.1786  
##  Mean   : 2.8972  
##  3rd Qu.: 4.7516  
##  Max.   :10.8053  
##  NA's   :220
summary(subset(dt, gender == 1))
##        id            exper              gender       educ      
##  Min.   :  1.0   Min.   :-2.64866   Min.   :1   Min.   :1.000  
##  1st Qu.:250.5   1st Qu.:-0.58780   1st Qu.:1   1st Qu.:2.000  
##  Median :500.0   Median : 0.05770   Median :1   Median :3.000  
##  Mean   :500.0   Mean   : 0.05215   Mean   :1   Mean   :3.024  
##  3rd Qu.:749.5   3rd Qu.: 0.66337   3rd Qu.:1   3rd Qu.:4.000  
##  Max.   :999.0   Max.   : 3.38499   Max.   :1   Max.   :5.000  
##                  NA's   :50                     NA's   :40     
##       wage        
##  Min.   :-1.8845  
##  1st Qu.: 0.3509  
##  Median : 1.0158  
##  Mean   : 0.9996  
##  3rd Qu.: 1.6446  
##  Max.   : 3.6328  
##  NA's   :90
summary(subset(dt, gender == 2))
##        id             exper              gender       educ      
##  Min.   :   2.0   Min.   :-2.57297   Min.   :2   Min.   :1.000  
##  1st Qu.: 251.5   1st Qu.:-0.61461   1st Qu.:2   1st Qu.:2.000  
##  Median : 501.0   Median : 0.03077   Median :2   Median :3.000  
##  Mean   : 501.0   Mean   : 0.04656   Mean   :2   Mean   :3.005  
##  3rd Qu.: 750.5   3rd Qu.: 0.74073   3rd Qu.:2   3rd Qu.:4.000  
##  Max.   :1000.0   Max.   : 3.11620   Max.   :2   Max.   :5.000  
##                   NA's   :100                    NA's   :60     
##       wage        
##  Min.   : 0.4935  
##  1st Qu.: 3.6629  
##  Median : 4.8601  
##  Mean   : 4.9998  
##  3rd Qu.: 6.4615  
##  Max.   :10.8053  
##  NA's   :130

Example 1

Correlations

# Correlations - variables 2-4
cor(dt[, 2:5])
##        exper gender educ wage
## exper      1     NA   NA   NA
## gender    NA      1   NA   NA
## educ      NA     NA    1   NA
## wage      NA     NA   NA    1
cor(dt[, 2:5], use = "complete.obs")
##              exper        gender         educ       wage
## exper   1.00000000 -0.0104530077 0.0236341481 0.09952051
## gender -0.01045301  1.0000000000 0.0009589141 0.80603867
## educ    0.02363415  0.0009589141 1.0000000000 0.27049172
## wage    0.09952051  0.8060386732 0.2704917170 1.00000000
cor(dt[, 2:5], use = "pairwise.complete.obs")
##               exper       gender         educ       wage
## exper   1.000000000 -0.002892561  0.023634148 0.09952051
## gender -0.002892561  1.000000000 -0.006840444 0.80603867
## educ    0.023634148 -0.006840444  1.000000000 0.27049172
## wage    0.099520506  0.806038673  0.270491717 1.00000000
cor(dt[, 2:5], use = "pairwise.complete.obs", method = "kendall")
##              exper       gender         educ       wage
## exper  1.000000000  0.006545028  0.018319032 0.04909647
## gender 0.006545028  1.000000000 -0.005881101 0.67278394
## educ   0.018319032 -0.005881101  1.000000000 0.14863020
## wage   0.049096475  0.672783940  0.148630199 1.00000000
cor(dt[, 2:5], use = "pairwise.complete.obs", method = "spearman")
##              exper       gender         educ       wage
## exper  1.000000000  0.008011278  0.025838438 0.07262175
## gender 0.008011278  1.000000000 -0.006575184 0.82346099
## educ   0.025838438 -0.006575184  1.000000000 0.19371484
## wage   0.072621750  0.823460990  0.193714836 1.00000000

Example 1

Plot all data all together

library(GGally)
ggpairs(dt)

Example 1: Regression model

Determinants of wages

model1 <- lm(wage ~ gender + educ + exper, data = dt)
summary(model1)
## 
## Call:
## lm(formula = wage ~ gender + educ + exper, data = dt)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4503 -0.8866  0.0170  0.8475  4.2938 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -4.42967    0.17362 -25.513  < 2e-16 ***
## gender       4.00421    0.09210  43.476  < 2e-16 ***
## educ         0.47069    0.03268  14.401  < 2e-16 ***
## exper        0.26214    0.04788   5.475 5.91e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.284 on 776 degrees of freedom
##   (220 observations deleted due to missingness)
## Multiple R-squared:  0.7328, Adjusted R-squared:  0.7317 
## F-statistic: 709.3 on 3 and 776 DF,  p-value: < 2.2e-16
str(model1)
## List of 13
##  $ coefficients : Named num [1:4] -4.43 4.004 0.471 0.262
##   ..- attr(*, "names")= chr [1:4] "(Intercept)" "gender" "educ" "exper"
##  $ residuals    : Named num [1:780] 1.164 -0.593 -2.258 1.642 1.748 ...
##   ..- attr(*, "names")= chr [1:780] "2" "3" "5" "6" ...
##  $ effects      : Named num [1:780] -80.91 55.79 -18.67 -7.03 1.87 ...
##   ..- attr(*, "names")= chr [1:780] "(Intercept)" "gender" "educ" "exper" ...
##  $ rank         : int 4
##  $ fitted.values: Named num [1:780] 5.385 0.684 0.8 6.473 1.87 ...
##   ..- attr(*, "names")= chr [1:780] "2" "3" "5" "6" ...
##  $ assign       : int [1:4] 0 1 2 3
##  $ qr           :List of 5
##   ..$ qr   : num [1:780, 1:4] -27.9285 0.0358 0.0358 0.0358 0.0358 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr [1:780] "2" "3" "5" "6" ...
##   .. .. ..$ : chr [1:4] "(Intercept)" "gender" "educ" "exper"
##   .. ..- attr(*, "assign")= int [1:4] 0 1 2 3
##   ..$ qraux: num [1:4] 1.04 1.04 1 1.07
##   ..$ pivot: int [1:4] 1 2 3 4
##   ..$ tol  : num 1e-07
##   ..$ rank : int 4
##   ..- attr(*, "class")= chr "qr"
##  $ df.residual  : int 776
##  $ na.action    : 'omit' Named int [1:220] 1 4 10 20 27 28 32 39 40 42 ...
##   ..- attr(*, "names")= chr [1:220] "1" "4" "10" "20" ...
##  $ xlevels      : Named list()
##  $ call         : language lm(formula = wage ~ gender + educ + exper, data = dt)
##  $ terms        :Classes 'terms', 'formula'  language wage ~ gender + educ + exper
##   .. ..- attr(*, "variables")= language list(wage, gender, educ, exper)
##   .. ..- attr(*, "factors")= int [1:4, 1:3] 0 1 0 0 0 0 1 0 0 0 ...
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:4] "wage" "gender" "educ" "exper"
##   .. .. .. ..$ : chr [1:3] "gender" "educ" "exper"
##   .. ..- attr(*, "term.labels")= chr [1:3] "gender" "educ" "exper"
##   .. ..- attr(*, "order")= int [1:3] 1 1 1
##   .. ..- attr(*, "intercept")= int 1
##   .. ..- attr(*, "response")= int 1
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. ..- attr(*, "predvars")= language list(wage, gender, educ, exper)
##   .. ..- attr(*, "dataClasses")= Named chr [1:4] "numeric" "numeric" "numeric" "numeric"
##   .. .. ..- attr(*, "names")= chr [1:4] "wage" "gender" "educ" "exper"
##  $ model        :'data.frame':   780 obs. of  4 variables:
##   ..$ wage  : num [1:780] 6.5486 0.0909 -1.4579 8.1149 3.6185 ...
##   ..$ gender: int [1:780] 2 1 1 2 1 2 1 1 2 1 ...
##   ..$ educ  : int [1:780] 3 2 3 5 5 2 5 4 3 2 ...
##   ..$ exper : num [1:780] 1.503 0.642 -0.712 2.061 -0.22 ...
##   ..- attr(*, "terms")=Classes 'terms', 'formula'  language wage ~ gender + educ + exper
##   .. .. ..- attr(*, "variables")= language list(wage, gender, educ, exper)
##   .. .. ..- attr(*, "factors")= int [1:4, 1:3] 0 1 0 0 0 0 1 0 0 0 ...
##   .. .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. .. ..$ : chr [1:4] "wage" "gender" "educ" "exper"
##   .. .. .. .. ..$ : chr [1:3] "gender" "educ" "exper"
##   .. .. ..- attr(*, "term.labels")= chr [1:3] "gender" "educ" "exper"
##   .. .. ..- attr(*, "order")= int [1:3] 1 1 1
##   .. .. ..- attr(*, "intercept")= int 1
##   .. .. ..- attr(*, "response")= int 1
##   .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. .. ..- attr(*, "predvars")= language list(wage, gender, educ, exper)
##   .. .. ..- attr(*, "dataClasses")= Named chr [1:4] "numeric" "numeric" "numeric" "numeric"
##   .. .. .. ..- attr(*, "names")= chr [1:4] "wage" "gender" "educ" "exper"
##   ..- attr(*, "na.action")= 'omit' Named int [1:220] 1 4 10 20 27 28 32 39 40 42 ...
##   .. ..- attr(*, "names")= chr [1:220] "1" "4" "10" "20" ...
##  - attr(*, "class")= chr "lm"

Example 1 - Exploratory visualizations

Plots

ggplot(dt, aes(x = exper)) + geom_histogram()

ggplot(dt, aes(x = exper)) + geom_density()

ggplot(dt, aes(x = exper)) + geom_density() +
 stat_function(fun = dnorm, args = list(mean = 0, sd = 1), col="blue")

ggplot(dt, aes(x = educ)) + geom_histogram()

ggplot(dt, aes(x = educ, y = wage)) + geom_point()

ggplot(dt, aes(x = educ, y = wage)) + geom_point(size=5)

ggplot(dt, aes(x = educ, y = wage)) + geom_point(size=5, alpha=0.1)

ggplot(dt, aes(x = educ, y = wage, color = as.factor(gender))) + geom_point(size=5, alpha=0.1)

# Bivariate plots
ggplot(dt, aes(x = exper, y = wage)) + geom_point()

ggplot(dt, aes(x = exper, y = wage)) + geom_point() + 
    geom_smooth(formula = y ~ x)

ggplot(dt, aes(x = exper, y = wage)) + geom_point() + 
    geom_smooth(formula = y ~ x, se = FALSE, method = "lm")

ggplot(dt, aes(x = exper, y = wage)) + geom_point() + facet_wrap(~ gender) + 
    geom_smooth(formula = y ~ x, se = FALSE, method = "lm")

ggplot(dt, aes(x = exper, y = wage)) + geom_point() + facet_wrap(gender ~ educ, nrow = 2) + 
    geom_smooth(formula = y ~ x, se = FALSE, method = "lm")

Example 1: Regression model

Determinants of wages

dt$educ <- as.factor(dt$educ)

model2 <- lm(wage ~ educ + exper + educ*exper, data = subset(dt, gender == 1))
summary(model2)
## 
## Call:
## lm(formula = wage ~ educ + exper + educ * exper, data = subset(dt, 
##     gender == 1))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.73114 -0.61771 -0.00812  0.62456  2.42288 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.07431    0.10991   9.774   <2e-16 ***
## educ2        0.03364    0.15597   0.216   0.8293    
## educ3       -0.28991    0.15232  -1.903   0.0577 .  
## educ4       -0.22512    0.15384  -1.463   0.1441    
## educ5        0.13026    0.15806   0.824   0.4103    
## exper        0.07463    0.11116   0.671   0.5024    
## educ2:exper -0.17990    0.14946  -1.204   0.2294    
## educ3:exper  0.14172    0.15716   0.902   0.3677    
## educ4:exper -0.02000    0.16431  -0.122   0.9032    
## educ5:exper -0.03407    0.17713  -0.192   0.8476    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9827 on 400 degrees of freedom
##   (90 observations deleted due to missingness)
## Multiple R-squared:  0.03855,    Adjusted R-squared:  0.01692 
## F-statistic: 1.782 on 9 and 400 DF,  p-value: 0.06979
model3 <- lm(wage ~ educ + exper + educ*exper, data = subset(dt, gender == 2))
summary(model3)
## 
## Call:
## lm(formula = wage ~ educ + exper + educ * exper, data = subset(dt, 
##     gender == 2))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.86437 -0.77670 -0.01112  0.75068  2.91145 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.00383    0.12360  24.303  < 2e-16 ***
## educ2        1.05102    0.17146   6.130 2.31e-09 ***
## educ3        2.03922    0.17726  11.504  < 2e-16 ***
## educ4        2.96237    0.17822  16.622  < 2e-16 ***
## educ5        3.87530    0.17191  22.543  < 2e-16 ***
## exper        0.05794    0.11777   0.492 0.623058    
## educ2:exper  0.23215    0.17072   1.360 0.174750    
## educ3:exper  0.61271    0.16890   3.628 0.000327 ***
## educ4:exper  0.86182    0.18209   4.733 3.19e-06 ***
## educ5:exper  1.04711    0.18331   5.712 2.34e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.055 on 360 degrees of freedom
##   (130 observations deleted due to missingness)
## Multiple R-squared:  0.6858, Adjusted R-squared:  0.6779 
## F-statistic: 87.31 on 9 and 360 DF,  p-value: < 2.2e-16

Explanatory visualization

Visualization process 1

Visualization process 2

Visualization process 2

Source: William Playfair’s Time Series of Exports and Imports of Denmark and Norway, Public Domain, https://commons.wikimedia.org/w/index.php?curid=12674977

Visualization process 2

Source: Michael Sandberg’s Data Visualization Blog, https://datavizblog.com/2013/05/26/dataviz-history-charles-minards-flow-map-of-napoleons-russian-campaign-of-1812-part-5/

Visualization process 3

Choose the graphical tool

Source: http://curleylab.psych.columbia.edu/netviz/netviz1.html

Visualization process 4

Design the components

Data visualization checklist

Data visualization checklist

Source: Stephanie Evergreen and Ann K. Emery, Data Visualization Checklist, http://AnnKEmery.com/blog

Visualization examples: Case 1

Bar chart vs pie chart

Visualization examples: Case 2

Breakfast preferences: Case 2

Breakfast preferences: Case 2

Source: Stephanie Evergreen and Ann K. Emery, Data Visualization Checklist, http://AnnKEmery.com/blog

Visualization examples: Case 3

Program evaluation: Case 3

Program evaluation: Case 3

Source: Angie Ficek Data Visualization Techniques: How to Tell Your Story to Make it Stick, http://www.cehd.umn.edu/olpd/mesi/spring/2015/ficek-datavis.pdf

ggplot basics

ggplot grammer

Example 2: ggplot components

geoms

geom’s define chart type. Variables should be map to aesthetics.

scales

Scales control the mapping between data and aesthetics.

Coordinate systems

Coordinate systems adjust the mapping from coordinates to the 2d plane of the computer screen.

Faceting

Facets display subsets of the dataset in different panels.

For more information, see ggplot2 web site: http://docs.ggplot2.org/current/

ggplot: Scatter chart

# Read the data
dt <- read.csv("https://raw.githubusercontent.com/etaymaz/econ413/master/Data/003_data.csv")

# Make it data table
dt <- data.table(dt)

# Scatter chart with 2 variables
ggplot(dt, aes(x = exper, y = wage)) + geom_point()

# Scatter chart with 3 variables - continuous color
ggplot(dt, aes(x = exper, y = wage, color = -educ)) + geom_point()

# Scatter chart with 3 variables - discrete color
ggplot(dt, aes(x = exper, y = wage, color = as.factor(educ))) + geom_point()

# Scatter chart with 4 variables
ggplot(dt, aes(x = exper, y = wage, color = as.factor(educ), shape = as.factor(gender))) + geom_point()

# Scatter chart with 4 variables and large shapes
ggplot(dt, aes(x = exper, y = wage, color = as.factor(educ), shape = as.factor(gender))) + 
  geom_point(size=3)

# Scatter chart with 5 variables 
ggplot(dt, aes(x = exper, y = wage, color = as.factor(educ), shape = as.factor(gender), alpha = educ)) + 
  geom_point(size=3)

# Scatter chart with 4 variables and large and transparent shapes
ggplot(dt, aes(x = exper, y = wage, color = as.factor(educ), shape = as.factor(gender))) + 
  geom_point(size=3, alpha = 0.5)

ggplot: Line chart

# Calculate mean values by gender and education
dtm <- dt[, list(mwage = mean(wage, na.rm = T), mexper = mean(exper, na.rm = T)), by = list(gender, educ)]

# Line chart with 3 variables
ggplot(dtm, aes(x = educ, y = mwage, color = as.factor(gender))) + geom_line()

# Line chart with 3 variables - thick line
ggplot(dtm, aes(x = educ, y = mwage, color = as.factor(gender))) + geom_line(size = 2)

# Line chart with 3 variables with points
ggplot(dtm, aes(x = educ, y = mwage, color = as.factor(gender))) + geom_line() + geom_point()

# Line chart with 3 variables with points - point size by mean experience
ggplot(dtm, aes(x = educ, y = mwage, color = as.factor(gender))) + geom_line() + geom_point(aes(size = mexper))

ggplot: Bar chart

# Bar chart with 1 variable
ggplot(dt, aes(x = educ)) + geom_bar()

# Bar chart with 2 variables
ggplot(dt, aes(x = educ, weight = wage)) + geom_bar() 

# Bar chart with 3 variables
ggplot(dt, aes(x = educ, weight = wage, fill = as.factor(educ))) + geom_bar() 

# Bar chart with 3 variables
ggplot(dt, aes(x = educ, weight = wage, fill = as.factor(gender))) + geom_bar() 

Example 2

Example 2: Download the data

# Download the data
pwt <- read_stata("http://www.rug.nl/ggdc/docs/pwt90.dta")

# Make the data a data.table object
pwt <- data.table(pwt)

Example 2: Check the variables

# Class
class(pwt)
## [1] "data.table" "data.frame"
# Variable names
names(pwt)
##  [1] "countrycode"   "country"       "currency_unit" "year"         
##  [5] "rgdpe"         "rgdpo"         "pop"           "emp"          
##  [9] "avh"           "hc"            "ccon"          "cda"          
## [13] "cgdpe"         "cgdpo"         "ck"            "ctfp"         
## [17] "cwtfp"         "rgdpna"        "rconna"        "rdana"        
## [21] "rkna"          "rtfpna"        "rwtfpna"       "labsh"        
## [25] "delta"         "xr"            "pl_con"        "pl_da"        
## [29] "pl_gdpo"       "i_cig"         "i_xm"          "i_xr"         
## [33] "i_outlier"     "cor_exp"       "statcap"       "csh_c"        
## [37] "csh_i"         "csh_g"         "csh_x"         "csh_m"        
## [41] "csh_r"         "pl_c"          "pl_i"          "pl_g"         
## [45] "pl_x"          "pl_m"          "pl_k"
# Dimension
dim(pwt)
## [1] 11830    47

Example 2: Keep main variables and check the data

# Select a subset of variables
pwts <- pwt[, list(countrycode, year, rgdpo, pop, emp, avh, hc, rgdpna, rkna)]

# Structure
str(pwts)
## Classes 'data.table' and 'data.frame':   11830 obs. of  9 variables:
##  $ countrycode: chr  "ABW" "ABW" "ABW" "ABW" ...
##   ..- attr(*, "label")= chr "3-letter ISO country code"
##   ..- attr(*, "format.stata")= chr "%9s"
##  $ year       : num  1950 1951 1952 1953 1954 ...
##   ..- attr(*, "label")= chr "Year"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ rgdpo      : num  NA NA NA NA NA NA NA NA NA NA ...
##   ..- attr(*, "label")= chr "Output-side real GDP at chained PPPs (in mil. 2011US$)"
##   ..- attr(*, "format.stata")= chr "%14.3g"
##  $ pop        : num  NA NA NA NA NA NA NA NA NA NA ...
##   ..- attr(*, "label")= chr "Population (in millions)"
##   ..- attr(*, "format.stata")= chr "%10.0g"
##  $ emp        : num  NA NA NA NA NA NA NA NA NA NA ...
##   ..- attr(*, "label")= chr "Number of persons engaged (in millions)"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ avh        : num  NA NA NA NA NA NA NA NA NA NA ...
##   ..- attr(*, "label")= chr "Average annual hours worked by persons engaged (source: The Conference Board)"
##   ..- attr(*, "format.stata")= chr "%10.0gc"
##  $ hc         : num  NA NA NA NA NA NA NA NA NA NA ...
##   ..- attr(*, "label")= chr "Human capital index, see note hc"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ rgdpna     : num  NA NA NA NA NA NA NA NA NA NA ...
##   ..- attr(*, "label")= chr "Real GDP at constant 2011 national prices (in mil. 2011US$)"
##   ..- attr(*, "format.stata")= chr "%14.3g"
##  $ rkna       : num  NA NA NA NA NA NA NA NA NA NA ...
##   ..- attr(*, "label")= chr "Capital stock at constant 2011 national prices (in mil. 2011US$)"
##   ..- attr(*, "format.stata")= chr "%14.3g"
##  - attr(*, ".internal.selfref")=<externalptr>
# Summary statistics
summary(pwts)
##  countrycode             year          rgdpo               pop           
##  Length:11830       Min.   :1950   Min.   :       1   Min.   :   0.0044  
##  Class :character   1st Qu.:1966   1st Qu.:    6045   1st Qu.:   1.5934  
##  Mode  :character   Median :1982   Median :   24874   Median :   5.9857  
##                     Mean   :1982   Mean   :  250855   Mean   :  30.0906  
##                     3rd Qu.:1998   3rd Qu.:  132475   3rd Qu.:  19.5332  
##                     Max.   :2014   Max.   :17135952   Max.   :1369.4357  
##                                    NA's   :2391       NA's   :2391       
##       emp               avh             hc            rgdpna        
##  Min.   :  0.001   Min.   :1363   Min.   :1.007   Min.   :       8  
##  1st Qu.:  0.942   1st Qu.:1816   1st Qu.:1.408   1st Qu.:    6277  
##  Median :  2.976   Median :1979   Median :1.917   Median :   29303  
##  Mean   : 14.219   Mean   :1996   Mean   :2.033   Mean   :  277098  
##  3rd Qu.:  8.404   3rd Qu.:2177   3rd Qu.:2.609   3rd Qu.:  167537  
##  Max.   :798.368   Max.   :3042   Max.   :3.734   Max.   :17150538  
##  NA's   :3586      NA's   :8511   NA's   :3963    NA's   :2391      
##       rkna         
##  Min.   :      31  
##  1st Qu.:   16958  
##  Median :   85975  
##  Mean   :  899559  
##  3rd Qu.:  454699  
##  Max.   :67590072  
##  NA's   :2421
# Missing observations
md.pattern(pwts)

##      countrycode year rgdpo  pop rgdpna rkna  emp   hc  avh      
## 3289           1    1     1    1      1    1    1    1    1     0
## 4006           1    1     1    1      1    1    1    1    0     1
## 30             1    1     1    1      1    1    1    0    1     1
## 919            1    1     1    1      1    1    1    0    0     2
## 572            1    1     1    1      1    1    0    1    0     2
## 593            1    1     1    1      1    1    0    0    0     3
## 30             1    1     1    1      1    0    0    0    0     4
## 2391           1    1     0    0      0    0    0    0    0     7
##                0    0  2391 2391   2391 2421 3586 3963 8511 25654

Example 2: Exploratory visualization

# Generate GDP per capita variable - 000 USD
pwts[, gdpc := (rgdpo / pop) / 1000]

# Check the data
summary(pwts)
##  countrycode             year          rgdpo               pop           
##  Length:11830       Min.   :1950   Min.   :       1   Min.   :   0.0044  
##  Class :character   1st Qu.:1966   1st Qu.:    6045   1st Qu.:   1.5934  
##  Mode  :character   Median :1982   Median :   24874   Median :   5.9857  
##                     Mean   :1982   Mean   :  250855   Mean   :  30.0906  
##                     3rd Qu.:1998   3rd Qu.:  132475   3rd Qu.:  19.5332  
##                     Max.   :2014   Max.   :17135952   Max.   :1369.4357  
##                                    NA's   :2391       NA's   :2391       
##       emp               avh             hc            rgdpna        
##  Min.   :  0.001   Min.   :1363   Min.   :1.007   Min.   :       8  
##  1st Qu.:  0.942   1st Qu.:1816   1st Qu.:1.408   1st Qu.:    6277  
##  Median :  2.976   Median :1979   Median :1.917   Median :   29303  
##  Mean   : 14.219   Mean   :1996   Mean   :2.033   Mean   :  277098  
##  3rd Qu.:  8.404   3rd Qu.:2177   3rd Qu.:2.609   3rd Qu.:  167537  
##  Max.   :798.368   Max.   :3042   Max.   :3.734   Max.   :17150538  
##  NA's   :3586      NA's   :8511   NA's   :3963    NA's   :2391      
##       rkna               gdpc         
##  Min.   :      31   Min.   :   0.133  
##  1st Qu.:   16958   1st Qu.:   2.019  
##  Median :   85975   Median :   5.410  
##  Mean   :  899559   Mean   :  19.478  
##  3rd Qu.:  454699   3rd Qu.:  13.703  
##  Max.   :67590072   Max.   :4447.840  
##  NA's   :2421       NA's   :2391
# Find the outlier
pwts[gdpc > 1000]
##     countrycode year     rgdpo      pop       emp avh hc   rgdpna
##  1:         BMU 1970 118924.79 0.052286        NA  NA NA 1634.187
##  2:         BMU 1971 214744.45 0.052857        NA  NA NA 1690.362
##  3:         BMU 1972 168675.36 0.053426        NA  NA NA 1721.003
##  4:         BMU 1973 163793.48 0.053980        NA  NA NA 1746.537
##  5:         BMU 1974 148884.08 0.054507        NA  NA NA 1761.858
##  6:         BMU 1975 158232.27 0.054994        NA  NA NA 1828.246
##  7:         BMU 1976 194509.84 0.055439        NA  NA NA 1991.665
##  8:         BMU 1977 182189.16 0.055851        NA  NA NA 2042.242
##  9:         BMU 1978 179753.22 0.056237        NA  NA NA 2083.601
## 10:         BMU 1979 180668.77 0.056613        NA  NA NA 2220.414
## 11:         BMU 1980 154140.38 0.056990        NA  NA NA 2304.830
## 12:         BMU 1981 150503.89 0.057370        NA  NA NA 2223.688
## 13:         BMU 1982 141148.67 0.057752        NA  NA NA 2225.750
## 14:         BMU 1983 146613.58 0.058138        NA  NA NA 2245.035
## 15:         BMU 1984 155851.80 0.058528        NA  NA NA 2199.310
## 16:         BMU 1985 119540.04 0.058923        NA  NA NA 2292.701
## 17:         BMU 1986 169751.81 0.059324 0.0334450  NA NA 2394.340
## 18:         BMU 1987 265669.47 0.059730 0.0352440  NA NA 2489.551
## 19:         BMU 1988 182990.47 0.060138 0.0364200  NA NA 2522.420
## 20:         BMU 1989  96493.66 0.060539 0.0362370  NA NA 2525.573
## 21:         BMU 1990 168599.41 0.060931 0.0355730  NA NA 2458.744
## 22:         BMU 1991 179122.67 0.061311 0.0346210  NA NA 2465.172
## 23:         BMU 1992 146723.64 0.061680 0.0336500  NA NA 2397.979
## 24:         BMU 1993 156847.22 0.062036 0.0334270  NA NA 2456.076
## 25:         BMU 1995 135433.84 0.062694 0.0341330  NA NA 2629.517
## 26:         BMU 1996 113023.91 0.062989 0.0346330  NA NA 2731.399
## 27:         BMU 1997 183042.34 0.063257 0.0352960  NA NA 2857.200
## 28:         BMU 2000  80290.86 0.064035 0.0374475  NA NA 3161.136
## 29:         BMU 2002 134871.80 0.064614 0.0378150  NA NA 3339.322
##     countrycode year     rgdpo      pop       emp avh hc   rgdpna
##          rkna     gdpc
##  1:  5616.710 2274.505
##  2:  5655.335 4062.744
##  3:  5728.377 3157.177
##  4:  5789.289 3034.337
##  5:  5849.582 2731.467
##  6:  5927.734 2877.264
##  7:  5968.023 3508.538
##  8:  6053.705 3262.057
##  9:  6163.846 3196.351
## 10:  6307.740 3191.295
## 11:  6537.652 2704.692
## 12:  6706.583 2623.390
## 13:  6959.545 2444.048
## 14:  7217.527 2521.820
## 15:  7533.817 2662.859
## 16:  7784.550 2028.750
## 17:  7934.764 2861.436
## 18:  8121.300 4447.840
## 19:  8423.078 3042.843
## 20:  8607.367 1593.909
## 21:  8666.035 2767.055
## 22:  8659.007 2921.542
## 23:  8664.764 2378.788
## 24:  8685.491 2528.326
## 25:  8728.345 2160.236
## 26:  8880.891 1794.344
## 27:  9099.609 2893.630
## 28: 10313.338 1253.859
## 29: 10428.872 2087.346
##          rkna     gdpc
# Drop the outlier
pwts <- pwts[!countrycode == "BMU"]

# Plot the GDP per capita data for all countries
# Define labels
clabs <-   labs(title = "GDP per capita",
                subtitle = "All countries",
                x = "", y = "Real GDP, PPP, 2011 prices",
                caption = "Source: Penn World Tables 9.0")

ggplot(data = pwts, mapping = aes(x = year, y = gdpc, color = countrycode)) +
  geom_line() + 
  clabs

# Remove legends
ggplot(data = pwts, mapping = aes(x = year, y = gdpc, color = countrycode)) +
  geom_line() + 
  theme(legend.position="none") +
  clabs

# Makegdpc logarithmic
ggplot(data = pwts, mapping = aes(x = year, y = gdpc, color = countrycode)) +
  geom_line() + 
  theme(legend.position="none") +
  scale_y_log10() +
  clabs

# Plot for a set of countries
clist <- c("TUR", "USA", "IND", "CHN", "KOR", "ESP")
pwtc <- subset(pwts, countrycode %in% clist)

ggplot(data = pwtc, mapping = aes(x = year, y = gdpc)) +
  geom_line() + 
  scale_y_log10() +
  facet_wrap(~ countrycode, nrow = 3, ncol = 3) +
  clabs

ggplot(data = pwtc, mapping = aes(x = year, y = gdpc, color = countrycode)) +
  geom_line() + 
  scale_y_log10() +
  clabs

Example 2: Scatter plot

# Labels
clabs <- labs(title = "GDP-human capital relation",
              subtitle = "All countries",
              x = "Human capital index", y = "Real GDP, PPP, 2011 prices",
              caption = "Source: Penn World Tables 9.0")

ggplot(data = pwts, mapping = aes(x = hc, y = gdpc, color = -year)) +
  geom_point() + 
  scale_y_log10() +
  clabs

# GDP per capita and education - selected years
ylist <- seq(1950, 2010, by = 10)

ggplot(data = subset(pwts, year %in% ylist), mapping = aes(x = hc, y = gdpc)) +
  geom_point() + 
  facet_wrap(~ year) +
  scale_y_log10() +
  clabs

# Add trend lines
ggplot(data = subset(pwts, year %in% ylist), mapping = aes(x = hc, y = gdpc)) +
  geom_point() + 
  geom_smooth(method = "lm", formula = y ~ x, , se = FALSE) + 
  facet_wrap(~ year) +
  scale_y_log10() +
  clabs

Example 2: Themes

# Define labels
clabs <-   labs(title = "GDP per capita",
                subtitle = "Selected countries",
                x = "", y = "Real GDP, PPP, 2011 prices",
                caption = "Source: Penn World Tables 9.0")

# Default plot
p1 <- ggplot(data = pwtc, mapping = aes(x = year, y = gdpc, color = countrycode)) +
  geom_line() + 
  scale_y_log10() +
  clabs

# Minimal theme
p1 + theme_minimal()

# BW theme
p1 + theme_bw()

# Classic theme
p1 + theme_classic()

# Economist theme
p1 + theme_economist()

# WSJ theme
p1 + theme_wsj()

Example 2: Changing themes

Example 2: Changing themes

# Default plot
p1 <- ggplot(data = pwtc, mapping = aes(x = year, y = gdpc, color = countrycode)) +
  geom_line() + theme_classic() + clabs
p1

# Add points
p1 + geom_point()

# Add a trend line
p1 + geom_smooth(method = "lm", formula = y ~ x, se = FALSE)

# Add a quadratic trend line
p1 + geom_smooth(method = "lm", formula = y ~ poly(x, 2), se = FALSE)

# Make line thicker
p1 <- p1 + geom_line(size = 1.5)
p1

# Change legend title
p1 <- p1 + scale_color_discrete(name = "Countries") 
p1

# Make title larger
p1 <- p1 +  theme(plot.title = element_text(size = rel(3)))
p1

# Make subtitle larger
p1 <- p1 +  theme(plot.subtitle = element_text(size = rel(2)))
p1

# Make caption left justified and larger
p1 <- p1 +  theme(plot.caption = element_text(hjust = 0, size = rel(1.5)))
p1

# Make caption gray
p1 <- p1 +  theme(plot.caption = element_text(color = "gray40"))
p1

# Add vertical line for crises years
p1 <- p1 +  geom_vline(xintercept = c(1980, 1994, 2001, 2009), size = 4, alpha = 0.25) 
p1

# Change legend position
p1 <- p1 +  theme(legend.position = c(0.15, 0.75))
p1

# Add text
p1 + annotate("text", x=1980, y=50, label="First crisis") 

# Add formula
p1 + annotate("text", x=1965, y=45, parse=TRUE, label="frac(1, sqrt(2 * pi)) * e ^ {-x^2 / 2}") 

Example 2: Make your own theme

theme_econ413 <- function (base_size = 12, base_family = "Comic Sans MS") {
  theme(rect = element_rect(fill = "#FFFFFF", linetype = 0, colour = NA), 
        text = element_text(size = base_size, family = base_family), 
        axis.title.x = element_text(hjust = 0.5, size = rel(0.85)), 
        axis.title.y = element_text(hjust = 0.5, size = rel(0.85)), 
        axis.text = element_text(colour = "black"), 
        axis.line = element_line(colour = "black", size = rel(0.3)), 
        legend.text = element_text(colour = "black", size= rel(.85)), 
        panel.grid.major.y = element_line(color = "gray", size = rel(0.3)), 
        panel.grid.major.x = element_blank(), 
        panel.grid.minor.y = element_blank(), 
        panel.grid.minor.x = element_blank(), 
        panel.border = element_blank(), 
        panel.background = element_blank(), 
        legend.key = element_rect(fill = "#FFFFFF00"))
}

p1 + theme_econ413()

Must see: The best stats you’ve ever seen

Hans Rosling’s TED talk on development:

https://www.ted.com/talks/hans_rosling_shows_the_best_stats_you_ve_ever_seen

or

https://www.youtube.com/watch?v=hVimVzgtD6w

Learning outcomes