ECON 413
Data types and data objects

Erol Taymaz
Department of Economics
Middle East Technical University

Topics

Use R for

Installing R

Using R

Using R

Objects

Using R

Using R

Using R

Using R

Using R

Using RStudio

RStudio shortcuts

Installing R packages

# Install a package from CRAN; the package name must be a quoted string
install.packages("ggplot2")
# Update every installed package that has a newer version available
update.packages()
# Update only ggplot2: the first positional argument of update.packages() is
# lib.loc, so the package has to be named through oldPkgs
update.packages(oldPkgs = "ggplot2")
# Load the package for the current session
library(ggplot2)   # Signals an error if the package is not installed
require(ggplot2)   # Returns FALSE with a warning if the package is not installed

R as calculator

R objects

Everything in R is an object

# A data vector is an object
a <- c(1:5)
a
## [1] 1 2 3 4 5
sum(a)
## [1] 15
# A function is an object too: typing its name without parentheses prints it
sum
## function (..., na.rm = FALSE)  .Primitive("sum")
# rnorm() draws random numbers and no seed is set here, so the values printed
# below will differ from run to run
a <- rnorm(100)
b <- a + rnorm(100)
# A fitted regression model is stored as an object as well
model_1 <- lm(a ~ b)
model_1
## 
## Call:
## lm(formula = a ~ b)
## 
## Coefficients:
## (Intercept)            b  
##     0.01398      0.51570
str(model_1)
## List of 12
##  $ coefficients : Named num [1:2] 0.014 0.516
##   ..- attr(*, "names")= chr [1:2] "(Intercept)" "b"
##  $ residuals    : Named num [1:100] 0.1403 -1.1579 -1.0822 0.4535 0.0618 ...
##   ..- attr(*, "names")= chr [1:100] "1" "2" "3" "4" ...
##  $ effects      : Named num [1:100] 0.093 7.815 -1.181 0.464 0.071 ...
##   ..- attr(*, "names")= chr [1:100] "(Intercept)" "b" "" "" ...
##  $ rank         : int 2
##  $ fitted.values: Named num [1:100] 1.0367 -0.1663 0.682 -0.0744 -0.0659 ...
##   ..- attr(*, "names")= chr [1:100] "1" "2" "3" "4" ...
##  $ assign       : int [1:2] 0 1
##  $ qr           :List of 5
##   ..$ qr   : num [1:100, 1:2] -10 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr [1:100] "1" "2" "3" "4" ...
##   .. .. ..$ : chr [1:2] "(Intercept)" "b"
##   .. ..- attr(*, "assign")= int [1:2] 0 1
##   ..$ qraux: num [1:2] 1.1 1.03
##   ..$ pivot: int [1:2] 1 2
##   ..$ tol  : num 1e-07
##   ..$ rank : int 2
##   ..- attr(*, "class")= chr "qr"
##  $ df.residual  : int 98
##  $ xlevels      : Named list()
##  $ call         : language lm(formula = a ~ b)
##  $ terms        :Classes 'terms', 'formula'  language a ~ b
##   .. ..- attr(*, "variables")= language list(a, b)
##   .. ..- attr(*, "factors")= int [1:2, 1] 0 1
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:2] "a" "b"
##   .. .. .. ..$ : chr "b"
##   .. ..- attr(*, "term.labels")= chr "b"
##   .. ..- attr(*, "order")= int 1
##   .. ..- attr(*, "intercept")= int 1
##   .. ..- attr(*, "response")= int 1
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. ..- attr(*, "predvars")= language list(a, b)
##   .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
##   .. .. ..- attr(*, "names")= chr [1:2] "a" "b"
##  $ model        :'data.frame':   100 obs. of  2 variables:
##   ..$ a: num [1:100] 1.177 -1.3242 -0.4002 0.3791 -0.0041 ...
##   ..$ b: num [1:100] 1.983 -0.35 1.295 -0.171 -0.155 ...
##   ..- attr(*, "terms")=Classes 'terms', 'formula'  language a ~ b
##   .. .. ..- attr(*, "variables")= language list(a, b)
##   .. .. ..- attr(*, "factors")= int [1:2, 1] 0 1
##   .. .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. .. ..$ : chr [1:2] "a" "b"
##   .. .. .. .. ..$ : chr "b"
##   .. .. ..- attr(*, "term.labels")= chr "b"
##   .. .. ..- attr(*, "order")= int 1
##   .. .. ..- attr(*, "intercept")= int 1
##   .. .. ..- attr(*, "response")= int 1
##   .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. .. ..- attr(*, "predvars")= language list(a, b)
##   .. .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
##   .. .. .. ..- attr(*, "names")= chr [1:2] "a" "b"
##  - attr(*, "class")= chr "lm"
# model_1 is a list (see the str() output), so its components are extracted
# with $
model_1$coefficients
## (Intercept)           b 
##  0.01397907  0.51569589
# The residuals average to zero up to floating-point rounding error
mean(model_1$residuals)
## [1] -2.029626e-18
# summary() adapts to the class of its argument: quartiles and the mean for a
# numeric vector, a regression table for an lm object
summary(a)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -3.004508 -0.649363 -0.063774 -0.009299  0.768086  3.326006
summary(model_1)
## 
## Call:
## lm(formula = a ~ b)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.28397 -0.31069 -0.00959  0.38310  1.96025 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.01398    0.06674   0.209    0.835    
## b            0.51570    0.04402  11.715   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6671 on 98 degrees of freedom
## Multiple R-squared:  0.5834, Adjusted R-squared:  0.5791 
## F-statistic: 137.2 on 1 and 98 DF,  p-value: < 2.2e-16

Data modes

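All objects have a “mode” (type of information). Atomic “modes” are the basic building blocks for data objects in R. There are 6 atomic modes: logical, integer, double, complex, character and raw (mode() reports both integer and double as "numeric").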

# mode() reports the basic type of information an object holds
a <- 5
mode(a)
## [1] "numeric"
b <- "a"
mode(b)
## [1] "character"
# A variable named c does not break later calls such as c(1:5): when a name is
# used in call position R looks for a function and skips non-function objects
c <- TRUE
mode(c)
## [1] "logical"
# Vectors have the same mode as their elements
A <- c(1:5)
mode(A)
## [1] "numeric"
B <- c("a", "b", "c")
mode(B)
## [1] "character"
# TRUE/FALSE are written out in full: T and F are ordinary variables that can
# be reassigned, so the shorthand is not safe
C <- c(TRUE, FALSE, TRUE)
mode(C)
## [1] "logical"
# Mixing types in c() coerces all elements to one common type:
# numbers combined with strings become strings
D <- c("a", "b", "c", 12, 24)
mode(D)
## [1] "character"
# ... and logicals combined with numbers become numbers (TRUE -> 1, FALSE -> 0)
E <- c(3, 5, TRUE, FALSE, TRUE)
mode(E)
## [1] "numeric"
# Functions are objects too and have their own mode
mode(lm)
## [1] "function"
rm(list=ls())

Object class

All objects belong to one or more classes. There is no limit on the number of classes.

The class of an object defines how the object will be treated by functions.

# class() is more specific than mode(): 1:5 has mode "numeric" but class
# "integer"
A <- c(1:5)
class(A)
## [1] "integer"
B <- c("a", "b", "c")
class(B)
## [1] "character"
# TRUE/FALSE are written out in full because T and F can be reassigned
C <- c(TRUE, FALSE, TRUE)
class(C)
## [1] "logical"
# Mixed input is coerced to a single common class
D <- c("a", "b", "c", 12, 24)
class(D)
## [1] "character"
E <- c(3, 5, TRUE, FALSE, TRUE)
class(E)
## [1] "numeric"
# The data frame is called DF rather than F so that the built-in shorthand
# F (FALSE) is not masked
DF <- data.frame(a = c(1:5), b = rnorm(5), d = c("a", "b", "c", "d", "e"))
class(DF)
## [1] "data.frame"
# Each column of a data frame keeps its own class
class(DF$a)
## [1] "integer"
# byrow = TRUE fills the matrix row by row
M <- matrix(c(1:15), nrow = 5, ncol = 3, byrow = TRUE)
mode(M)
## [1] "numeric"
class(M)
## [1] "matrix" "array"
# A matrix holds a single type: one string turns every element into a string
M2 <- matrix(c(1:5, "A"), nrow = 3, ncol = 2, byrow = FALSE)
mode(M2)
## [1] "character"
class(M2)
## [1] "matrix" "array"
rm(list=ls())

Special values

a <- c(1, 2)
a
## [1] 1 2
# Assigning to an index past the end extends the vector
a[3] <- 3
# NA: missing value
a[4] <- NA
# Inf: positive infinity
a[5] <- 1 / 0
# -Inf: negative infinity
a[6] <- -1 / 0
# NaN: "not a number", the result of an undefined operation
a[7] <- 0 / 0

a
## [1]    1    2    3   NA  Inf -Inf  NaN

Data objects

Vectors

c, rep, seq, sample and runif functions

# c() combines its arguments into a vector
a <- c(1, 2, 4)
# The L suffix creates integer values instead of doubles
b <- c(1L, 2L, 4L)
# T and F are shorthand for TRUE and FALSE; they are ordinary variables that
# can be reassigned, so the full names are safer in real code
c <- c(TRUE, FALSE, T, F)
d <- c("This", "That")

class(a)
## [1] "numeric"
# str() gives a compact description: type, index range and first values
str(a)
##  num [1:3] 1 2 4
summary(a)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.500   2.000   2.333   3.000   4.000
# The colon operator builds increasing or decreasing integer sequences
a <- c(1:5)
b <- c(5:1)
# c() also concatenates existing vectors
ab <- c(a,b)
a
## [1] 1 2 3 4 5
b
## [1] 5 4 3 2 1
ab
##  [1] 1 2 3 4 5 5 4 3 2 1
# times repeats the whole vector; each repeats every element in place
a <- rep(c(1:2), times = 5)
b <- rep(c(1:2), each = 3)
# With both, each is applied first and the result is then repeated `times` times
d <- rep(c(1:2), times = 2, each = 3)
# length.out recycles the vector until the requested length is reached; the
# argument name is spelled out instead of relying on partial matching (len)
e <- rep(c(1:2), length.out = 5)
a
##  [1] 1 2 1 2 1 2 1 2 1 2
b
## [1] 1 1 1 2 2 2
d
##  [1] 1 1 1 2 2 2 1 1 1 2 2 2
e
## [1] 1 2 1 2 1
# seq() builds regular sequences; from and to may be given by name or by
# position, so the next two lines create the same vector
a <- seq(from = 1, to = 2, by = .1)
a <- seq(1, 2, by = .1)
# length.out fixes the number of points instead of the step size
b <- seq(1, 2, length.out = 7)

a
##  [1] 1.0 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 2.0
b
## [1] 1.000000 1.166667 1.333333 1.500000 1.666667 1.833333 2.000000
# sample() draws at random: with replacement values can repeat, without
# replacement every value is drawn at most once
a <- sample(c(1:5), size = 10, replace = TRUE)
b <- sample(c(1:10), size = 5, replace = FALSE)
a
##  [1] 5 5 5 2 2 2 5 4 1 3
b
## [1] 7 1 9 3 5
# runif() draws uniform random numbers; without a seed the result changes
# every time the code is run
a <- runif(10)
a
##  [1] 0.45509973 0.12447201 0.18775116 0.07959337 0.49973874 0.98727218
##  [7] 0.52644573 0.76917827 0.68505859 0.13331159
# set.seed() fixes the state of the random number generator, so the draw
# below is reproducible
set.seed(123)
a <- runif(10)
a
##  [1] 0.2875775 0.7883051 0.4089769 0.8830174 0.9404673 0.0455565 0.5281055
##  [8] 0.8924190 0.5514350 0.4566147
rm(list=ls())
# vector() creates a vector of a given mode and length; numeric vectors are
# filled with zeros
aa <- vector(mode = "numeric", length = 10)
aa
##  [1] 0 0 0 0 0 0 0 0 0 0
# numeric() is a shortcut that gives the same result
bb <- numeric(length = 10)
bb
##  [1] 0 0 0 0 0 0 0 0 0 0
identical(aa, bb)
## [1] TRUE
# character() fills the vector with empty strings
cc <- character(length = 10)
cc
##  [1] "" "" "" "" "" "" "" "" "" ""
# Empty strings cannot be read as numbers, so the conversion yields NA
dd <- as.numeric(cc)
dd
##  [1] NA NA NA NA NA NA NA NA NA NA

Indexing with vectors

a <- c(1, 2, 4)
# Indexing starts at 1
a[1]
## [1] 1
# Reading an element past the end of the vector returns NA
a[c(1, 4)]
## [1]  1 NA
# Assigning past the end extends the vector
a[4] <- 40
a[c(1, 4)]
## [1]  1 40
# Any gap created by the extension is filled with NA
a[10] <- 10
a
##  [1]  1  2  4 40 NA NA NA NA NA 10

Lists

# A list can hold elements of different types and lengths
a <- list(a=c(1:10), b = c("a"), c = c(TRUE, FALSE, TRUE, FALSE))
a
## $a
##  [1]  1  2  3  4  5  6  7  8  9 10
## 
## $b
## [1] "a"
## 
## $c
## [1]  TRUE FALSE  TRUE FALSE
summary(a)
##   Length Class  Mode     
## a 10     -none- numeric  
## b  1     -none- character
## c  4     -none- logical
# Single brackets return a list that contains the selected element
a[1]
## $a
##  [1]  1  2  3  4  5  6  7  8  9 10
# Double brackets return the element itself
a[[1]]
##  [1]  1  2  3  4  5  6  7  8  9 10
class(a[1])
## [1] "list"
class(a[[1]])
## [1] "integer"
# Extract an element with [[ ]] first, then index inside it with [ ]
a[[1]][3]
## [1] 3
a[[3]][1]
## [1] TRUE

Matrices

# matrix() fills column by column unless byrow = TRUE is given
a <- matrix(1:15, ncol = 3, nrow = 5)
b <- matrix(c("a", "b", "c", "d", "e", "f"), ncol = 3, nrow = 2)

a
##      [,1] [,2] [,3]
## [1,]    1    6   11
## [2,]    2    7   12
## [3,]    3    8   13
## [4,]    4    9   14
## [5,]    5   10   15
b
##      [,1] [,2] [,3]
## [1,] "a"  "c"  "e" 
## [2,] "b"  "d"  "f"
class(a)
## [1] "matrix" "array"
str(a)
##  int [1:5, 1:3] 1 2 3 4 5 6 7 8 9 10 ...
# For a matrix, summary() reports each column separately
summary(a)
##        V1          V2           V3    
##  Min.   :1   Min.   : 6   Min.   :11  
##  1st Qu.:2   1st Qu.: 7   1st Qu.:12  
##  Median :3   Median : 8   Median :13  
##  Mean   :3   Mean   : 8   Mean   :13  
##  3rd Qu.:4   3rd Qu.: 9   3rd Qu.:14  
##  Max.   :5   Max.   :10   Max.   :15
a
##      [,1] [,2] [,3]
## [1,]    1    6   11
## [2,]    2    7   12
## [3,]    3    8   13
## [4,]    4    9   14
## [5,]    5   10   15
# [rows, columns]: an empty position selects everything in that dimension
a[c(1,3),]
##      [,1] [,2] [,3]
## [1,]    1    6   11
## [2,]    3    8   13
# A single index without a comma treats the matrix as one long vector in
# column order: this picks elements 1 and 15
a[c(1,15)]
## [1]  1 15
# Selecting a single column returns a plain vector
a[, 2]
## [1]  6  7  8  9 10
# Rows 1 and 3 of column 2
a[c(1,3), 2]
## [1] 6 8

Arrays

An array is a multidimensional object. A matrix is a two-dimensional (n x m) array.

# An array with two dimensions is a matrix
aa <- array(c(1:12), dim = c(6, 2))
aa
##      [,1] [,2]
## [1,]    1    7
## [2,]    2    8
## [3,]    3    9
## [4,]    4   10
## [5,]    5   11
## [6,]    6   12
class(aa)
## [1] "matrix" "array"
bb <- array(c(1:24), dim = c(4, 3, 2))
# dim = c(4, 3, 2): two 4x3 matrices stacked along a third dimension
bb
## , , 1
## 
##      [,1] [,2] [,3]
## [1,]    1    5    9
## [2,]    2    6   10
## [3,]    3    7   11
## [4,]    4    8   12
## 
## , , 2
## 
##      [,1] [,2] [,3]
## [1,]   13   17   21
## [2,]   14   18   22
## [3,]   15   19   23
## [4,]   16   20   24
# With three dimensions the class is "array" only
class(bb)
## [1] "array"
# One index per dimension: row 3, column 2 of the first 4x3 matrix
bb[3, 2, 1]
## [1] 7

Data frames

# Each column of a data frame can hold a different type
aa <- data.frame(a = 1:4, b = c("a", "b", "c", "d"), 
                 z = c(1, 3, 5, NA))
# Length-one values (a = 1, z = "Z") are recycled to match the longest column.
# stringsAsFactors = FALSE keeps strings as character; since R 4.0 this is
# the default
bb <- data.frame(a = 1, b = c("A", "B", "C", "D"), z = "Z", 
                 stringsAsFactors = FALSE)

aa
##   a b  z
## 1 1 a  1
## 2 2 b  3
## 3 3 c  5
## 4 4 d NA
bb
##   a b z
## 1 1 A Z
## 2 1 B Z
## 3 1 C Z
## 4 1 D Z
class(aa)
## [1] "data.frame"
names(aa)
## [1] "a" "b" "z"
str(aa)
## 'data.frame':    4 obs. of  3 variables:
##  $ a: int  1 2 3 4
##  $ b: chr  "a" "b" "c" "d"
##  $ z: num  1 3 5 NA
summary(aa)
##        a             b                   z    
##  Min.   :1.00   Length:4           Min.   :1  
##  1st Qu.:1.75   Class :character   1st Qu.:2  
##  Median :2.50   Mode  :character   Median :3  
##  Mean   :2.50                      Mean   :3  
##  3rd Qu.:3.25                      3rd Qu.:4  
##  Max.   :4.00                      Max.   :5  
##                                    NA's   :1
# cbind() puts the columns side by side; the names a, b and z now appear twice
cc <- cbind(aa, bb)
cc
##   a b  z a b z
## 1 1 a  1 1 A Z
## 2 2 b  3 1 B Z
## 3 3 c  5 1 C Z
## 4 4 d NA 1 D Z
# rbind() stacks the rows; z is numeric in aa but character in bb, so the
# combined column is converted to character
dd <- rbind(aa, bb)
dd
##   a b    z
## 1 1 a    1
## 2 2 b    3
## 3 3 c    5
## 4 4 d <NA>
## 5 1 A    Z
## 6 1 B    Z
## 7 1 C    Z
## 8 1 D    Z

Use the merge function to merge two data frames

# Extracting one column as a vector: by $, by name or by position
aa$a
## [1] 1 2 3 4
aa[,"a"]
## [1] 1 2 3 4
aa[,1]
## [1] 1 2 3 4
# Single brackets without a comma keep the result as a one-column data frame
aa["a"]
##   a
## 1 1
## 2 2
## 3 3
## 4 4
# Double brackets return the column itself
aa[["a"]]
## [1] 1 2 3 4
# When the column name is stored in a variable, use brackets: aa$vname would
# look for a column literally called vname
vname <- "a"
aa[, vname]
## [1] 1 2 3 4
aa[vname]
##   a
## 1 1
## 2 2
## 3 3
## 4 4
# Selecting rows: row indices go before the comma
aa[1,]
##   a b z
## 1 1 a 1
aa[1:2,]
##   a b z
## 1 1 a 1
## 2 2 b 3
aa[c(1,3),]
##   a b z
## 1 1 a 1
## 3 3 c 5
# Do not forget the comma at the end when selecting rows: without it,
# aa[1:2] would select columns 1 and 2 instead
# Four equivalent ways to get the first two values of column b
aa[1:2,2]
## [1] "a" "b"
aa[1:2,"b"]
## [1] "a" "b"
aa["b"][1:2,]
## [1] "a" "b"
aa[1:2,]$b
# Assigning to a column name that does not exist yet adds a new column
aa$x <- c(4, 8, 1.5, 7)
# Column arithmetic is element by element
aa$y <- aa$a * aa$x
aa
##   a b  z   x    y
## 1 1 a  1 4.0  4.0
## 2 2 b  3 8.0 16.0
## 3 3 c  5 1.5  4.5
## 4 4 d NA 7.0 28.0
# Assigning NULL deletes the column
aa$x <- NULL
aa
##   a b  z    y
## 1 1 a  1  4.0
## 2 2 b  3 16.0
## 3 3 c  5  4.5
## 4 4 d NA 28.0
# An existing column can be overwritten with a transformation of itself
aa$y <- sqrt(aa$y) 
aa
##   a b  z        y
## 1 1 a  1 2.000000
## 2 2 b  3 4.000000
## 3 3 c  5 2.121320
## 4 4 d NA 5.291503
# Single cells can be changed through the column vector ...
aa$y[1] <- 10
aa
##   a b  z         y
## 1 1 a  1 10.000000
## 2 2 b  3  4.000000
## 3 3 c  5  2.121320
## 4 4 d NA  5.291503
# ... or through [row, column] indexing
aa[1,2] <- "aaa"
aa
##   a   b  z         y
## 1 1 aaa  1 10.000000
## 2 2   b  3  4.000000
## 3 3   c  5  2.121320
## 4 4   d NA  5.291503
# A logical condition before the comma keeps only the rows where it is TRUE
aa[aa$a > 2, ]
##   a b  z        y
## 3 3 c  5 2.121320
## 4 4 d NA 5.291503
# Row filter combined with a column selection
aa[aa$a > 2, c(1:3)]
##   a b  z
## 3 3 c  5
## 4 4 d NA
# Conditions are combined element by element with & (and) and | (or)
aa[aa$a < 3 & (aa$b == "aaa" | aa$z == 3), c(1:3)]
##   a   b z
## 1 1 aaa 1
## 2 2   b 3
# The same indexing on the left-hand side overwrites the selected cells;
# b is a character column, so there 1000 is stored as the string "1000"
aa[aa$a < 2, c(1:3)] <- 1000
aa
##      a    b    z         y
## 1 1000 1000 1000 10.000000
## 2    2    b    3  4.000000
## 3    3    c    5  2.121320
## 4    4    d   NA  5.291503
# na.omit() drops every row that contains at least one NA (row 4 here)
aaa  <- na.omit(aa)
aaa
##      a    b    z        y
## 1 1000 1000 1000 10.00000
## 2    2    b    3  4.00000
## 3    3    c    5  2.12132

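Be very careful when using na.omit! It removes every row that has an NA in any column, even if the other values in that row are valid.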

Data frames - wide and long formats

GDP growth rate data

Country 2000 2001 2002 2003 2004
Germany 0.0409 0.0013 0.0126 0.0126 0.0001
Korea 0.0262 0.0232 0.0186 0.0244 0.0146
Turkey 0.0384 0.0011 0.0099 0.0399 0.0316
US 0.0220 0.0107 0.0143 0.0318 0.0274

How many variables are there in the data?

How many observations?

Data frames - wide and long formats

How many variables? 3 variables (country, year, gdp growth rate)

How many observations? 4x5 = 20 observations

GDP growth rate data - long format

Country year gdpgr
Germany 2000 0.0409
Germany 2001 0.0013
Germany 2002 0.0126
Germany 2003 0.0126

Use the reshape function to convert wide-to-long and long-to-wide