Data Manipulation

Import, Export, Filter

Weekly design


Pre-class video


Import



Data files needed: students1.txt, students2.txt, students3.txt, students.csv

# 01 Read and write files #

# If [Enter] is not pressed on the last line of the file
students = read.table("data/students1.txt", header = T, fileEncoding = "CP949", encoding = "UTF-8")
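# (In this case R usually reports a warning about an "incomplete final line", but the data is still read.)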

# When [Enter] is pressed on the last line of the file
students = read.table("data/students2.txt", header = T, fileEncoding = "CP949", encoding = "UTF-8")

# Check the structure of the read file
str(students)
'data.frame':   5 obs. of  4 variables:
 $ name   : chr  "강서준" "김도형" "박정원" "이상훈" ...
 $ korean : int  100 90 90 100 85
 $ english: int  90 100 95 85 100
 $ math   : int  100 80 90 95 100
# Read the file as is
students = read.table("data/students1.txt", header = T, as.is = T, fileEncoding = "CP949", encoding = "UTF-8")
str(students)
'data.frame':   5 obs. of  4 variables:
 $ name   : chr  "강서준" "김도형" "박정원" "이상훈" ...
 $ korean : int  100 90 90 100 85
 $ english: int  90 100 95 85 100
 $ math   : int  100 80 90 95 100
# Prevent strings from being converted to factors when reading the file
students = read.table("data/students1.txt", header = T, stringsAsFactors = F, fileEncoding = "CP949", encoding = "UTF-8")
str(students)
'data.frame':   5 obs. of  4 variables:
 $ name   : chr  "강서준" "김도형" "박정원" "이상훈" ...
 $ korean : int  100 90 90 100 85
 $ english: int  90 100 95 85 100
 $ math   : int  100 80 90 95 100
# Read the file as is, using a comma (,) as the delimiter and treating the first line as the header.
# The NA entry in the file causes the math column to be read as strings.
students = read.table("data/students3.txt", sep = ",", header = T, as.is = T, fileEncoding = "CP949", encoding = "UTF-8")
str(students)
'data.frame':   5 obs. of  4 variables:
 $ name   : chr  "강서준" "김도형" "박정원" "이상훈" ...
 $ korean : int  100 90 90 100 85
 $ english: int  90 100 95 85 100
 $ math   : chr  " 100" " 80" " 90" " NA" ...
# Even if we specify that the string "NA" should be treated as the missing value NA, it is not converted, because the value in the file is actually " NA" (with a leading space).
students = read.table("data/students3.txt", sep = ",", header = T, as.is = T, na.strings = "NA", fileEncoding = "CP949", encoding = "UTF-8")
str(students)
'data.frame':   5 obs. of  4 variables:
 $ name   : chr  "강서준" "김도형" "박정원" "이상훈" ...
 $ korean : int  100 90 90 100 85
 $ english: int  90 100 95 85 100
 $ math   : chr  " 100" " 80" " 90" " NA" ...
# When the string " NA" (with the leading space) is specified, the missing value is treated as NA and all math elements are read as numbers.
students = read.table("data/students3.txt", sep = ",", header = T, as.is = T, na.strings = " NA", fileEncoding = "CP949", encoding = "UTF-8")
str(students)
'data.frame':   5 obs. of  4 variables:
 $ name   : chr  "강서준" "김도형" "박정원" "이상훈" ...
 $ korean : int  100 90 90 100 85
 $ english: int  90 100 95 85 100
 $ math   : int  100 80 90 NA 100
# If leading and trailing spaces are stripped with strip.white = T, the default na.strings = "NA" matches, so all math elements are read as numbers.
students = read.table("data/students3.txt", sep = ",", header = T, as.is = T, strip.white = T, fileEncoding = "CP949", encoding = "UTF-8")
str(students)
'data.frame':   5 obs. of  4 variables:
 $ name   : chr  "강서준" "김도형" "박정원" "이상훈" ...
 $ korean : int  100 90 90 100 85
 $ english: int  90 100 95 85 100
 $ math   : int  100 80 90 NA 100
# read.csv() treats the first row as the header by default, so there is no need to specify the header option
students = read.csv("data/students.csv", fileEncoding = "CP949", encoding = "UTF-8")
students
    name korean english math
1 강서준    100      90  100
2 김도형     90     100   80
3 박정원     90      95   90
4 이상훈    100      85   95
5 최건우     85     100  100
# Check the structure of the read file
str(students)
'data.frame':   5 obs. of  4 variables:
 $ name   : chr  "강서준" "김도형" "박정원" "이상훈" ...
 $ korean : int  100 90 90 100 85
 $ english: int  90 100 95 85 100
 $ math   : int  100 80 90 95 100
# Convert the name column from factor to character (string)
students$name = as.character(students$name)
str(students)
'data.frame':   5 obs. of  4 variables:
 $ name   : chr  "강서준" "김도형" "박정원" "이상훈" ...
 $ korean : int  100 90 90 100 85
 $ english: int  90 100 95 85 100
 $ math   : int  100 80 90 95 100
# Prevent strings from being converted to factors when reading the file
students = read.csv("data/students.csv", stringsAsFactors = FALSE, fileEncoding = "CP949", encoding = "UTF-8")
str(students)
'data.frame':   5 obs. of  4 variables:
 $ name   : chr  "강서준" "김도형" "박정원" "이상훈" ...
 $ korean : int  100 90 90 100 85
 $ english: int  90 100 95 85 100
 $ math   : int  100 80 90 95 100
# Strings are written with surrounding double quotes.
# write.table(students, file = "output.txt")

# Strings are written without double quotes.
# write.table(students, file = "output.txt", quote = F)
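
As a quick check, a minimal round-trip sketch (assuming the working directory is writable; the file name output.txt is just an example):

write.table(students, file = "output.txt", quote = F, row.names = F, fileEncoding = "UTF-8")  # write without quotes or row names
students_check = read.table("output.txt", header = T, stringsAsFactors = F, fileEncoding = "UTF-8")  # read it back
str(students_check)  # should match the structure of students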

Filter


# 02 Conditional statements and loop statements for data cleaning #

test = c(15, 20, 30, NA, 45) # If it is a vector
test[test<40] # extract elements with value less than 40
[1] 15 20 30 NA
test[test%%3!= 0] # Extract elements whose value is not divisible by 3
[1] 20 NA
test[is.na(test)] # Extract elements that are NA
[1] NA
test[!is.na(test)] # Extract non-NA elements
[1] 15 20 30 45
test[test%%2==0 & !is.na(test)] # Extract elements that are multiples of 2 but are not NA
[1] 20 30
# In the case of a data frame
characters = data.frame(name = c("Gil-dong", "Chunhyang", "Cheolsu"),
                         age = c(30, 16, 21),
                         gender = factor(c("M", "F","M")))

characters
       name age gender
1  Gil-dong  30      M
2 Chunhyang  16      F
3   Cheolsu  21      M
characters[characters$gender =="F",1] # Extract the name (column 1) of rows where gender is female
[1] "Chunhyang"
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
characters %>% filter(gender=="F") %>% select(name)
       name
1 Chunhyang
# Extract rows of men under 30 years old (base R, then dplyr)
characters[characters$age<30 & characters$gender =="M",]
     name age gender
3 Cheolsu  21      M
characters %>% filter(age<30 & gender=="M")
     name age gender
1 Cheolsu  21      M

Class


Before jumping into the class content, any questions? (about preclass, PBL, course, and so on)


Let’s use the List.KMP again

load("data/List_KMP.RData")
names(List.KMP) <- c("p17", "p18", "p19", "d19")

See the structure

str(List.KMP)
List of 4
 $ p17:'data.frame':    10 obs. of  11 variables:
  ..$ pid              : num [1:10] 9920004 12500003 17350004 17670001 23860001 ...
  ..$ age              : num [1:10] 9 26 12 55 70 58 50 68 39 37
  ..$ gender           : Factor w/ 2 levels "Male","Female": 1 1 2 1 1 1 1 1 1 2
  ..$ Mobile.lv        : Factor w/ 5 levels "3G","LTE","LTE-A",..: 5 2 2 2 5 2 2 2 2 3
  ..$ Telecom          : Factor w/ 5 levels "SKT","KT","LG U+",..: 5 2 3 1 5 1 1 1 2 2
  ..$ Smartphone.brand : Factor w/ 8 levels "Samsung","Apple",..: 8 1 1 1 8 1 1 1 1 3
  ..$ sp.mobile        : num [1:10] 0 42 19 38 18 65 32 58 63 54
  ..$ sp.device        : num [1:10] 0 0 0 0 0 0 0 10 0 10
  ..$ sp.online.content: num [1:10] 0 53 19 38 18 65 32 68 63 114
  ..$ sp.offline.contet: num [1:10] 10 22 0 0 0 50 36 0 25 40
  ..$ year             : num [1:10] 2017 2017 2017 2017 2017 ...
 $ p18:'data.frame':    10 obs. of  11 variables:
  ..$ pid              : num [1:10] 9920004 12500003 17350004 17670001 23860001 ...
  ..$ age              : num [1:10] 10 27 13 56 71 59 51 69 40 38
  ..$ gender           : Factor w/ 2 levels "Male","Female": 1 1 2 1 1 1 1 1 1 2
  ..$ Mobile.lv        : Factor w/ 5 levels "3G","LTE","LTE-A",..: 5 3 3 2 5 2 2 2 3 3
  ..$ Telecom          : Factor w/ 5 levels "SKT","KT","LG U+",..: 5 3 3 2 5 3 1 2 1 2
  ..$ Smartphone.brand : Factor w/ 8 levels "Samsung","Apple",..: 8 1 3 1 8 3 1 3 1 3
  ..$ sp.mobile        : num [1:10] 0 90 20 39 30 80 33 36 40 59
  ..$ sp.device        : num [1:10] 0 60 35 0 0 15 0 10 10 12
  ..$ sp.online.content: num [1:10] 0 359 55 39 30 95 33 46 50 71
  ..$ sp.offline.contet: num [1:10] 8 120 0 0 0 12 20 0 0 100
  ..$ year             : num [1:10] 2018 2018 2018 2018 2018 ...
 $ p19:'data.frame':    10 obs. of  11 variables:
  ..$ pid              : num [1:10] 9920004 12500003 17350004 17670001 23860001 ...
  ..$ age              : num [1:10] 11 28 14 57 72 60 52 70 41 39
  ..$ gender           : Factor w/ 2 levels "Male","Female": 1 1 2 1 1 1 1 1 1 2
  ..$ Mobile.lv        : Factor w/ 5 levels "3G","LTE","LTE-A",..: 5 2 3 2 2 2 2 2 3 3
  ..$ Telecom          : Factor w/ 5 levels "SKT","KT","LG U+",..: 5 3 3 1 2 1 1 2 3 3
  ..$ Smartphone.brand : Factor w/ 8 levels "Samsung","Apple",..: 8 2 1 1 1 1 1 3 3 3
  ..$ sp.mobile        : num [1:10] 0 60 21 30 32 78 35 54 73 45
  ..$ sp.device        : num [1:10] 0 30 0 0 0 0 0 0 20 15
  ..$ sp.online.content: num [1:10] 0 90 21 30 32 78 35 54 93 60
  ..$ sp.offline.contet: num [1:10] 21 0 0 20 0 24 90 0 20 60
  ..$ year             : num [1:10] 2019 2019 2019 2019 2019 ...
 $ d19:'data.frame':    10 obs. of  4 variables:
  ..$ pid           : num [1:10] 9920004 12500003 17350004 17670001 23860001 ...
  ..$ Smarphone.time: num [1:10] 0 60 345 90 40 65 170 95 85 75
  ..$ SNS.time      : num [1:10] 0 0 10 0 0 0 0 0 0 0
  ..$ year          : num [1:10] 2019 2019 2019 2019 2019 ...





Did ‘KT’ users in 2017 change their telecom companies in 2018 and 2019?

  1. Extract data.frames from the List
  2. Find the pid(personal ID) whose telecom company was “KT”
  3. Try the code below and explain how it works
  4. Merge the two data.frames above by the key variable “pid”, and change the second and the third column names to “y2018” and “y2019”

  1. Extract data.frames from the List

    p17_df <- List.KMP[["p17"]]
    p18_df <- List.KMP[["p18"]]
    p19_df <- List.KMP[["p19"]]
  2. Find the pid(personal ID) whose telecom company was “KT”

    pid_kt_user <- p17_df[p17_df$Telecom=="KT", "pid"]
    pid_kt_user
    [1] 12500003 59570001 65840001
  3. Try the code below and explain how it works (a short %in% illustration follows after this list)

    kt_user_18 <- p18_df[p18_df$pid %in% pid_kt_user, c("pid", "Telecom")]
    kt_user_19 <- p19_df[p19_df$pid %in% pid_kt_user, c("pid", "Telecom")]
    kt_user_18
            pid Telecom
    2  12500003   LG U+
    9  59570001     SKT
    10 65840001      KT
    kt_user_19
            pid Telecom
    2  12500003   LG U+
    9  59570001   LG U+
    10 65840001   LG U+
  4. Merge the two data.frames above by the key variable “pid”, and change the second and the third column names to “y2018” and “y2019”

    DF3 <- merge(kt_user_18, kt_user_19, by="pid")
    names(DF3)[c(2,3)]<-c("y2018", "y2019")
    DF3
           pid y2018 y2019
    1 12500003 LG U+ LG U+
    2 59570001   SKT LG U+
    3 65840001    KT LG U+
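
For reference, here is a minimal sketch of how the %in% operator used above behaves (the vectors are made up purely for illustration):

    c(10, 20, 30) %in% c(20, 40)   # returns FALSE TRUE FALSE
    # p18_df$pid %in% pid_kt_user therefore gives a logical vector that is TRUE for the
    # rows whose pid appears in pid_kt_user; that vector selects the matching rows of p18_df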


What kinds of studies can we do with the analysis above? Let’s think about it.

Download and use this data for your study

You can also download the original questionnaires, codebooks, and user guide documents


Did ‘Samsung’ users in 2017 change their phone brand in 2018 and 2019?

Let’s analyse this using the code we learnt
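
Here is a minimal sketch of one possible approach, following the same pattern as the KT example (object names such as pid_samsung_user are just illustrative choices):

    # Find the pid of participants whose 2017 smartphone brand was Samsung
    pid_samsung_user <- p17_df[p17_df$Smartphone.brand == "Samsung", "pid"]

    # Look those participants up in the 2018 and 2019 panels
    samsung_user_18 <- p18_df[p18_df$pid %in% pid_samsung_user, c("pid", "Smartphone.brand")]
    samsung_user_19 <- p19_df[p19_df$pid %in% pid_samsung_user, c("pid", "Smartphone.brand")]

    # Merge by pid and rename the brand columns by year
    DF_samsung <- merge(samsung_user_18, samsung_user_19, by = "pid")
    names(DF_samsung)[c(2, 3)] <- c("y2018", "y2019")
    DF_samsung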


Practice & QZ (more)

Let’s use the ‘airquality’ data set built into R (possible solution sketches follow the question list)

airquality
    Ozone Solar.R Wind Temp Month Day
1      41     190  7.4   67     5   1
2      36     118  8.0   72     5   2
3      12     149 12.6   74     5   3
4      18     313 11.5   62     5   4
5      NA      NA 14.3   56     5   5
6      28      NA 14.9   66     5   6
7      23     299  8.6   65     5   7
8      19      99 13.8   59     5   8
9       8      19 20.1   61     5   9
10     NA     194  8.6   69     5  10
11      7      NA  6.9   74     5  11
12     16     256  9.7   69     5  12
13     11     290  9.2   66     5  13
14     14     274 10.9   68     5  14
15     18      65 13.2   58     5  15
16     14     334 11.5   64     5  16
17     34     307 12.0   66     5  17
18      6      78 18.4   57     5  18
19     30     322 11.5   68     5  19
20     11      44  9.7   62     5  20
21      1       8  9.7   59     5  21
22     11     320 16.6   73     5  22
23      4      25  9.7   61     5  23
24     32      92 12.0   61     5  24
25     NA      66 16.6   57     5  25
26     NA     266 14.9   58     5  26
27     NA      NA  8.0   57     5  27
28     23      13 12.0   67     5  28
29     45     252 14.9   81     5  29
30    115     223  5.7   79     5  30
31     37     279  7.4   76     5  31
32     NA     286  8.6   78     6   1
33     NA     287  9.7   74     6   2
34     NA     242 16.1   67     6   3
35     NA     186  9.2   84     6   4
36     NA     220  8.6   85     6   5
37     NA     264 14.3   79     6   6
38     29     127  9.7   82     6   7
39     NA     273  6.9   87     6   8
40     71     291 13.8   90     6   9
41     39     323 11.5   87     6  10
42     NA     259 10.9   93     6  11
43     NA     250  9.2   92     6  12
44     23     148  8.0   82     6  13
45     NA     332 13.8   80     6  14
46     NA     322 11.5   79     6  15
47     21     191 14.9   77     6  16
48     37     284 20.7   72     6  17
49     20      37  9.2   65     6  18
50     12     120 11.5   73     6  19
51     13     137 10.3   76     6  20
52     NA     150  6.3   77     6  21
53     NA      59  1.7   76     6  22
54     NA      91  4.6   76     6  23
55     NA     250  6.3   76     6  24
56     NA     135  8.0   75     6  25
57     NA     127  8.0   78     6  26
58     NA      47 10.3   73     6  27
59     NA      98 11.5   80     6  28
60     NA      31 14.9   77     6  29
61     NA     138  8.0   83     6  30
62    135     269  4.1   84     7   1
63     49     248  9.2   85     7   2
64     32     236  9.2   81     7   3
65     NA     101 10.9   84     7   4
66     64     175  4.6   83     7   5
67     40     314 10.9   83     7   6
68     77     276  5.1   88     7   7
69     97     267  6.3   92     7   8
70     97     272  5.7   92     7   9
71     85     175  7.4   89     7  10
72     NA     139  8.6   82     7  11
73     10     264 14.3   73     7  12
74     27     175 14.9   81     7  13
75     NA     291 14.9   91     7  14
76      7      48 14.3   80     7  15
77     48     260  6.9   81     7  16
78     35     274 10.3   82     7  17
79     61     285  6.3   84     7  18
80     79     187  5.1   87     7  19
81     63     220 11.5   85     7  20
82     16       7  6.9   74     7  21
83     NA     258  9.7   81     7  22
84     NA     295 11.5   82     7  23
85     80     294  8.6   86     7  24
86    108     223  8.0   85     7  25
87     20      81  8.6   82     7  26
88     52      82 12.0   86     7  27
89     82     213  7.4   88     7  28
90     50     275  7.4   86     7  29
91     64     253  7.4   83     7  30
92     59     254  9.2   81     7  31
93     39      83  6.9   81     8   1
94      9      24 13.8   81     8   2
95     16      77  7.4   82     8   3
96     78      NA  6.9   86     8   4
97     35      NA  7.4   85     8   5
98     66      NA  4.6   87     8   6
99    122     255  4.0   89     8   7
100    89     229 10.3   90     8   8
101   110     207  8.0   90     8   9
102    NA     222  8.6   92     8  10
103    NA     137 11.5   86     8  11
104    44     192 11.5   86     8  12
105    28     273 11.5   82     8  13
106    65     157  9.7   80     8  14
107    NA      64 11.5   79     8  15
108    22      71 10.3   77     8  16
109    59      51  6.3   79     8  17
110    23     115  7.4   76     8  18
111    31     244 10.9   78     8  19
112    44     190 10.3   78     8  20
113    21     259 15.5   77     8  21
114     9      36 14.3   72     8  22
115    NA     255 12.6   75     8  23
116    45     212  9.7   79     8  24
117   168     238  3.4   81     8  25
118    73     215  8.0   86     8  26
119    NA     153  5.7   88     8  27
120    76     203  9.7   97     8  28
121   118     225  2.3   94     8  29
122    84     237  6.3   96     8  30
123    85     188  6.3   94     8  31
124    96     167  6.9   91     9   1
125    78     197  5.1   92     9   2
126    73     183  2.8   93     9   3
127    91     189  4.6   93     9   4
128    47      95  7.4   87     9   5
129    32      92 15.5   84     9   6
130    20     252 10.9   80     9   7
131    23     220 10.3   78     9   8
132    21     230 10.9   75     9   9
133    24     259  9.7   73     9  10
134    44     236 14.9   81     9  11
135    21     259 15.5   76     9  12
136    28     238  6.3   77     9  13
137     9      24 10.9   71     9  14
138    13     112 11.5   71     9  15
139    46     237  6.9   78     9  16
140    18     224 13.8   67     9  17
141    13      27 10.3   76     9  18
142    24     238 10.3   68     9  19
143    16     201  8.0   82     9  20
144    13     238 12.6   64     9  21
145    23      14  9.2   71     9  22
146    36     139 10.3   81     9  23
147     7      49 10.3   69     9  24
148    14      20 16.6   63     9  25
149    30     193  6.9   70     9  26
150    NA     145 13.2   77     9  27
151    14     191 14.3   75     9  28
152    18     131  8.0   76     9  29
153    20     223 11.5   68     9  30

Format

A data frame with 153 observations on 6 variables.

[,1] Ozone   numeric Ozone (ppb)
[,2] Solar.R numeric Solar R (lang)
[,3] Wind    numeric Wind (mph)
[,4] Temp    numeric Temperature (degrees F)
[,5] Month   numeric Month (1–12)
[,6] Day     numeric Day of month (1–31)

Details

Daily readings of the following air quality values for May 1, 1973 (a Tuesday) to September 30, 1973.

  • Ozone: Mean ozone in parts per billion from 1300 to 1500 hours at Roosevelt Island

  • Solar.R: Solar radiation in Langleys in the frequency band 4000–7700 Angstroms from 0800 to 1200 hours at Central Park

  • Wind: Average wind speed in miles per hour at 0700 and 1000 hours at LaGuardia Airport

  • Temp: Maximum daily temperature in degrees Fahrenheit at La Guardia Airport.

pairs(airquality, panel = panel.smooth, main = "airquality data")

  1. What are the column names of the data frame?

    *Hint: names()


  2. What are the row names of the data frame?

    *Hint: rownames()


  3. Extract the first 10 rows in ‘airquality’

    *Hint: One way: by using [] / Alternative way: by using head()


  4. Check the number of rows (observations) in the dataset

    *Hint: Use nrow(), or length() on a single column


  5. How many missing values are in the ‘Ozone’ column?

    *Hint: is.na() and sum()


  6. What is the mean value of the ‘Ozone’ column? (Exclude missing values (coded as NA) from this calculation)

    *Hint: Use mean() with an option na.rm = T


  7. Extract the subset where Ozone values are above 31 and Temp values are above 90.

    *Hint: Use subset()


  8. Use the apply function to calculate the standard deviation of each column in the data frame

    *Hint: Use apply() with an option na.rm = T / Use sd() function


  9. Calculate the mean value of ‘Ozone’ for each Month in the data frame and create a vector containing the monthly means (exclude all missing values)

    *Hint: Use tapply() with an option na.rm = T


  10. Draw a random sample of 5 rows from the data frame

    *Hint: Use sample(), nrow(), airquality[]
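
For reference (after attempting the questions yourself), here is one possible set of solution sketches using the hinted functions; the random sample in the last line will of course differ between runs:

    names(airquality)                                             # Q1: column names
    rownames(airquality)                                          # Q2: row names
    airquality[1:10, ]                                            # Q3: first 10 rows (or head(airquality, 10))
    nrow(airquality)                                              # Q4: number of rows
    sum(is.na(airquality$Ozone))                                  # Q5: missing values in Ozone
    mean(airquality$Ozone, na.rm = T)                             # Q6: mean Ozone, NAs excluded
    subset(airquality, Ozone > 31 & Temp > 90)                    # Q7: subset by Ozone and Temp
    apply(airquality, 2, sd, na.rm = T)                           # Q8: column standard deviations
    tapply(airquality$Ozone, airquality$Month, mean, na.rm = T)   # Q9: monthly Ozone means
    airquality[sample(nrow(airquality), 5), ]                     # Q10: random sample of 5 rows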