# 01 Read and write files ----

# Both files are declared as CP949 on disk (fileEncoding); `encoding` marks
# the resulting character strings as UTF-8.

# Case 1: [Enter] was NOT pressed on the last line of the file
# (the final line has no trailing newline).
students <- read.table(
  "data/students1.txt",
  header = TRUE,
  fileEncoding = "CP949",
  encoding = "UTF-8"
)

# Case 2: [Enter] WAS pressed on the last line of the file
# (the final line ends with a newline).
students <- read.table(
  "data/students2.txt",
  header = TRUE,
  fileEncoding = "CP949",
  encoding = "UTF-8"
)

# Check the structure of the data frame that was read last (students2.txt).
str(students)
'data.frame': 5 obs. of 4 variables:
$ name : chr "강서준" "김도형" "박정원" "이상훈" ...
$ korean : int 100 90 90 100 85
$ english: int 90 100 95 85 100
$ math : int 100 80 90 95 100
# Read the file with as.is = TRUE, i.e. character columns are kept as
# character vectors instead of being converted to factors.
students <- read.table(
  "data/students1.txt",
  header = TRUE,
  as.is = TRUE,
  fileEncoding = "CP949",
  encoding = "UTF-8"
)
str(students)
'data.frame': 5 obs. of 4 variables:
$ name : chr "강서준" "김도형" "박정원" "이상훈" ...
$ korean : int 100 90 90 100 85
$ english: int 90 100 95 85 100
$ math : int 100 80 90 95 100
# Read the file with stringsAsFactors = FALSE so that character strings are
# not converted to factors.
students <- read.table(
  "data/students1.txt",
  header = TRUE,
  stringsAsFactors = FALSE,
  fileEncoding = "CP949",
  encoding = "UTF-8"
)
str(students)
'data.frame': 5 obs. of 4 variables:
$ name : chr "강서준" "김도형" "박정원" "이상훈" ...
$ korean : int 100 90 90 100 85
$ english: int 90 100 95 85 100
$ math : int 100 80 90 95 100
# Comma-delimited file whose first line is the header, read with
# as.is = TRUE (strings stay character).
# The math column comes back as character: its missing entry is stored as
# " NA" (with a leading space), which does not match the default
# na.strings = "NA", so the column cannot be read as numbers.
students <- read.table(
  "data/students3.txt",
  sep = ",",
  header = TRUE,
  as.is = TRUE,
  fileEncoding = "CP949",
  encoding = "UTF-8"
)
str(students)
'data.frame': 5 obs. of 4 variables:
$ name : chr "강서준" "김도형" "박정원" "이상훈" ...
$ korean : int 100 90 90 100 85
$ english: int 90 100 95 85 100
$ math : chr " 100" " 80" " 90" " NA" ...
# Explicitly passing na.strings = "NA" does not help either: the string in
# the file is " NA" (a space precedes NA), so it still is not treated as a
# missing value and math remains a character column.
students <- read.table(
  "data/students3.txt",
  sep = ",",
  header = TRUE,
  as.is = TRUE,
  na.strings = "NA",
  fileEncoding = "CP949",
  encoding = "UTF-8"
)
str(students)
'data.frame': 5 obs. of 4 variables:
$ name : chr "강서준" "김도형" "박정원" "이상훈" ...
$ korean : int 100 90 90 100 85
$ english: int 90 100 95 85 100
$ math : chr " 100" " 80" " 90" " NA" ...
# Passing the exact string " NA" (including the leading space) as na.strings
# makes it a missing value, and every math element is then read as a number.
students <- read.table(
  "data/students3.txt",
  sep = ",",
  header = TRUE,
  as.is = TRUE,
  na.strings = " NA",
  fileEncoding = "CP949",
  encoding = "UTF-8"
)
str(students)
'data.frame': 5 obs. of 4 variables:
$ name : chr "강서준" "김도형" "박정원" "이상훈" ...
$ korean : int 100 90 90 100 85
$ english: int 90 100 95 85 100
$ math : int 100 80 90 NA 100
# strip.white = TRUE removes the white space around each field, so " NA"
# becomes "NA" and matches the default na.strings = "NA". All math elements
# are therefore read as numbers without specifying na.strings.
students <- read.table(
  "data/students3.txt",
  sep = ",",
  header = TRUE,
  as.is = TRUE,
  strip.white = TRUE,
  fileEncoding = "CP949",
  encoding = "UTF-8"
)
str(students)
'data.frame': 5 obs. of 4 variables:
$ name : chr "강서준" "김도형" "박정원" "이상훈" ...
$ korean : int 100 90 90 100 85
$ english: int 90 100 95 85 100
$ math : int 100 80 90 NA 100
# read.csv() treats the first row as the header by default, so the header
# option does not need to be specified.
students <- read.csv(
  "data/students.csv",
  fileEncoding = "CP949",
  encoding = "UTF-8"
)

# Print the data frame that was read.
students
name korean english math
1 강서준 100 90 100
2 김도형 90 100 80
3 박정원 90 95 90
4 이상훈 100 85 95
5 최건우 85 100 100
# Check the structure of the data frame that was read from the file.
str(students)
'data.frame': 5 obs. of 4 variables:
$ name : chr "강서준" "김도형" "박정원" "이상훈" ...
$ korean : int 100 90 90 100 85
$ english: int 90 100 95 85 100
$ math : int 100 80 90 95 100
# Convert the name column to a character vector. This matters when the
# column was read as a factor; when it is already character (as in the
# structure printed just before), the conversion leaves it unchanged.
students$name <- as.character(students$name)
str(students)
'data.frame': 5 obs. of 4 variables:
$ name : chr "강서준" "김도형" "박정원" "이상훈" ...
$ korean : int 100 90 90 100 85
$ english: int 90 100 95 85 100
$ math : int 100 80 90 95 100
# Read the CSV with stringsAsFactors = FALSE so that character strings are
# not converted to factors while the file is read.
students <- read.csv(
  "data/students.csv",
  stringsAsFactors = FALSE,
  fileEncoding = "CP949",
  encoding = "UTF-8"
)
str(students)
'data.frame': 5 obs. of 4 variables:
$ name : chr "강서준" "김도형" "박정원" "이상훈" ...
$ korean : int 100 90 90 100 85
$ english: int 90 100 95 85 100
$ math : int 100 80 90 95 100
# By default, character strings are written surrounded by double quotes.
# write.table(students, file = "output.txt")
# With quote = F, character strings are written without double quotes.
# write.table(students, file = "output.txt", quote = F)
Filter
# 02 Conditional statements and loops for data cleaning ----

# Filtering a vector.
test <- c(15, 20, 30, NA, 45)

# Extract the elements whose value is less than 40. The comparison with NA
# yields NA, so the missing value shows up as NA in the result.
test[test < 40]
[1] 15 20 30 NA
# Keep the elements that leave a remainder when divided by 3.
# NA %% 3 is NA, so the missing value is carried through as NA.
test[test %% 3 != 0]
[1] 20 NA
# Keep only the missing (NA) elements.
test[is.na(test)]
[1] NA
# Drop the missing values and keep every other element.
test[!is.na(test)]
[1] 15 20 30 45
# Even, non-missing elements. The NA check is written first; the
# element-wise `&` gives the same mask in either operand order.
test[!is.na(test) & test %% 2 == 0]
[1] 20 30
# Filtering a data frame: three people with a name, an age and a gender
# (gender is stored as a factor).
characters <- data.frame(
  name = c("Gil-dong", "Chunhyang", "Cheolsu"),
  age = c(30, 16, 21),
  gender = factor(c("M", "F", "M"))
)

# Print the data frame.
characters
name age gender
1 Gil-dong 30 M
2 Chunhyang 16 F
3 Cheolsu 21 M
# For the rows whose gender is "F", extract the first column (the name).
# Selecting a single column returns a plain vector, not a data frame.
characters[characters$gender == "F", 1]
[1] "Chunhyang"
library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union