# Good habit for data-science work: read the documentation first.
?plot
help(plot)

# Get used to the data before analysing it.
plot(iris)
str(iris)
# Fetch over HTTPS rather than plain HTTP.
tips <- read.csv("https://raw.githubusercontent.com/mwaskom/seaborn-data/master/tips.csv")
str(tips)
head(tips)
tail(tips)
summary(tips)

# Install the packages only when they are missing, so re-running the script
# does not download them again every time.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr") # provides '%>%' once attached with library()
}
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2") # visualization tool
}
library(dplyr)
library(ggplot2)

# Histogram of party size.
tips %>% ggplot(aes(size)) + geom_histogram()
# Scatter plot of tip against total bill.
tips %>% ggplot(aes(total_bill, tip)) + geom_point()
# Same scatter plot, coloured by day.
tips %>% ggplot(aes(total_bill, tip)) + geom_point(aes(col = day))
# Coloured by day, point shape by sex, with larger points.
tips %>% ggplot(aes(total_bill, tip)) + geom_point(aes(col = day, pch = sex), size = 2)
## chapter03_data type and calculation
# Variable names
# The three assignments below are deliberately invalid. They are syntax
# errors, so they are kept as comments to let the rest of the script parse:
#   1a = 2   # a name cannot start with a digit
#   @a = 2   # a name cannot start with '@'
#   if = 1   # 'if' is a reserved word
# make.names() shows how R would repair each of these names.
make.names(c("1a", "@a", "if")) # "X1a" "X.a" "if."
#### R data type ####
# number type : int (integer), num (numeric; real numbers), cplx (complex; e.g. 1+2i)
# character : chr
# level : factor
# logic : TRUE, FALSE
# etc : NULL (not defined), NA (Not Available), Inf (infinite), NaN (Not a Number)
# What data type is it?
# class()
# typeof()
# is.integer(), is.numeric(), is.complex(), is.na()
# Data type conversion functions
# as.factor(), as.integer(), as.numeric(), as.character(), as.matrix(), as.array()
#R data structure
# atomic vector
# list
# matrix
# data frame
# factors
# Assignment operators: <-, ->, =, <<-, ->>
a <- 1:7 # like Python's range(1, 8): R includes both endpoints
a

# Operands for the examples below, so each line fails or succeeds only for
# the reason it is meant to illustrate.
x <- 1
y <- 2

# The left-hand side of '=' must be a name, not the expression `x + y`.
# try() shows the error without aborting the script.
try({ x + y = z }) # error
x + y -> z         # works: rightward assignment stores 3 in z

# Q. What is the difference between "=" and "<-" / "->"?
# A. The arrow operators have higher precedence than "=".
try({ a <- b = 10 }) # fails: parsed as (a <- b) = 10
a = b <- 10          # works: parsed as a = (b <- 10)

# cf) <<-, ->>
# A plain `<-` inside a function only creates a local variable.
y <- function() {
  dd <- 2
}
y()
try(dd) # Error : object 'dd' not found

# `<<-` assigns outside the function, so `cc` is visible afterwards.
y <- function() {
  cc <<- 2
}
y()
cc # 2
# Q. What is the difference between a factor and a character vector?
# A. Factors are stored as integer codes plus a table of levels,
# so storing repeated strings as a factor may save a lot of memory.
# The main difference is that factors have "predefined levels".
##### 00.basic function ####
# Operators
# Arithmetic : +, -, *, /, ^ or **, %% (remainder), %/% (integer division)
# Comparison : >, <, !=, ==, >=, <=
# Logical    : | (element-wise or),  || (single-value or),
#              & (element-wise and), && (single-value and),
#              isTRUE(), isFALSE()
# TRUE/FALSE are spelled out: T and F are ordinary variables that can be
# reassigned.
x <- TRUE
!x
y <- FALSE
isTRUE(x)
isFALSE(y)
3 == 4 | 3 # or : FALSE | 3 is TRUE, because a non-zero number counts as TRUE
x | y      # TRUE
3 == 4 & 3 # and: FALSE & 3 is FALSE
x & y      # FALSE
##### 01.vector ####
# Creating vectors
a <- vector(length = 5)
a # [1] FALSE FALSE FALSE FALSE FALSE
b <- 1:7
c <- seq(1, 10, 2) # 1 3 5 7 9
c <- seq(0, 2, length.out = 11) # [1] 0.0 0.2 0.4 0.6 0.8 1.0 1.2 1.4 1.6 1.8 2.0
d <- rep(1:3, 2) # 1 2 3 1 2 3
d <- rep(1:3, each = 2) # 1 1 2 2 3 3

# Append elements. Start from an explicit vector so the result does not
# depend on whatever `x` happened to hold before this section.
x <- c(10, 20)
x <- c(x, 1:3)
x # 10 20 1 2 3

# Vector operations
x <- c(2, 4, 6, 8)
x[1] # 2
length(x) # 4
x[-c(1, 2)] # 6 8   caution: the numbers inside [ ] are positions, and a
            #       negative position drops that element
x[c(FALSE, TRUE, TRUE, FALSE)] # 4 6
x[x > 7] # 8

# vector + vector
x <- c(1, 2, 3)
y <- 1:6 # 1 2 3 4 5 6
z <- c(1, 5)
x + y # 2 4 6 5 7 9  (x is recycled)
x + z # 2 7 4, with a warning: longer object length is not a multiple of
      # shorter object length

# all() / any() applied to the vector defined just above them
a <- 1:5
all(a > 3) # FALSE
any(a > 4) # TRUE
# union, intersect, setdiff, setequal on vectors
x <- 3:5
y <- 4:7
union(x, y) # 3 4 5 6 7
intersect(x, y) # 4 5
setdiff(x, y) # 3
setdiff(y, x) # 6 7
setequal(x, y) # FALSE
# Union of two data frames
df1 <- data.frame(
  CustomerId = 1:6,
  Product = rep(c("Oven", "Television"), each = 3)
)
df2 <- data.frame(
  CustomerId = 4:7,
  Product = rep(c("Television", "Air conditioner"), each = 2)
)
df1
df2

# 1) union() called directly on the two data frames
#    NOTE(review): presumably relies on the data-frame method of dplyr, which
#    is attached earlier in this file - confirm.
union(df1, df2)

# 2) full outer merge on the shared columns
df_union1 <- merge(df1, df2, all = TRUE)
df_union1

# 3) stack the rows, then drop the duplicated ones
df_cat <- rbind(df1, df2)
df_union <- unique(df_cat)
df_union

# example: append 6:10 to x = 1:5 to obtain 1:10
x <- 1:5
x <- c(x, 6:10)
x

# example: from x = 1:10 keep 2, 4, 6, 8
x <- 1:10
x <- x[seq(2, 9, 2)]
##### 02.array ####
# Three ways to build an array / matrix:
# 01) array(data, c(nrow, ncol))
# 02) matrix(data, nrow = ?, byrow = TRUE/FALSE)
# 03) rbind(), cbind()

# 01) array
# Only 5 values are supplied for 2 x 4 = 8 cells, so the data is recycled.
x <- array(1:5, c(2, 4))
x[, 2]
x[1, ]

# array with row and column names
name <- list(c("1row", "2row"), c("1col", "2col", "3col", "4col"))
x <- array(1:5, c(2, 4), dimnames = name)
x["1row", ]
x[, "2col"]

# 02) matrix
x <- 1:12
matrix(x, nrow = 3)               # filled column by column
matrix(x, nrow = 3, byrow = TRUE) # filled row by row
matrix(x, nrow = 3, byrow = TRUE,
       dimnames = list(c("1r", "2r", "3r"), c("1c", "2c", "3c", "4c")))

# 03) cbind, rbind : vector + vector ... -> matrix
v1 <- 1:4
v2 <- 5:8
v3 <- 9:12
rbind(v1, v2, v3) # each vector becomes a row
cbind(v1, v2, v3) # each vector becomes a column

# array operations
# +, -, *, /   : element-wise arithmetic
# %*%          : matrix multiplication
# t(), aperm() : transpose / permute dimensions
# solve()      : inverse of a square matrix
# det()        : determinant
# The expected contents are written as comments; a bare string literal would
# be evaluated and printed as an extra value.
x <- array(1:4, dim = c(2, 2))
#      [,1] [,2]
# [1,]    1    3
# [2,]    2    4
y <- array(5:8, dim = c(2, 2))
#      [,1] [,2]
# [1,]    5    7
# [2,]    6    8
x + y
x - y
x * y
x / y
x %*% y
t(x)
solve(x)
det(x)
###### array_function #####
a <- array(1:12, c(3, 4))
a

# apply(X, MARGIN, FUN): MARGIN 1 works over rows, 2 over columns
apply(a, 1, mean)
apply(a, 2, mean)
apply(a, 1, sum)
apply(a, 1, min)
apply(a, 1, max)

# dim
dim(a) # 3 4

# sample
sample(a) # random order, e.g. 10 5 1 4 9 2 12 6 11 7 3 8
sample(a, 10, prob = c(1:12) / 24) # different extraction probability per element
sample(5) # random order of 1:5, e.g. 5 2 1 3 4

# example : titanic
str(Titanic)
head(Titanic)
plot(Titanic)
# 2
mosaicplot(Titanic, main = "Survival on the Titanic", color = TRUE)
mosaicplot(~ Sex + Age + Survived, data = Titanic, color = TRUE)

# example: rows 1 and 3 of a 4 x 6 array
# NOTE(review): 1:14 is recycled to fill the 24 cells; 1:24 may have been
# intended, as in the next example - confirm.
x <- array(1:14, c(4, 6))
rbind(x[1, ], x[3, ])

# 2: keep columns 1, 3 and 5
x <- array(1:24, c(4, 6))
x <- x[, seq(1, 6, 2)]
x
##### 03.dataframe ####
# array : a single data type for every cell; data frame : each column may
#         have its own type
# list  : elements may differ in length; data frame : every column has the
#         same number of rows
# data frame : columns can be reached with $; matrix : they cannot

# Build a data frame from four equally long vectors
name <- c("Tom", "Piter", "Maria")
age <- c(22, 20, 25)
gender <- factor(c("M", "M", "F"))
blood.type <- factor(c("A", "O", "B"))
patients <- data.frame(name, age, gender, blood.type)
patients

# Indexing a data frame
patients[, 1]                                      # first column
patients$name                                      # the same column, by name
patients[3, 1]                                     # row 3, column 1
patients[patients$name == "Tom", ]                 # rows whose name is Tom
patients[patients$name == "Tom", c("name", "age")] # ... restricted to two columns
###### dataframe function #####
head(cars)
str(cars)
# $ speed: num  4 4 7 7 8 9 10 10 10 11 ...
# $ dist : num  2 10 4 22 16 10 18 26 34 17 ...

# A column is not visible by its bare name unless the data frame is attached.
# try() shows the expected error without aborting the script.
try(speed) # Error: object 'speed' not found
attach(cars)
speed # 4 4 7 7 8 9 10 10 10 11 11 12 12 12 12 13 13 13 13 14 14....
detach(cars)
try(speed) # Error: object 'speed' not found

# +) with : evaluate an expression using the columns of a data frame
mean(cars$speed) # 15.4
max(cars$speed) # 25
with(cars, max(speed)) # 25
with(cars, mean(speed)) # 15.4

# +) subset
subset(cars, speed > 20)
subset(cars, speed > 20, select = c(dist)) # only distance
subset(cars, speed > 20, select = -c(dist)) # everything except distance

# na.omit : drop the rows that contain NA
airquality
head(airquality)
#   Ozone Solar.R Wind Temp Month Day
# 1    41     190  7.4   67     5   1
# 2    36     118  8.0   72     5   2
# 3    12     149 12.6   74     5   3
# 4    18     313 11.5   62     5   4
# 5    NA      NA 14.3   56     5   5
# 6    28      NA 14.9   66     5   6
head(na.omit(airquality))
#   Ozone Solar.R Wind Temp Month Day
# 1    41     190  7.4   67     5   1
# 2    36     118  8.0   72     5   2
# 3    12     149 12.6   74     5   3
# 4    18     313 11.5   62     5   4
# ... (rows 5 and 6 are dropped because they contain NA)
# merge(x, y, by = <shared key column>)
?merge
patients1 <- data.frame(
  name = c("Tom", "Piter", "Maria"),
  age = c(22, 20, 25)
)
patients2 <- data.frame(
  name = c("Tom", "Piter", "Maria"),
  gender = factor(c("M", "M", "F")),
  blood.type = factor(c("A", "O", "B"))
)
merge(patients1, patients2, by = "name")
#    name age gender blood.type
# 1 Maria  25      F          B
# 2 Piter  20      M          O
# 3   Tom  22      M          A

# The key column has a different name in each data frame
patients1 <- data.frame(
  name1 = c("Tom", "Piter", "Maria"),
  age = c(22, 20, 25),
  gender = factor(c("M", "M", "F"))
)
patients2 <- data.frame(
  name2 = c("Tom", "Piter", "Gibs"),
  blood.type = factor(c("A", "AB", "O"))
)
# Only the names present in both data frames
merge(patients1, patients2, by.x = "name1", by.y = "name2")
# Every name from either data frame; missing values become NA
merge(patients1, patients2, by.x = "name1", by.y = "name2", all = TRUE)

# etc : is.data.frame(x), as.data.frame(x)
# Both functions need an argument, so they are demonstrated on `x` below
# instead of being called empty (which is an error).
x <- array(1:12, c(3, 4))
x <- as.data.frame(x)
is.data.frame(x) # TRUE
names(x) <- c("1st", "2nd", "3rd", "4th")
x

# example : mean Ozone after dropping the missing values
aq <- airquality
aqo <- na.omit(aq$Ozone)
mean(aqo)
##### 04. list #####
patients <- data.frame(
  name = c("AA", "BB", "CC"),
  age = c(22, 20, 25),
  gender = c("M", "F", "M"),
  blood.factor = factor(c("A", "O", "B"))
)
no.patients <- data.frame(day = 1:6, no = c(50, 60, 55, 52, 65, 58))
# A list can hold objects of different shapes, here two data frames.
listPatients <- list(patients = patients, no.patients = no.patients)
listPatients

# list indexing
listPatients[[1]]
mean(iris$Petal.Width[iris$Species == "setosa"])
mean(iris[iris$Species == "setosa", ]$Petal.Width)

# [[ ]] vs [ ]
test <- list(a = "foo", b = c(77, 33, 2), c = c("foo", "bar"))
test[2] # still a list, holding one element
# $b
# [1] 77 33  2
test[[2]] # the element itself: [1] 77 33  2
test[[2]][1] # [1] 77

###### list function #####
# lapply = list + apply   (always returns a list)
# sapply = simple + apply (simplifies the result when it can)
lapply(listPatients$no.patients, mean)
# $day
# [1] 3.5
# $no
# [1] 56.66667
lapply(listPatients$patients, mean)
# $name
# [1] NA
# $age
# [1] 22.33333
# $gender
# [1] NA
# $blood.factor
# [1] NA
# Each non-numeric column gives NA with the warning
# "argument is not numeric or logical: returning NA".
sapply(listPatients$no.patients, mean)
#      day       no
#  3.50000 56.66667
sapply(listPatients$no.patients, mean, simplify = FALSE) # not simplified: a list

# list addition
room <- 30
listPatients[["room"]] <- room
listPatients

# list elimination: assigning NULL removes the element
listPatients$room <- NULL
listPatients
#total function ..
# a=c(), array(1:12, c(3,4)), matrix(1:12, nrow=3), apply(array, 1, mean), all(x>5), any(x>5)
# seq(0, 2, length.out=11),rep( 1:3, each=3), setequal(x,y), cbind(v1, v2, v3), sample(z, 10,prob=),setdiff(x, y)
# dim(x), head, length,
# attach(cars), detach(cars), edit(df),with(cars, mean(speed)),subset(cars, speed>20, select=c(dist))
# na.omit(A), merge(A, B, by="name")
# list(patients=patients, no.patients=no.patients)
#lapply, sapply,is.data.frame
#### chapter example Questions ####
# 1) multiples of 3 and multiples of 4, up to 100
x <- seq(3, 100, 3)
y <- seq(4, 100, 4)
# 2) sum of the values the two sequences share
sum(intersect(x, y))
# 3)
?airquality # New York
# 4) degrees F
# 5) day with the highest wind speed
aq <- airquality
aq[aq$Wind == max(aq$Wind), "Day"]
# 2
# Month and day of the highest temperature. The maximum is computed instead
# of being typed in by hand.
max(airquality$Temp)
subset(airquality, Temp == max(Temp), select = c(Month, Day))
# 6)
?quakes # Fiji
# 7)
max(quakes$mag) # 6.4
# 8) largest stopping distance among cars with speed from 18 to 20.
# The columns are addressed through `cars` directly, so nothing is attached
# to the search path.
max(cars$dist[cars$speed >= 18 & cars$speed <= 20])
# KOBIC Question
# 1)
gender <- c("M", "M", "F", "F", "M", "M")
heights <- c(178, 172, 161, 158, 182, 170)
heights[gender == "F"]
heights[gender == "M"]
# The two vectors are separate objects, yet one can be used to index the other.
mean(heights[gender == "F"])
mean(heights[gender == "M"])
tapply(heights, gender, mean) # both group means in a single call

# 3)
cities <- c("New York,NY", "Ann Arbor,MI", "Chicago,IL")
CT <- strsplit(cities, ",")
CT
# [[1]]
# [1] "New York" "NY"
# [[2]]
# [1] "Ann Arbor" "MI"
# [[3]]
# [1] "Chicago" "IL"
print(CT)
class(CT) # "list"
CCT <- unlist(CT) # output => vector
class(CCT) # "character"
#https://bio-kcs.tistory.com/
#save complete
# (stray blog-widget text "반응형" from the pasted source, commented out so it is not evaluated)