# Genome Informatics Laboratory at KRIBB

# Print the platform description of the running R build
.Platform
# Course dataset directory (hard-coded; adjust to the local installation)
setwd("/BiO/example/dataset")
# Load the raw table. TRUE/FALSE are spelled out because T and F are
# ordinary variables that can be reassigned.
raw_dat <- read.csv(
  file = "Ex_data.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
# The table has many columns, so preview at most the first 20 of them.
# The bound keeps the preview working when fewer than 20 columns exist.
head(raw_dat[, seq_len(min(20, ncol(raw_dat))), drop = FALSE])
dim(raw_dat)

# gl(n, k): factor with n levels, each level repeated k times in a row.
# Result here: 221 ones followed by 221 twos, i.e. c(1, ..., 1, 2, ..., 2).
gr_ind <- gl(n = 2, k = 221)

# Transpose the data columns (everything except column 1) so that each
# original column becomes a row, labelled S1, S2, ...; the values of
# column 1 become the column names.
dat_mat <- t(as.matrix(raw_dat[, -1]))
dim(dat_mat)
# sprintf() over seq_len() yields an empty label vector for zero rows,
# where paste0("S", 1:nrow(.)) would produce "S1", "S0".
rownames(dat_mat) <- sprintf("S%d", seq_len(nrow(dat_mat)))
colnames(dat_mat) <- raw_dat[, 1]
# Preview at most the first 20 columns
head(dat_mat[, seq_len(min(20, ncol(dat_mat))), drop = FALSE])

# Locate missing values as (row, col) index pairs
indx <- which(is.na(dat_mat), arr.ind = TRUE)
indx
col_ind <- indx[, 2]
# Mean of the column of every missing cell, ignoring the NAs themselves.
# drop = FALSE keeps the selection a matrix even when exactly one cell is
# missing; without it the single column collapses to a vector and
# apply() stops with an error.
col_m <- apply(dat_mat[, col_ind, drop = FALSE], 2, mean, na.rm = TRUE)
col_m
# col_m is ordered like the rows of indx, so each missing cell receives
# the mean of its own column.
dat_mat[indx] <- col_m
# Should now report 0 unless a column consisted solely of NAs
sum(is.na(dat_mat))
dim(dat_mat)

# Collapse columns that share a name into one column holding their
# row-wise mean, so that every name appears exactly once in expr_dat.
uq_names <- unique(colnames(dat_mat))
p <- length(uq_names)
n <- nrow(dat_mat)
expr_dat <- matrix(0, n, p)
# Loop-invariant, so looked up once
all_names <- colnames(dat_mat)
# seq_len() runs zero iterations when p is 0 (1:p would iterate over 1, 0)
for (i in seq_len(p)) {
  # drop = FALSE keeps a single matching column as an n x 1 matrix
  same_name <- dat_mat[, all_names == uq_names[i], drop = FALSE]
  expr_dat[, i] <- apply(same_name, 1, mean)
  # Progress message
  cat("\n", i, "-th step")
}
colnames(expr_dat) <- uq_names
rownames(expr_dat) <- rownames(dat_mat)
head(expr_dat[, seq_len(min(20, ncol(expr_dat))), drop = FALSE])
dim(expr_dat)
sum(is.na(expr_dat))

# Read a dataset (the path is relative to the current working directory)
dat <- read.table("Ex211.txt", header = TRUE)
head(dat)
dat$Job
# NOTE(review): attach() is discouraged; it is kept so that any code
# relying on bare column names keeps working.
attach(dat)
Job
# Bar chart of the Job counts. Since R 4.0 read.table() no longer turns
# strings into factors, and plot() on a character vector fails, so the
# column is converted explicitly; plot() on a factor draws a bar chart.
plot(factor(dat$Job), main = "직업의 막대그림", ylab = "인원수(명)",
     ylim = c(0, 15))
box()
# Frequency table of column 6 (presumably the Job column - confirm)
freq <- table(dat[, 6])
barplot(freq)


# Grouped bar chart of the built-in VADeaths matrix drawn with
# barplot2() from gplots: one gray shade and one legend entry per row.
data(VADeaths)
library(gplots)
# x11()
barplot2(
  VADeaths,
  beside = TRUE,
  col = gray(seq(0.4, 0.9, length.out = 5)),
  legend = rownames(VADeaths),
  ylim = c(0, 100)
)
title("Death Rates in Virginia", font.main = 4)

# Same data transposed, with the original row order reversed: each row of
# hh is one column of VADeaths, and each column of hh is one bar group.
hh <- t(VADeaths)[, 5:1]
# Defined here but not used by the call below
mybarcol <- "gray20"
# Artificial interval bounds: 15 percent below and above each bar
ci.l <- hh * 0.85
ci.u <- hh * 1.15
# With beside = TRUE every group holds nrow(hh) bars, so exactly that
# many colours are needed. A different count gets recycled across groups,
# giving the same series different shades and a mismatched legend.
mp <- barplot2(
  hh,
  beside = TRUE,
  col = gray(seq(0.4, 0.9, length.out = nrow(hh))),
  legend = rownames(hh),
  ylim = c(0, 100),
  main = "Death Rates in Virginia",
  font.main = 4,
  sub = "Faked 95 percent error bars",
  cex.names = 1.5,
  plot.ci = TRUE,
  ci.l = ci.l,
  ci.u = ci.u,
  plot.grid = TRUE
)
box()



# x11()
# Pie chart of the frequency table `freq`
pie(freq, main = "직업의 원그림")
# Colour wheel: 24 equal slices in rainbow colours
pie(rep(1, 24), col = rainbow(24), radius = 0.9)
# Labelled pie: every slice name carries its share in percent
lbl <- c(
  "Blueberry", "Cherry", "Apple",
  "Boston Cream", "Other", "Vanilla Cream"
)
pie.sales <- c(0.12, 0.3, 0.26, 0.16, 0.04, 0.12)
names(pie.sales) <- sprintf("%s (%s%%)", lbl, pie.sales * 100)
pie(pie.sales, col = rainbow(length(pie.sales)))


# Histograms of column 10 of expr_dat, titled with that column's name
x <- expr_dat[, 10]
# x11()
# Counts, about 20 bins
hist(x, breaks = 20, col = "gray", main = uq_names[10])
# Density scale instead of counts, about 40 bins
hist(x, breaks = 40, freq = FALSE, col = "lightblue", main = uq_names[10])
# plot = FALSE returns the histogram object without drawing it
hist(x, breaks = 40, plot = FALSE)


# Box plots of columns 3, 4, 7 and 8 of expr_dat
mat <- expr_dat[, c(3, 4, 7, 8)]
# x11()  # opens an empty graphics window
boxplot(mat)
# plot = FALSE skips the drawing and only returns the computed statistics
res <- boxplot(mat, plot = FALSE)
res

# x11()
c_name <- colnames(expr_dat)
# Line plot of column 1 against column 2
plot(expr_dat[, 1], expr_dat[, 2], type = "l",
     xlab = c_name[1], ylab = c_name[2])
# windows()
# Line plot of column 3 against column 4. The axis labels use the names
# of the plotted columns (3 and 4) rather than repeating names 1 and 2.
plot(expr_dat[, 3], expr_dat[, 4], type = "l",
     xlab = c_name[3], ylab = c_name[4])

# Plots of the second column of table_2_2.csv against the first
pop_dat <- read.csv(file = "table_2_2.csv")
# x11()
# type = "l": connecting lines only
plot(x = pop_dat[, 1], y = pop_dat[, 2], type = "l",
     xlab = "연도", ylab = "인구수")
# windows()
# type = "b": points joined by lines
plot(x = pop_dat[, 1], y = pop_dat[, 2], type = "b",
     xlab = "연도", ylab = "인구수")

# Scatter plots
# x11()
# pch selects the point symbol (e.g. pch = 16 is a filled circle)
ind1 <- 8
ind2 <- 12
plot(expr_dat[, ind1], expr_dat[, ind2], type = "p", pch = 16,
     xlab = uq_names[ind1], ylab = uq_names[ind2])
cor_mat <- cor(expr_dat)
# Column most correlated with column ind1. The self-correlation is masked
# with -Inf instead of being dropped with [-ind1]: dropping shifts every
# later position by one, so the reported index would not be the real
# column number.
which.max(replace(cor_mat[ind1, ], ind1, -Inf))
# NOTE(review): 200 is hard-coded, presumably taken from the output
# above - confirm it is the intended column.
ind1 <- 8
ind2 <- 200
plot(expr_dat[, ind1], expr_dat[, ind2], type = "p", pch = 16,
     xlab = uq_names[ind1], ylab = uq_names[ind2])


# x11()
# Scatter-plot matrices of four columns of expr_dat
ind <- c(2, 8, 12, 200)
pairs(expr_dat[, ind])
# The title has to be passed as main =. The second positional argument of
# pairs() is `labels`, so an unnamed string there replaces the variable
# names on the diagonal instead of setting the title.
# Points are filled by group: level 1 of gr_ind red, level 2 blue.
pairs(expr_dat[, ind], main = "Expression Data",
      pch = 21, bg = c("red", "blue")[gr_ind])


# Descriptive statistics: mean and median of column 10 of expr_dat
mean(x = expr_dat[, 10])
median(x = expr_dat[, 10])

# Most frequent value of a small vector, read off its frequency table
x <- c(1, 2, 3, 1, 2, 5, 5, 3, 3, 3, 2)
tb_x <- table(x)
tb_x
# which.max() keeps the name of the winning cell; that name is the value
as.numeric(names(which.max(tb_x)))
# Most frequent value (statistical mode) of a vector.
#
# vec: vector of values; NA entries are ignored by table().
# Returns the most frequent value. Ties go to the first entry in the
# sorted order used by table(). The result is numeric whenever the
# winning label parses as a number (so numeric input behaves as before)
# and the label itself otherwise, e.g. for character data.
# Empty input yields numeric(0).
Mode <- function(vec) {
  counts <- table(vec)
  label <- names(counts)[which.max(counts)]
  # A non-numeric label would only produce NA plus a coercion warning
  value <- suppressWarnings(as.numeric(label))
  if (length(value) == 1L && is.na(value)) label else value
}
# Mode of x using the helper defined above
Mode(x)

# Quantiles of column 1: first quartile only, then all three quartiles
quantile(expr_dat[, 1], probs = 0.25)
quantile(expr_dat[, 1], probs = c(0.25, 0.5, 0.75))

# Smallest value, largest value, and both at once
min(expr_dat[, 1])
max(expr_dat[, 1])
range(expr_dat[, 1])

# summary() of numeric data
x <- rnorm(n = 100)
summary(x)
# summary() of character data
y <- c("red", "blue", "red", "white")
summary(y)
# summary() of a factor
f.y <- factor(y)
summary(f.y)

# Sample variance of column 1: built-in function first, then the same
# quantity written out as the sum of squared deviations over n - 1
var(x = expr_dat[, 1])
sum((expr_dat[, 1] - mean(expr_dat[, 1]))^2) / (n - 1)
# Sample standard deviation
sd(x = expr_dat[, 1])

# Coefficient of variation: standard deviation as a percentage of the mean
height <- c(
  72, 74, 68, 76, 74, 69,
  72, 79, 70, 69, 77, 73
)
sd(height) / mean(height) * 100


# Install 'moments' only when it is missing, instead of reinstalling it
# on every run of the script.
if (!requireNamespace("moments", quietly = TRUE)) {
  install.packages("moments")
}
library(moments)
# Shape statistics of column 1 of expr_dat
skewness(expr_dat[, 1])
kurtosis(expr_dat[, 1])
# Density-scaled histogram with a normal curve on top that uses the
# sample mean and standard deviation.
hist(expr_dat[, 1], freq = FALSE)
# Fixed grid for the curve; assumes the data lie roughly within
# [5.5, 8.5] - confirm for other datasets.
x <- seq(5.5, 8.5, length.out = 100)
lines(x, dnorm(x, mean = mean(expr_dat[, 1]), sd = sd(expr_dat[, 1])))

# Contingency tables from the built-in mtcars data
# One-way frequency tables
table(mtcars[["cyl"]])
table(mtcars[["am"]])
# Two-way table
table(mtcars[["cyl"]], mtcars[["am"]])
# Three-way table
table(mtcars[["cyl"]], mtcars[["am"]], mtcars[["gear"]])

# Covariance: between two columns, then the full matrix for three columns
cov(x = expr_dat[, 1], y = expr_dat[, 5])
cov(expr_dat[, c(1, 5, 8)])
# Variance of column 1, for comparison with the matrix diagonal
var(expr_dat[, 1])

# Correlation: same pair of columns, then the same three-column matrix
cor(x = expr_dat[, 1], y = expr_dat[, 5])
cor(expr_dat[, c(1, 5, 8)])