Differences

This shows you the differences between two versions of the page.

--- day2_practice1 [2017/12/12 11:51] – created hyjeong
+++ day2_practice1 [2021/03/17 13:09] (current) – external edit 127.0.0.1
@@ Line 1: / Line 1: @@
+  # Show details of the platform this R build runs on.
+  .Platform
+  setwd("/BiO/example/dataset")
+  # Read the CSV with a header row; text columns stay character.
+  raw_dat <- read.csv("Ex_data.csv", header = TRUE, stringsAsFactors = FALSE)
+  head(raw_dat[, 1:20]) # many columns, so preview only columns 1:20
+  dim(raw_dat)
+  # gl(number of groups, size of each group) creates a factor vector
+  # with levels 1, 2, ..., number of groups.
+  gr_ind <- gl(2, 221) # 221 ones followed by 221 twos
+  # Drop the first column and transpose, so the remaining columns of
+  # raw_dat become the rows of dat_mat.
+  dat_mat <- t(as.matrix(raw_dat[, -1]))
+  dim(dat_mat)
+  # Rows are labelled S1, S2, ...; columns are named after the values in
+  # the first column of raw_dat.
+  dimnames(dat_mat) <- list(
+    paste0("S", seq_len(nrow(dat_mat))),
+    raw_dat[, 1]
+  )
+  head(dat_mat[, 1:20])
+  # Replace every missing cell of dat_mat with the mean of its column.
+  # indx holds one (row, col) pair per missing cell.
+  indx <- which(is.na(dat_mat), arr.ind = TRUE)
+  indx
+  col_ind <- indx[, 2]
+  # One mean per missing cell (a column with several NAs is repeated).
+  # drop = FALSE keeps the selection a matrix even when exactly one cell
+  # is missing; without it the selection collapses to a vector and the
+  # column-wise mean fails.
+  col_m <- colMeans(dat_mat[, col_ind, drop = FALSE], na.rm = TRUE)
+  col_m
+  dat_mat[indx] <- col_m
+  # Count of NAs left after imputation.
+  sum(is.na(dat_mat))
+  dim(dat_mat)
+  # Collapse columns of dat_mat that share a name into a single column
+  # holding their row-wise mean; expr_dat has one column per unique name.
+  all_names <- colnames(dat_mat)
+  uq_names <- unique(all_names)
+  p <- length(uq_names)
+  n <- nrow(dat_mat)
+  expr_dat <- matrix(0, n, p)
+  # seq_len() yields an empty sequence when p is 0, unlike 1:p.
+  for (i in seq_len(p)) {
+    # drop = FALSE keeps a single matching column as an n x 1 matrix so
+    # rowMeans() works for unique and duplicated names alike.
+    same_name <- all_names == uq_names[i]
+    expr_dat[, i] <- rowMeans(dat_mat[, same_name, drop = FALSE])
+    cat('\n',i,'-th step')
+  }
+  colnames(expr_dat) <- uq_names
+  rownames(expr_dat) <- rownames(dat_mat)
+  head(expr_dat[, 1:20])
+  dim(expr_dat)
+  sum(is.na(expr_dat))
+  # Set working directory
+  setwd('./')
+  # Read a dataset
+  dat <- read.table("Ex211.txt", header = TRUE)
+  head(dat)
+  dat$Job
+  # Take the column explicitly instead of attach()-ing the data frame.
+  # factor() makes plot() draw a bar chart even when read.table() returns
+  # Job as character (the default since R 4.0); plotting a bare character
+  # vector fails.
+  Job <- factor(dat$Job)
+  Job
+  plot(Job, main = "직업의 막대그림", ylab = "인원수(명)", ylim = c(0, 15))
+  box()
+  # Frequency table of the sixth column, drawn as a bar plot.
+  freq <- table(dat[, 6])
+  barplot(freq)
+  data(VADeaths)
+  library(gplots)
+  # x11()
+  # Five grey levels from 0.4 to 0.9, shared by both charts below.
+  bar_shades <- gray(seq(0.4, 0.9, length.out = 5))
+  # Side-by-side bars of VADeaths, with the row names as legend.
+  barplot2(
+    VADeaths,
+    beside = TRUE,
+    col = bar_shades,
+    legend = rownames(VADeaths),
+    ylim = c(0, 100)
+  )
+  title(main = "Death Rates in Virginia", font.main = 4)
+  # Transposed table with its columns in reverse order.
+  hh <- t(VADeaths)[, 5:1]
+  mybarcol <- "gray20"
+  # Artificial interval bounds: 15 percent below and above each value.
+  ci.l <- hh * 0.85
+  ci.u <- hh * 1.15
+  # Same chart with the interval bars and a background grid switched on.
+  mp <- barplot2(
+    hh,
+    beside = TRUE,
+    col = bar_shades,
+    legend = colnames(VADeaths),
+    ylim = c(0, 100),
+    main = "Death Rates in Virginia",
+    font.main = 4,
+    sub = "Faked 95 percent error bars",
+    cex.names = 1.5,
+    plot.ci = TRUE,
+    ci.l = ci.l,
+    ci.u = ci.u,
+    plot.grid = TRUE
+  )
+  box()
+  # x11()
+  # Pie chart of the frequency table freq.
+  pie(freq, main = "직업의 원그림")
+  # 24 equal slices, one rainbow colour each.
+  pie(rep(1, 24), col = rainbow(24), radius = 0.9)
+  # Shares per category; each slice label is "<name> (<share in percent>%)".
+  slice_share <- c(0.12, 0.3, 0.26, 0.16, 0.04, 0.12)
+  lbl <- c(
+    "Blueberry", "Cherry", "Apple",
+    "Boston Cream", "Other", "Vanilla Cream"
+  )
+  pie.sales <- setNames(
+    slice_share,
+    paste0(lbl, " (", slice_share * 100, "%)")
+  )
+  pie(pie.sales, col = rainbow(length(pie.sales)))
+  # histogram
+  # Column 10 of expr_dat, titled with the matching unique column name.
+  x <- expr_dat[, 10]
+  hist_title <- uq_names[10]
+  # x11()
+  # Counts with 20 suggested breaks.
+  hist(x, breaks = 20, col = "gray", main = hist_title)
+  # Densities (freq = FALSE) with 40 suggested breaks.
+  hist(x, breaks = 40, freq = FALSE, col = "lightblue", main = hist_title)
+  # plot = FALSE returns the histogram object without drawing it.
+  hist(x, breaks = 40, plot = FALSE)
+  # boxplot
+  # Columns 3, 4, 7 and 8 of expr_dat, one box per column.
+  box_cols <- c(3, 4, 7, 8)
+  mat <- expr_dat[, box_cols]
+  # x11() # opens an empty graphics window
+  boxplot(mat)
+  # plot = FALSE returns the box-plot statistics without drawing.
+  res <- boxplot(mat, plot = FALSE)
+  res
+  # x11()
+  # Line plots of pairs of expr_dat columns, labelled with their names.
+  c_name <- colnames(expr_dat)
+  plot(expr_dat[, 1], expr_dat[, 2], type = "l",
+       xlab = c_name[1], ylab = c_name[2])
+  # windows()
+  # Axis labels follow the plotted columns (3 and 4); they previously
+  # reused the names of columns 1 and 2.
+  plot(expr_dat[, 3], expr_dat[, 4], type = "l",
+       xlab = c_name[3], ylab = c_name[4])
+  # plot
+  # Plot column 2 of table_2_2.csv against column 1.
+  pop_dat <- read.csv("table_2_2.csv")
+  # x11()
+  # type = "l": line only.
+  plot(pop_dat[, 1], pop_dat[, 2], type = "l",
+       xlab = "연도", ylab = "인구수")
+  # windows()
+  # type = "b": both points and connecting lines.
+  plot(pop_dat[, 1], pop_dat[, 2], type = "b",
+       xlab = "연도", ylab = "인구수")
+  # scatter plot
+  # x11()
+  # pch selects the point symbol (e.g., pch = 16 => filled circle)
+  ind1 <- 8
+  ind2 <- 12
+  plot(expr_dat[, ind1], expr_dat[, ind2],
+       type = "p", pch = 16,
+       xlab = uq_names[ind1], ylab = uq_names[ind2])
+  # Correlation matrix of all columns; find the largest correlation in
+  # row ind1 after leaving out its own entry (the returned position is
+  # relative to the row with that entry removed).
+  cor_mat <- cor(expr_dat)
+  which.max(cor_mat[ind1, -ind1])
+  ind1 <- 8
+  ind2 <- 200
+  plot(expr_dat[, ind1], expr_dat[, ind2],
+       type = "p", pch = 16,
+       xlab = uq_names[ind1], ylab = uq_names[ind2])
+  # x11()
+  # pairs example: scatter-plot matrix of four expr_dat columns.
+  ind <- c(2, 8, 12, 200)
+  pairs(expr_dat[, ind])
+  # The title must be passed as main =; the second positional argument
+  # of pairs() is labels, so an unnamed string would replace the
+  # variable labels instead of titling the plot.
+  # Points are filled red/blue by gr_ind; assumes expr_dat has one row
+  # per element of gr_ind -- TODO confirm.
+  pairs(expr_dat[, ind], main = "Expression Data",
+        pch = 21, bg = c("red", "blue")[gr_ind])
+  # Stat
+  # Mean and median of column 10 of expr_dat.
+  mean(expr_dat[, 10])
+  median(expr_dat[, 10])
+  # Mode by hand: tabulate the values, then take the name of the most
+  # frequent entry.
+  x <- c(1, 2, 3, 1, 2, 5, 5, 3, 3, 3, 2)
+  tb_x <- table(x)
+  tb_x
+  as.numeric(names(which.max(tb_x)))
+  # Most frequent value of a vector (the statistical mode).
+  #
+  # vec: an atomic vector; NA entries are ignored.
+  # Returns the value that occurs most often, in the same type as vec.
+  # Ties go to the smallest value in sort order. An input with no
+  # non-missing values gives a zero-length result.
+  Mode <- function(vec) {
+    # Count occurrences on the values themselves rather than on their
+    # printed names, so character or logical input no longer comes back
+    # as NA with a coercion warning.
+    vals <- sort(unique(vec))
+    counts <- tabulate(match(vec, vals), nbins = length(vals))
+    vals[which.max(counts)]
+  }
+  Mode(x)
+  # Quantiles and extremes of column 1 of expr_dat.
+  quantile(expr_dat[, 1], probs = 0.25)
+  quantile(expr_dat[, 1], probs = c(0.25, 0.5, 0.75))
+  min(expr_dat[, 1])
+  max(expr_dat[, 1])
+  range(expr_dat[, 1])
+  x <- rnorm(100)
+  summary(x) # summary of numeric data
+  y <- c("red", "blue", "red", "white")
+  summary(y) # summary of character data
+  f.y <- factor(y)
+  summary(f.y) # summary of a factor
+  # Sample variance: built in, then by hand as the sum of squared
+  # deviations from the mean divided by n - 1.
+  var(expr_dat[, 1])
+  centered <- expr_dat[, 1] - mean(expr_dat[, 1])
+  sum(centered^2) / (n - 1)
+  sd(expr_dat[, 1])
+  # CV
+  # Coefficient of variation in percent: sd / mean * 100.
+  height <- c(72, 74, 68, 76, 74, 69, 72, 79, 70, 69, 77, 73)
+  sd(height) / mean(height) * 100
+  # Install moments only when it is missing, so re-running the script
+  # does not reinstall the package every time.
+  if (!requireNamespace("moments", quietly = TRUE)) {
+    install.packages("moments")
+  }
+  library(moments)
+  skewness(expr_dat[, 1])
+  kurtosis(expr_dat[, 1])
+  # Density histogram of column 1 with a normal curve on top, using the
+  # sample mean and standard deviation of that column.
+  hist(expr_dat[, 1], freq = FALSE)
+  # Grid of 100 points on the fixed interval [5.5, 8.5].
+  x <- seq(5.5, 8.5, length.out = 100)
+  lines(x, dnorm(x, mean = mean(expr_dat[, 1]), sd = sd(expr_dat[, 1])))
+  # contingency table
+  # One-way frequency tables
+  table(mtcars[["cyl"]])
+  table(mtcars[["am"]])
+  # Two-way contingency table
+  table(mtcars[["cyl"]], mtcars[["am"]])
+  # Three-way contingency table
+  table(mtcars[["cyl"]], mtcars[["am"]], mtcars[["gear"]])
+  # cov , cor
+  # Covariance of columns 1 and 5, then the covariance matrix of
+  # columns 1, 5 and 8.
+  sel_cols <- c(1, 5, 8)
+  cov(x = expr_dat[, 1], y = expr_dat[, 5])
+  cov(expr_dat[, sel_cols])
+  # Variance of column 1.
+  var(expr_dat[, 1])
+  # Same pair and the same three columns, as correlations.
+  cor(x = expr_dat[, 1], y = expr_dat[, 5])
+  cor(expr_dat[, sel_cols])