# Descriptive statistics and basic graphics walkthrough.
#
# Steps: load an expression table, impute missing values with column means,
# average columns that share a name, then draw bar/pie/histogram/box/scatter
# plots and compute summary statistics.

# Print platform details for the current R session.
.Platform

# ---- Load the expression data ----
# NOTE(review): machine-specific absolute path; kept because the data files
# below are read relative to it.
setwd("/BiO/example/dataset")
raw_dat <- read.csv(file = "Ex_data.csv", header = TRUE,
                    stringsAsFactors = FALSE)
head(raw_dat[, 1:20])  # many columns, so show only columns 1:20
dim(raw_dat)

# gl(number of groups, group size) builds a factor whose levels are
# 1..(number of groups), each repeated `group size` times.
# Here: 1 x 221, 2 x 221 = c(1, ..., 1, 2, ..., 2).
# NOTE(review): the group size 221 is hard-coded; presumably the data holds
# 442 samples split into two equal groups -- confirm against the data file.
gr_ind <- gl(2, 221)

# Transpose everything except the first column; the first column of `raw_dat`
# supplies the column names, and rows are labelled S1, S2, ...
dat_mat <- t(as.matrix(raw_dat[, -1]))
dim(dat_mat)
rownames(dat_mat) <- paste0("S", seq_len(nrow(dat_mat)))
colnames(dat_mat) <- raw_dat[, 1]
head(dat_mat[, 1:20])

# ---- Impute missing values with the mean of their column ----
indx <- which(is.na(dat_mat), arr.ind = TRUE)  # (row, col) of every NA
indx
col_ind <- indx[, 2]
# drop = FALSE keeps a matrix even when there is exactly one NA; without it
# the subset collapses to a vector and the column-wise mean cannot be taken.
col_m <- colMeans(dat_mat[, col_ind, drop = FALSE], na.rm = TRUE)
col_m
if (nrow(indx) > 0) {
  # `col_m` holds one mean per NA position, in the same order as `indx`.
  dat_mat[indx] <- col_m
}
sum(is.na(dat_mat))
dim(dat_mat)

# ---- Average columns that share the same name ----
uq_names <- unique(colnames(dat_mat))
p <- length(uq_names)
n <- dim(dat_mat)[1]
expr_dat <- matrix(0, n, p)
for (i in seq_len(p)) {
  same_name <- colnames(dat_mat) == uq_names[i]
  # drop = FALSE so a name that occurs once still yields an n x 1 matrix.
  expr_dat[, i] <- rowMeans(dat_mat[, same_name, drop = FALSE])
  cat("\n", i, "-th step")
}
colnames(expr_dat) <- uq_names
rownames(expr_dat) <- rownames(dat_mat)
head(expr_dat[, 1:20])
dim(expr_dat)
sum(is.na(expr_dat))

# ---- Bar charts ----
# Set working directory (no-op: stays in the current directory)
setwd("./")
# Read a dataset
dat <- read.table("Ex211.txt", header = TRUE)
head(dat)
dat$Job
# NOTE(review): attach() is discouraged; it is kept only because later code
# may rely on the attached columns. The plot below uses dat$Job explicitly.
attach(dat)
Job
# as.factor() makes plot() draw a bar chart of counts even when read.table()
# returns Job as a character vector (the default since R 4.0).
plot(as.factor(dat$Job), main = "직업의 막대그림", ylab = "인원수(명)",
     ylim = c(0, 15))
box()
freq <- table(dat[, 6])
barplot(freq)

data(VADeaths)
library(gplots)
# x11()
barplot2(VADeaths, beside = TRUE,
         col = gray(seq(0.4, 0.9, length.out = 5)),
         legend.text = rownames(VADeaths), ylim = c(0, 100))
title(main = "Death Rates in Virginia", font.main = 4)

hh <- t(VADeaths)[, 5:1]
mybarcol <- "gray20"
# Fake confidence limits at +/- 15% of each bar height.
ci.l <- hh * 0.85
ci.u <- hh * 1.15
mp <- barplot2(hh, beside = TRUE,
               col = gray(seq(0.4, 0.9, length.out = 5)),
               legend.text = colnames(VADeaths), ylim = c(0, 100),
               main = "Death Rates in Virginia", font.main = 4,
               sub = "Faked 95 percent error bars",
               cex.names = 1.5, plot.ci = TRUE,
               ci.l = ci.l, ci.u = ci.u, plot.grid = TRUE)
box()

# ---- Pie charts ----
# x11()
pie(freq, main = "직업의 원그림")
pie(rep(1, 24), col = rainbow(24), radius = 0.9)
pie.sales <- c(0.12, 0.3, 0.26, 0.16, 0.04, 0.12)
lbl <- c("Blueberry", "Cherry", "Apple", "Boston Cream", "Other",
         "Vanilla Cream")
# Slice labels of the form "<name> (<percent>%)".
names(pie.sales) <- paste0(lbl, " (", pie.sales * 100, "%)")
pie(pie.sales, col = rainbow(length(pie.sales)))

# ---- Histogram ----
x <- expr_dat[, 10]
# x11()
hist(x, breaks = 20, col = "gray", main = uq_names[10])
hist(x, breaks = 40, freq = FALSE, col = "lightblue", main = uq_names[10])
hist(x, breaks = 40, plot = FALSE)  # print the bin summary without drawing

# ---- Boxplot ----
mat <- expr_dat[, c(3, 4, 7, 8)]
# x11()  # opens an empty plot window
boxplot(mat)
res <- boxplot(mat, plot = FALSE)
res

# ---- Line plots ----
# x11()
c_name <- colnames(expr_dat)
plot(expr_dat[, 1], expr_dat[, 2], type = "l",
     xlab = c_name[1], ylab = c_name[2])
# windows()
# Axis labels now match the plotted columns (3 and 4); they previously
# reused the names of columns 1 and 2.
plot(expr_dat[, 3], expr_dat[, 4], type = "l",
     xlab = c_name[3], ylab = c_name[4])

# plot
pop_dat <- read.csv(file = "table_2_2.csv")
# x11()
plot(pop_dat[, 1], pop_dat[, 2], type = "l", xlab = "연도", ylab = "인구수")
# windows()
plot(pop_dat[, 1], pop_dat[, 2], type = "b", xlab = "연도", ylab = "인구수")

# ---- Scatter plots ----
# x11()
# pch selects the point symbol (e.g., pch = 16 => filled circle)
ind1 <- 8
ind2 <- 12
plot(expr_dat[, ind1], expr_dat[, ind2], type = "p", pch = 16,
     xlab = uq_names[ind1], ylab = uq_names[ind2])
cor_mat <- cor(expr_dat)
# Position of the largest correlation with column ind1, excluding itself.
# The index is relative to the vector with element ind1 removed.
which.max(cor_mat[ind1, -ind1])
ind1 <- 8
ind2 <- 200
plot(expr_dat[, ind1], expr_dat[, ind2], type = "p", pch = 16,
     xlab = uq_names[ind1], ylab = uq_names[ind2])

# x11()
# pairs example
ind <- c(2, 8, 12, 200)
pairs(expr_dat[, ind])
# The title must be passed as `main`; the second positional argument of
# pairs() is `labels`, so an unnamed title would be used as variable labels.
pairs(expr_dat[, ind], main = "Expression Data", pch = 21,
      bg = c("red", "blue")[gr_ind])

# ---- Summary statistics ----
mean(expr_dat[, 10])
median(expr_dat[, 10])

x <- c(1, 2, 3, 1, 2, 5, 5, 3, 3, 3, 2)
tb_x <- table(x)
tb_x
as.numeric(names(tb_x)[which.max(tb_x)])

#' Most frequent value of a vector.
#'
#' @param vec A vector whose values can be converted to numeric.
#' @return The value with the highest count in `table(vec)`, as numeric;
#'   on ties, the first maximum found by `which.max()` is returned.
Mode <- function(vec) {
  tb <- table(vec)
  as.numeric(names(tb)[which.max(tb)])
}
Mode(x)

quantile(expr_dat[, 1], 0.25)
quantile(expr_dat[, 1], c(0.25, 0.5, 0.75))
min(expr_dat[, 1])
max(expr_dat[, 1])
range(expr_dat[, 1])

x <- rnorm(100)
summary(x)  # summary of numeric data
y <- c("red", "blue", "red", "white")
summary(y)  # summary of character data
f.y <- factor(y)
summary(f.y)  # summary of a factor

var(expr_dat[, 1])
# Same sample variance computed by hand (n is the number of rows).
sum((expr_dat[, 1] - mean(expr_dat[, 1]))^2) / (n - 1)
sd(expr_dat[, 1])

# CV (coefficient of variation, in percent)
height <- c(72, 74, 68, 76, 74, 69, 72, 79, 70, 69, 77, 73)
sd(height) / mean(height) * 100

# Install `moments` only when it is missing, not on every run.
if (!requireNamespace("moments", quietly = TRUE)) {
  install.packages("moments")
}
library(moments)
skewness(expr_dat[, 1])
kurtosis(expr_dat[, 1])
# Density-scaled histogram of the first column with a normal curve overlaid,
# using that column's sample mean and standard deviation.
hist(expr_dat[, 1], freq = FALSE)
# NOTE(review): the grid 5.5..8.5 is hard-coded; presumably it was chosen to
# span the values of this column -- confirm against the data.
x <- seq(5.5, 8.5, length.out = 100)
lines(x, dnorm(x, mean = mean(expr_dat[, 1]), sd = sd(expr_dat[, 1])))

# ---- Contingency tables ----
# One-way frequency tables
table(mtcars$cyl)
table(mtcars$am)
# Two-way contingency table
table(mtcars$cyl, mtcars$am)
# Three-way contingency table
table(mtcars$cyl, mtcars$am, mtcars$gear)

# ---- Covariance and correlation ----
cov(expr_dat[, 1], expr_dat[, 5])
cov(expr_dat[, c(1, 5, 8)])
var(expr_dat[, 1])
cor(expr_dat[, 1], expr_dat[, 5])
cor(expr_dat[, c(1, 5, 8)])