This is the sidebar. Without it, the main text is too wide!
- Genome project
- Linux
2019년 11월 교육 자료
- 실습용 스크립트 구글 문서라서 접속 환경에 따라 열리지 않을 수도 있습니다.
This is the sidebar. Without it, the main text is too wide!
2019년 11월 교육 자료
##### installing packages example (will not do in workshop) tr() setRepositories() install.packages("psych") library(psych) tr() m <- matrix(1:16,ncol=4) tr(m) ##################################### ################# # Basic plots # ################# x11(width = 6, height = 6) #code to open up a window in linux counts <- matrix( rnorm(200,mean=0,sd=1),20,10 ) #make a normal count boxplot( counts[,1],counts[,3],counts[,5],counts[,7],counts[,9],counts[,10] , col=c("black","blue","green","red","purple","yellow") ) boxplot( counts[,1],counts[,3],counts[,5],counts[,7],counts[,9],counts[,10] , col=c("red","red","red","blue","blue","blue") ) test_data <- c(1,2,2,3,7,1,2,3,10,11,25,34,3,6,7,9,9,10,10,10) #make test data hist(test_data) test_data <- rnorm(500, mean = 0, sd = 1) hist(test_data) hist(test_data, main = "Histogram of Test Dataset", xlab = "Trait", ylab = "Individual Counts" ) dev.off() #or close the window #실습 1교시 마무리. ## Set the working directory (not needed for workshop) #setwd("C:\\PATH\\") ## in pc #setwd("/home/2.RNASeq_practice/5.HTseq_simulated") ##in linux ## Call necessary libraries (these are preinstalled [ex:install.packages("limma")]) library(statmod) library(locfit) library(edgeR) library(limma) library(DESeq2) ## note the bad samples (6 and 10) ## save used samples' index to Sam_Index Samp_Index <- c(1:5,7:9,11:23) ## or you can do Samp_Index <- c(1:23) Samp_Index <- Samp_Index[-c(6,10)] ## take out bad samples meta_data <- read.table("meta_data_mRNA.txt", sep="\t", head = T) File_list <- as.character(meta_data[,1]) File_list <- File_list[Samp_Index] head(File_list) #check first few component of File_list ### Data Read (first file) temp <- read.table(File_list[1], sep="\t", head=F) temp <- temp[-((nrow(temp)-4):nrow(temp)),] #remove last 5 lines Gene_Symbol <- as.character(temp[,1]) Expression <- data.frame(temp[,2]) ## Merge matrix (read all other good sample files and append) for(i in 2:length(File_list)){ #from file_list, 2nd to last files are called since first file is already called temp <- read.table(File_list[i], sep="\t", head=F) temp <- temp[-((nrow(temp)-4):nrow(temp)),] # remove last 5 lines same as the first file Expression <- data.frame(Expression, temp[,2]) } colnames(Expression) <- File_list #samples(columns) are marked with their original file name head(Expression) #check first few lines of Expression dim(Expression) #check dimension of Expression head(Gene_Symbol) #### remove all zero gene for mRNA data #from original 14218 genes Removed_ID <- which(rowSums(Expression) == 0) #1067 genes removed Expression <- Expression[-Removed_ID,] Gene_Symbol <- Gene_Symbol[-Removed_ID] dim(Expression) length(Gene_Symbol) #13151 #### gene annotation ##### ####after annotation file '#bin' delete Gene_annotation <- read.table("workshop_practice.annotation",head=T, stringsAsFactors=FALSE) Gene_annotation <- Gene_annotation[,c(2,13)] ## get information we need: col_2 and col_3 #match Gene_name with Gene_Symbol Gene_name <- c() for ( i in 1:length(Gene_Symbol) ){ Gene_name[i]<- Gene_annotation[match(Gene_Symbol[i],Gene_annotation[,1]),2] } ## Start RNAseq analysis ######################################### ### DEG analysis Set for 1-way Analysis ######################################### Treat <- factor( meta_data[,3], levels = c("male","female") ) Treat <- Treat[Samp_Index] #dimension correction targets <- data.frame(Treat) #################################################### ### Get normalized TMM values #################################################### design <- model.matrix(~Treat, data = targets) colnames(design) Y <- DGEList(counts=Expression, gene=Gene_Symbol) Y <- estimateDisp(Y,design) #Y <- estimateGLMCommonDisp(Y,design) #Y <- estimateGLMTrendedDisp(Y,design) #Y <- estimateGLMTagwiseDisp(Y,design) fit <- glmFit(Y, design) colnames(fit) ## Get normalized values TMM <- calcNormFactors(Y, method="TMM") TMM <- cpm(TMM, normalized.lib.sizes=TRUE, log=T) ### 1-way ANODEV based apporach for testing Stage effects ## female vs. male Result_anodev <- glmLRT(fit, coef=2) Result_anodev_table <- topTags(Result_anodev, n=dim(Expression)[1], sort.by="none")$table sum(Result_anodev_table$FDR < 0.05) #2 head(Result_anodev_table) #to visualize the two significant genes... head(Result_anodev_table[order(Result_anodev_table$FDR),]) #order the table by FDR value (ascending) Gene_name[1011] Gene_name[5473] #quick view of histogram of p-values hist(Result_anodev_table$PValue) hist(Result_anodev_table$PValue, main = "P-value Distribution Male vs. Female", xlab= "P-values", ylab="Count") Final_anodev_table <- cbind (Gene_name,Result_anodev_table) #add the matching gene names to the anodev table ### save the final table results to Result_anodev.txt write.table( Final_anodev_table,"Result_anodev.txt", row.names=F,sep="\t" ) #################################################### ### What to do when you have NO REPLICATES? ### #################################################### ### 1. Be satisfied with a descriptive analysis such as fold change. Do not attempt a significance analysis. ### 2. Simply pick a reasonable dispersion value, based on the experience or study design. bcv <- 0.2 #BCV is square-root-dispersion counts <- matrix( rnbinom(40,size=1/bcv^2,mu=10),20,2 ) y <- DGEList( counts, group=1:2 ) y <- exactTest( y, dispersion = bcv^2 ) y$table ####################### EOF #######################