Question

Deleted:EdgeR, Please help me review my R script in glm approach

0

Entering edit mode

RR • 0

@7e31e763

Last seen 23 months ago

Thailand

Hello all, I am doing gene expression profiling of 3 structures(BB, 79A, 28z). I have 9 samples from 3 donors. Each donor has 3 samples, representing 3 structures. For example Donor 1 has structure BB, 79A, 28z , Donor 2 has structure BB, 79A, 28z Donor 3 has structure BB, 79A, 28z. I would like to see the differentially expressed gene of structure 79A. I compare structure 79A vs 28z, 79A vs BB and 28z vs BB. I have attached the pheno column in the picture. Currently I got only 8 samples as another 1 is still in the process of sequencing but I want to try analyze these 8 samples first. I am trying to write my script in glm approach. I am a beginner and quite new to edgeR and I would like to ask for your help to review and confirm whether what I wrote is correct. I have got DEGs from the scripts below however I am not quite reliable with the result. Thank you in advance!!

library(edgeR)

counts <- read.table('NewCounting_file.txt')
head(counts)

str(counts)
#create gene name and gene id variable
gene_name <- counts[,2]
gene_id <- counts[,1]
class(counts)
head(gene_id)
head(gene_name)
#Delete gene id and gene name column on counts 
counts[1:2] <- list(NULL)
head(counts)
#create gene count numeric matrix 
counts_mat <- apply(as.matrix.noquote(counts),2,as.numeric)
head(counts_mat)
class(counts_mat)
#Assign gene name as row name 
row.names(counts_mat) <- gene_name
head(counts_mat)
#check data type before processing
#Make sure data is numeric matrix containing gene name as row and sample name as column
str(counts_mat)
counts_mat
#Assign group as column name 
sample_name <- c("MD7_BB","MD7_79A","MD7_28","BD7_BB","BD7_79A","BD7_28","JD7_79A","JD7_28")
colnames(counts_mat) <- sample_name
head(counts_mat)
## read phenotype data in 
group <- c("MD7_BB","MD7_79A","MD7_28","BD7_BB","BD7_79A","BD7_28","JD7_79A","JD7_28")
Pheno <- read.csv("mypheno.csv", stringsAsFactors = TRUE)![enter image description here][1]
dglist <- DGEList(counts=counts_mat, samples = Pheno, group = group)
str(dglist)
keep <-filterByExpr(dglist)
summary(keep)
dglist <- dglist[keep, ,keep.lib.sizes=FALSE]
dim(dglist)

dglist <- calcNormFactors(dglist, method="TMM")
plotMDS(dglist)
dglist$samples
dglist
#Design matrix
donors <- factor(dglist$sample$Donors)
groups <- factor(dglist$sample$Structures)
design <- model.matrix(~0 + donors + groups)
design

dglist<- estimateDisp(dglist, design, robust=TRUE)
dglist$common.dispersion
plotBCV(dglist)
#Fitting
fit <- glmQLFit(dglist,design, robust=TRUE)
head(fit$coefficients)
colnames(fit)
plotQLDisp(fit)

##############DGE
##To detect genes that are differentially expressed btw any of three treatments
qlf <- glmQLFTest(fit, coef=4:5)
topTags(qlf)
FDR <- p.adjust(qlf$table$PValue, method='fdr')
sum(FDR < 0.05)
summary(decideTests(qlf))
#closer look at individual counts per million for the top gens
top <- rownames(topTags(qlf))
cpm(dglist)[top,]
#total num of genes significantly up-regulated or down regulated at 5%FDR 
summary(decideTests(qlf))
#plot all the logFCs against average count size, highlighting the DE genes
plotMD(qlf)
# the blue line indicate 2 fold up or down
abline(h=c(-1,1), col="blue")
###############################

#detect gene that are differential expressed in 79A vs BBz
qlf_79A_BBz <- glmQLFTest(fit, coef=4)
# Show top 10 differentially expressed genes (DEG)
topTags(qlf_79A_BBz)
#total num of genes significantly up-regulated or down regulated at 5%FDR 
summary(decideTests(qlf_79A_BBz))
#closer look at individual counts per million for the top gens
top <- rownames(topTags(qlf_79A_BBz))
cpm(dglist)[top,]
#plot all the logFCs against average count size, highlighting the DE genes
plotMD(qlf_79A_BBz)
# the blue line indicate 2 fold up or down
abline(h=c(-1,1), col="blue")
#Constructing a table showing differential expression with edge R, trimming out those p-value>=0.05
qlf_79A_BBz$table <- cbind(qlf_79A_BBz$table, FDR=p.adjust(qlf_79A_BBz$table$PValue, method ='fdr'))
head(qlf_79A_BBz)
str(qlf_79A_BBz)

gsign_79A_BBz <- qlf_79A_BBz$table[qlf_79A_BBz$table$FDR<0.05,]
gsign_79A_BBz <- gsign_79A_BBz[order(gsign_79A_BBz$FDR),]
dim(gsign_79A_BBz)
head(gsign_79A_BBz)
head(gsign_79A_BBz,30)
str(gsign_79A_BBz)
summary(gsign_79A_BBz)

##convert to df

gsign_79A_BBz_df <-data.frame(gsign_79A_BBz)
head(gsign_79A_BBz_df)
view(gsign_79A_BBz_df)
new_gsign_79A_BBz <- cbind(rownames(gsign_79A_BBz_df),gsign_79A_BBz_df)
view(gsign_79A_BBz_df)
rownames(new_gsign_79A_BBz)<-NULL 
view(new_gsign_79A_BBz)
colnames(new_gsign_79A_BBz)[1]<-"genename"
View(new_gsign_79A_BBz)
head(new_gsign_79A_BBz)
#save file
write.csv(new_gsign_79A_BBz, file="new_gsign_79A_BBz.csv")
###############################
###############################
#detect gene that are differential expressed in BB vs 28z
qlf_BB_28z <- glmQLFTest(fit, coef=5)
topTags(qlf_BB_28z)
#closer look at individual counts per million for the top gens
top <- rownames(topTags(qlf_BB_28z))
topFDR <- top <- rownames(topTags(qlf_BB_28z))
cpm(dglist)[top,]
#total num of genes significantly up-regulated or down regulated at 5%FDR 
summary(decideTests(qlf_BB_28z))
#plot all the logFCs against average count size, highlighting the DE genes
plotMD(qlf_BB_28z)
# the blue line indicate 2 fold up or down
abline(h=c(-1,1), col="blue")
#Constructing a table showing differential expression with edge R, trimming out those p-value>=0.05
qlf_BB_28z$table <- cbind(qlf_BB_28z$table, FDR=p.adjust(qlf_BB_28z$table$PValue, method ='fdr'))
head(qlf_BB_28z)
str(qlf_BB_28z)

gsign_BB_28z <- qlf_BB_28z$table[qlf_BB_28z$table$FDR<0.05,]
gsign_BB_28z <- gsign_BB_28z[order(gsign_BB_28z$FDR),]
dim(gsign_BB_28z)
head(gsign_BB_28z)
head(gsign_BB_28z,30)
gsign_BB_28z
head(gsign_BB_28z)
str(gsign_BB_28z)
summary(gsign_BB_28z)
##convert to df
gsign_BB_28z_df <-data.frame(gsign_BB_28z)
head(gsign_BB_28z_df)
view(gsign_BB_28z_df)
new_gsign_BB_28z <- cbind(rownames(gsign_BB_28z_df),gsign_BB_28z_df)
view(gsign_BB_28z_df)
rownames(new_gsign_BB_28z)<-NULL 
view(new_gsign_BB_28z)
colnames(new_gsign_BB_28z)[1]<-"genename"
View(new_gsign_BB_28z)
head(new_gsign_BB_28z)
##save file
write.csv(new_gsign_BB_28z, file="new_gsign_BB_28z.csv")

######################################
######################################
#detect gene that are differential expressed in 28z vs 79A
qlf28z_79A <- glmQLFTest(fit, contrast=c(0,0,0,-1,1))
topTags(qlf28z_79A)
#total num of genes significantly up-regulated or down regulated at 5%FDR 
summary(decideTests(qlf28z_79A))
#closer look at individual counts per million for the top gens
top <- rownames(topTags(qlf28z_79A))
cpm(dglist)[top,]
#plot all the logFCs against average count size, highlighting the DE genes
plotMD(qlf_BB_28z)
# the blue line indicate 2 fold up or down
abline(h=c(-1,1), col="blue")
#Constructing a table showing differential expression with edge R, trimming out those p-value>=0.05
qlf28z_79A$table <- cbind(qlf28z_79A$table, FDR=p.adjust(qlf28z_79A$table$PValue, method ='fdr'))
head(qlf28z_79A)
str(qlf28z_79A)

gsign_28z_79A <- qlf28z_79A$table[qlf28z_79A$table$FDR<0.05,]
gsign_28z_79A <- gsign_28z_79A[order(gsign_28z_79A$FDR),]
dim(gsign_28z_79A)
head(gsign_28z_79A)
head(gsign_28z_79A,30)
str(gsign_28z_79A)
summary(gsign_28z_79A)
##convert to data frame
gsign_28z_79A_df <-data.frame(gsign_28z_79A)
head(gsign_28z_79A_df)
view(gsign_28z_79A_df)
new_gsign_28z_79A <- cbind(rownames(gsign_28z_79A_df),gsign_28z_79A_df)
view(gsign_28z_79A_df)
rownames(new_gsign_28z_79A)<-NULL 
view(new_gsign_28z_79A)
colnames(new_gsign_28z_79A)[1]<-"genename"
View(new_gsign_28z_79A)
head(new_gsign_28z_79A)
write.csv(new_gsign_28z_79A, file="new_gsign_28z_79A.csv")
######################################

edgeR DEGs • 614 views

ADD COMMENT • link updated 2.1 years ago by Kevin Blighe ★ 3.9k • written 2.1 years ago by RR • 0