Classic or GLM approach in edgeR, Why do I got less DEG in GLM approach?
1
0
Entering edit mode
RR • 0
@7e31e763
Last seen 6 weeks ago
Thailand

Hello all, I am doing gene expression profiling of 3 structures(BB, 79A, 28z). I have 9 samples from 3 donors. Each donor has 3 samples, representing 3 structures. For example Donor 1 has structure BB, 79A, 28z , Donor 2 has structure BB, 79A, 28z Donor 3 has structure BB, 79A, 28z. I would like to see the differentially expressed gene of structure 79A. I compare structure 79A vs 28z, 79A vs BB and 28z vs BB. I have tried both classic and glm approach. I got 24 DEGs in classic but only 6 DEGs in GLMs approach. Thank you in advance!!

The following are my script in classic approach (comparing 79A vs 28z)

counts <- read.table('NewCounting_file.txt')
    head(counts)
    str(counts)
    #create gene name and gene id variable
    gene_name <- counts[,2]
    gene_id <- counts[,1]
    class(counts)
    head(gene_id)
    head(gene_name)
    #Delete gene id and gene name column on counts 
    counts[1:2] <- list(NULL)
    head(counts)
    ##Delete BB group V3 &V6
    counts[1] <- list(NULL)
    counts[3] <- list(NULL)
    head(counts)
    #create gene count numeric matrix 
    counts_mat <- apply(as.matrix.noquote(counts),2,as.numeric)
    head(counts_mat)
    class(counts_mat) 
    row.names(counts_mat) <- gene_name
    head(counts_mat)
    newsample_name <- c("79A","28z","79A","28z","79A","28z")
    colnames(counts_mat) <- newsample_name
    head(counts_mat)
    library(edgeR)
    #create group variables
    groups79A_28z <- c("79A","28z","79A","28z","79A","28z")
    geneexp79A_28z <- DGEList(counts=counts_mat, group=groups79A_28z)
    head(geneexp79A_28z)
    geneexp79A_28z <- calcNormFactors(geneexp79A_28z)
    head(geneexp79A_28z)
    geneexp79A_28z<- estimateCommonDisp(geneexp79A_28z)
    head(geneexp79A_28z)
    geneexp79A_28z <- estimateTagwiseDisp(geneexp79A_28z)
    head(geneexp79A_28z)
    filterByExpr(geneexp79A_28z)
    head(geneexp79A_28z)
    genediff79A_28z <- exactTest(geneexp79A_28z)
    genediff79A_28z 
    topTags(genediff79A_28z)
    str(genediff79A_28z)
    genediff79A_28z$table <- cbind(genediff79A_28z$table, FDR=p.adjust(genediff79A_28z$table$PValue, method ='fdr'))
    head(genediff79A_28z)
    str(genediff79A_28z)
    gsign79A_28z <- genediff79A_28z$table[genediff79A_28z$table$FDR<0.05,]
    gsign79A_28z <- gsign79A_28z[order(gsign79A_28z$FDR),]
    dim(gsign79A_28z)
    head(gsign79A_28z)
---------

Below is my glm approach

counts <- read.table('NewCounting_file.txt')
head(counts)
str(counts)
#create gene name and gene id variable
gene_name <- counts[,2]
gene_id <- counts[,1]
class(counts)
head(gene_id)
head(gene_name)
#Delete gene id and gene name column on counts 
counts[1:2] <- list(NULL)
head(counts)
##Delete BB group V3 &V6
counts[1] <- list(NULL)
counts[3] <- list(NULL)
head(counts)
#create gene count numeric matrix 
counts_mat <- apply(as.matrix.noquote(counts),2,as.numeric)
head(counts_mat)
class(counts_mat) 
row.names(counts_mat) <- gene_name
head(counts_mat)
newsample_name <- c("79A","28z","79A","28z","79A","28z")
colnames(counts_mat) <- newsample_name
head(counts_mat)
library(edgeR)
#create group variables
group  <- c("79A","28z","79A","28z","79A","28z")
dglist  <- DGEList(counts=counts_mat, group=group)
keep <-filterByExpr(dglist)
summary(keep)
dglist <- dglist[keep, ,keep.lib.sizes=FALSE]
dim(dglist)
dglist <- calcNormFactors(dglist, method="TMM")
plotMDS(dglist)
dglist$samples
#Design matrix
group <- factor(dglist$samples, levels=c("79A","28z"))
design <- model.matrix(~0 +group)
design
dglist<- estimateDisp(dglist)
dglist$common.dispersion
plotBCV(dglist)
fit <- glmQLFit(dglist)
head(fit$coefficients)
plotQLDisp(fit)
edgeR DEGs • 253 views
ADD COMMENT
2
Entering edit mode
@gordon-smyth
Last seen 33 minutes ago
WEHI, Melbourne, Australia

See Section 3.4.2 of the edgeR User's Guide for how to analyse paired samples from three Donors. The Donor structure is a type of blocking.

You absolutely have to use glms but neither of your scripts are correct at the moment. You need to analyse all 9 samples together and you need to take account of which samples are from which Donor.

ADD COMMENT
0
Entering edit mode

Could you please help me explain the coef value? how to calculate its value ? Currently I got only 8 samples as another 1 is still in the process of sequencing but I want to try analyze these 8 samples first. Does the order of treatment ( A, B, C, B,A,C , C,B,A) correlate with coef value? Thank you so much for your comment, It means a lots to me, I try so hard to understand and writing the new script.

ADD REPLY
0
Entering edit mode

Excuse me Gordon Smyth, Could you help me review my new script following your guidance last 2 days please, I have tried to write again. I have attached the pheno column in the picture. Currently I have only 8 samples to analyze. Thank you in advance!

    #Transform HTseq count output into dataframe
    counts <- read.table('NewCounting_file.txt')
    head(counts)
    #checking the structure of data.frame
    str(counts)
    #create gene name and gene id variable
    gene_name <- counts[,2]
    gene_id <- counts[,1]
    class(counts)
    head(gene_id)
    head(gene_name)
    #Delete gene id and gene name column on counts 
    counts[1:2] <- list(NULL)
    head(counts)
    #create gene count numeric matrix 
    counts_mat <- apply(as.matrix.noquote(counts),2,as.numeric)
    head(counts_mat)
    class(counts_mat)
    #Assign gene name as row name 
    row.names(counts_mat) <- gene_name
    head(counts_mat)
    str(counts_mat)
    counts_mat
    #Assign group as column name 
    sample_name <- c("MD7_BB","MD7_79A","MD7_28","BD7_BB","BD7_79A","BD7_28","JD7_79A","JD7_28")
    colnames(counts_mat) <- sample_name
    head(counts_mat)
    ## read phenotype data in ============
    group <- c("MD7_BB","MD7_79A","MD7_28","BD7_BB","BD7_79A","BD7_28","JD7_79A","JD7_28")
    Pheno <- read.csv("mypheno.csv", stringsAsFactors = TRUE)
    dglist <- DGEList(counts=counts_mat, samples = Pheno, group = group)
    str(dglist)
    dglist
    dim(dglist)
    keep <-filterByExpr(dglist)
    summary(keep)
    dglist <- dglist[keep, ,keep.lib.sizes=FALSE]
    dim(dglist)
    #confirm that the num of gene in keep is equal to the number of gene/rows in dglist. if the same R will give ans TRUE
    length(keep[keep == TRUE]) == dim(dglist) [1]
    dglist <- calcNormFactors(dglist, method="TMM")
    plotMDS(dglist)
    dglist$samples
    dglist
    #Design matrix
    donors <- factor(dglist$sample$Donors)
    groups <- factor(dglist$sample$Structures)
    design <- model.matrix(~0 + donors + groups)
    design

    #By estimating gene dispersion, we are estimating the relative variability of true expression levle btw replicates
    #To estimate common ,trended and tagwise dispersion in one run :
    dglist<- estimateDisp(dglist, design, robust=TRUE)
    dglist$common.dispersion
    plotBCV(dglist)
    #Fitting
    fit <- glmQLFit(dglist,design, robust=TRUE)
    head(fit$coefficients)
    colnames(fit)
    plotQLDisp(fit)

    ##############DGE
    ##To detect genes that are differentially expressed btw any of three ttt
    qlf <- glmQLFTest(fit, coef=4:5)
    topTags(qlf)
    FDR <- p.adjust(qlf$table$PValue, method='fdr')
    sum(FDR < 0.05)
    summary(decideTests(qlf))
    #closer look at individual counts per million for the top gens
    top <- rownames(topTags(qlf))
    cpm(dglist)[top,]
    #total num of genes significantly up-regulated or down regulated at 5%FDR 
    summary(decideTests(qlf))
    #plot all the logFCs against average count size, highlighting the DE genes
    plotMD(qlf)
    # the blue line indicate 2 fold up or down
    abline(h=c(-1,1), col="blue")
    ###############################

    #detect gene that are differential expressed in 79A vs BBz
    qlf_79A_BBz <- glmQLFTest(fit, coef=4)
    # Show top 10 differentially expressed genes (DEG)
    topTags(qlf_79A_BBz)
    FDR <- p.adjust(qlf_79A_BBz$table$PValue, method='fdr')
    sum(FDR < 0.05)
    summary(decideTests(qlf_79A_BBz))
    #closer look at individual counts per million for the top gens
    top <- rownames(topTags(qlf_79A_BBz))
    cpm(dglist)[top,]
    #total num of genes significantly up-regulated or down regulated at 5%FDR 
    summary(decideTests(qlf_79A_BBz))
    tab79A_BBz <- as.data.frame(topTags(qlf_79A_BBz, n=30))
    #plot all the logFCs against average count size, highlighting the DE genes
    plotMD(qlf_79A_BBz)
    # the blue line indicate 2 fold up or down
    abline(h=c(-1,1), col="blue")
    result_79A_BBz <- topTags(qlf_79A_BBz, n=Inf)
    write.csv(result_79A_BBz, file="result_79A_BBz.csv")


    ###############################
    #detect gene that are differential expressed in BB vs 28z
    qlf_BB_28z <- glmQLFTest(fit, coef=5)
    topTags(qlf_BB_28z)
    FDR <- p.adjust(qlf_BB_28z$table$PValue, method='fdr')
    sum(FDR < 0.05)
    #closer look at individual counts per million for the top gens
    top <- rownames(topTags(qlf_BB_28z))
    topFDR <- top <- rownames(topTags(qlf_BB_28z))
    cpm(dglist)[top,]
    #total num of genes significantly up-regulated or down regulated at 5%FDR 
    summary(decideTests(qlf_BB_28z))
    tabBB_28z <- as.data.frame(topTags(qlf_BB_28z, n= 30))
    #plot all the logFCs against average count size, highlighting the DE genes
    plotMD(qlf_BB_28z)
    # the blue line indicate 2 fold up or down
    abline(h=c(-1,1), col="blue")

    ######################################
    #detect gene that are differential expressed in 28z vs 79A
    qlf28z_79A <- glmQLFTest(fit, contrast=c(0,0,0,-1,1))
    topTags(qlf28z_79A)
    FDR <- p.adjust(qlf28z_79A$table$PValue, method='fdr')
    sum(FDR < 0.05)
    summary(decideTests(qlf28z_79A))
    #closer look at individual counts per million for the top gens
    top <- rownames(topTags(qlf28z_79A))
    cpm(dglist)[top,]
    #total num of genes significantly up-regulated or down regulated at 5%FDR 
    summary(decideTests(qlf28z_79A))
    tab28z_79A <- as.data.frame(topTags(qlf28z_79A, n=30))
    #plot all the logFCs against average count size, highlighting the DE genes
    plotMD(qlf28z_79A)
    # the blue line indicate 2 fold up or down
    abline(h=c(-1,1), col="blue")!

enter image description here

ADD REPLY

Login before adding your answer.

Traffic: 370 users visited in the last hour
Help About
FAQ
Access RSS
API
Stats

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.

Powered by the version 2.3.6