Question

design matrix edge R pairwise comparison at different timepoints after infection with replicates

0

Entering edit mode

Kaat De Cremer ▴ 70

@kaat-de-cremer-5346

Last seen 9.6 years ago

Dear edgeR patrons, I am using edgeR to find genes differentially expressed between infected and mock-infected control plants, at 3 time points after infection. I have RNAseq data for 3 independent tests, so for every single test I have 6 libraries (control + infected at 3 time points). Having three replicates this makes 18 libraries in total. What I did until now is look at each time point separate and calculate DEgenes at that time point as shown in this script: > head(x) C1 C2 C3 T1 T2 T3 1 0 1 2 0 0 0 2 13 6 4 10 8 12 3 17 16 9 10 8 11 4 2 1 2 2 3 2 5. 1 3 1 2 1 3 0 6 958 457 438 565 429 518 > treatment<-factor(c("C","C","C","T","T","T")) > test<-factor(c(1,2,3,1,2,3)) > y<-DGEList(counts=x,group=treatment) Calculating library sizes from column totals. > cpm.y<-cpm(y) > y<-y[rowSums(cpm.y>2)>=3,] > y<-calcNormFactors(y) > design<-model.matrix(~test+treat) > y<-estimateGLMCommonDisp(y,design,verbose=TRUE) Disp = 0.0265 , BCV = 0.1628 > y<-estimateGLMTrendedDisp(y,design) Loading required package: splines > y<-estimateGLMTagwiseDisp(y,design) > fit<-glmFit(y,design) > lrt<-glmLRT(y,fit) This works fine but I wonder if I should do the analysis of the different time points all at once? Will this make a difference? Unfortunately I cannot figure out how to design the matrix. I hope someone can help me, Kaat [[alternative HTML version deleted]]

RNASeq edgeR RNASeq edgeR • 2.4k views

ADD COMMENT • link updated 8.7 years ago by jribeiro • 0 • written 11.9 years ago by Kaat De Cremer ▴ 70

score 0 · Answer 1 · 2015-08-10

Hi Kaat,

I have a similar problem, exactly same design, two treatments (M/R) and 3 time points, 24,48 and 72 h. Here is the script.

Regards,

Jose Ribeiro

#Note, input is READ count not RPKM, but I filtered for contigs having RPKM=> 10
#I redirect output below, check path for your machine
x <- read.delim("all-rpkm10.txt",row.names="Symbol")
head(x)
# M24_1 M24_2 M24_3 M48_1 M48_2 M48_3 M72_1
#Ir-SigP-5025_FR5_171-225 73300 437360 549622 602856 659898 505265 461383
#Ir-SigP-261154_FR6_668-747 91394 597844 766361 871606 904314 664936 592187
#Ir-SigP-15814_FR2_11-76 151729 466056 589993 669327 713620 555262 499542
#Ir-SigP-238894_FR6_217-303 109401 548717 706688 759050 819945 630981 556457
#Ir-273528 513723 477842 625334 243488 436690 317665 363292
#Ir-261740 450616 404725 529356 212800 377810 276754 307546
# M72_2 M72_3 R24_1 R24_2 R24_3 R48_1 R48_2
#Ir-SigP-5025_FR5_171-225 578678 492101 636488 689458 848344 1088458 718386
#Ir-SigP-261154_FR6_668-747 775722 606867 918875 982119 1069844 1473023 993263
#Ir-SigP-15814_FR2_11-76 641585 509077 671750 746265 885797 1066416 785909
#Ir-SigP-238894_FR6_217-303 716663 579172 807450 882311 1078659 1270305 906349
#Ir-273528 392772 208231 529106 693211 481182 415799 362500
#Ir-261740 350966 194612 444408 572590 399377 350478 314461
# R48_3 R72_1 R72_2 R72_3
#Ir-SigP-5025_FR5_171-225 779567 770357 107456 633257
#Ir-SigP-261154_FR6_668-747 1094015 1097808 131793 863420
#Ir-SigP-15814_FR2_11-76 786518 816780 164134 667976
#Ir-SigP-238894_FR6_217-303 926835 966339 132874 781580
#Ir-273528 577026 316752 316714 394431
#Ir-261740 493309 268045 275242 346463

targets = read.delim("dge.list.txt")
#

head(targets)
# Symbol Treat Time lib.size norm.factor
#1 M24_1 M 24 25418308 1
#2 M24_2 M 24 20485372 1
#3 M24_3 M 24 24126994 1
#4 M48_1 M 48 20712009 1
#5 M48_2 M 48 23378038 1
#6 M48_3 M 48 21433471 1
#etc

#load experimental design
#
Time <- factor(c(24,24,24,48,48,48,72,72,72,24,24,24,48,48,48,72,72,72))
Treat <- factor(c("M","M","M","M","M","M","M","M","M","R","R","R","R","R","R","R","R","R"))
Group <- c(1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6)
y <- DGEList(counts=x,group=Group)
#
#normalize data
#
y <- calcNormFactors(y)
y$samples
#
# output
# group lib.size norm.factors
#M24_1 1 24840841 1.1143623
#M24_2 1 20057068 1.0036531
#M24_3 1 23604356 1.0823090
#M48_1 2 20328600 0.9721955
#M48_2 2 22938720 0.9353933
#M48_3 2 20941356 1.1539243
#M72_1 3 21952451 1.0681859
#M72_2 3 24321079 1.2883963
#M72_3 3 16778586 1.4357941
#R24_1 4 22747712 0.7207162
#R24_2 4 23904095 0.6871886
#R24_3 4 24595971 0.9272390
#R48_1 5 28033832 0.9396293
#R48_2 5 24341516 0.9390415
#R48_3 5 26004057 0.8688401
#R72_1 6 23384795 0.9517134
#R72_2 6 22311379 1.1321000
#R72_3 6 24086436 1.0503182
#
data.frame(Sample=colnames(y),Treat,Time)
# Sample Treat Time
#1 M24_1 M 24
#2 M24_2 M 24
#3 M24_3 M 24
#4 M48_1 M 48
#5 M48_2 M 48
#6 M48_3 M 48
#7 M72_1 M 72
#8 M72_2 M 72
#9 M72_3 M 72
#10 R24_1 R 24
#11 R24_2 R 24
#12 R24_3 R 24
#13 R48_1 R 48
#14 R48_2 R 48
#15 R48_3 R 48
#16 R72_1 R 72
#17 R72_2 R 72
#18 R72_3 R 72
design <- model.matrix(~Time+Treat)
rownames(design) <- colnames(y)
design
# output
# (Intercept) Time48 Time72 TreatR
#M24_1 1 0 0 0
#M24_2 1 0 0 0
#M24_3 1 0 0 0
#M48_2 1 1 0 0
#M48_3 1 1 0 0
#M72_1 1 0 1 0
#M72_2 1 0 1 0
#M72_3 1 0 1 0
#R24_1 1 0 0 1
#R24_2 1 0 0 1
#R24_3 1 0 0 1
#R48_1 1 1 0 1
#R48_2 1 1 0 1
#R48_3 1 1 0 1
#R72_1 1 0 1 1
#R72_2 1 0 1 1
#R72_3 1 0 1 1
#attr(,"assign")
#[1] 0 1 1 2
#attr(,"contrasts")
#attr(,"contrasts")$Time
#[1] "contr.treatment"
#
#attr(,"contrasts")$Treat
#[1] "contr.treatment"
#Plots multi-dimensional scaling plot (same concept, but different from, PCA)
colors<-c("black","black","black","red","red","red","blue","blue","blue")
plotMDS(y,col=colors,main = "MDS Plot for Count Data RPKM > 10")
#
#predFC {edgeR} Computes estimated coefficients for a NB glm (Negative Binomial Generalized Linear Model) in such a way that the log-fold-changes are shrunk towards zero.
#The higher prior.n, the closer the estimates will be to the common dispersion. The recommended value is the nearest integer to 50/(#samples - #groups).
#
dim(x)
y <- estimateCommonDisp(y)
names(y)
#[1] 20773 18 #rows and samples
y$common.dispersion
#[1] 0.369717
dispersion <- y$common.dispersion
#The higher prior.n, the closer the estimates will be to the common dispersion. The recommended value is the nearest integer to 50/(#samples - #groups).
#In our case #samples = 18, #groups = 6 or 50/(18-6) = 4
logFC <- predFC(y,design,prior.count=4,dispersion=dispersion)
sink("N:/temp/logFC.txt")
logFC
sink()
cor(logFC[,1:4])
# output
# (Intercept) Time48 Time72 TreatR
#(Intercept) 1.00000000 -0.197490299 -0.28046068 -0.044456142
#Time48 -0.19749030 1.000000000 0.73210204 -0.009613076
#Time72 -0.28046068 0.732102039 1.00000000 -0.026014629
#TreatR -0.04445614 -0.009613076 -0.02601463 1.000000000

#Now proceed to determine differentially expressed genes. Fit genewise glms:

y <- estimateGLMCommonDisp(y, design, verbose=TRUE)
#Disp = 0.43976 , BCV = 0.6631
y <- estimateGLMTrendedDisp(y, design)
y <- estimateGLMTagwiseDisp(y, design)
jpeg("n:/temp/BCV.jpg")
plotBCV(y)
dev.off()
fit <- glmFit(y, design)
#
#get list of FDR<0.05 genes for all data
#
lrt <- glmLRT(fit,coef=2:4)
FDR <- p.adjust(lrt$table$PValue, method="BH")
sum(FDR < 0.05)
#output
#[1] 1279
sink("n:/temp/edgeR-all-time-treatment.txt")
topTags(lrt,n=sum(FDR < 0.05))
sink()
#
#get list of FDR<0.05 genes 48h
#
lrt <- glmLRT(fit,coef=,2)
FDR <- p.adjust(lrt$table$PValue, method="BH")
sink("N:/temp/edgeR-48h.txt")
topTags(lrt,n=sum(FDR < 0.05))
sink()
#
#get list of FDR<0.05 genes 72h
#
lrt <- glmLRT(fit,coef=,3)
FDR <- p.adjust(lrt$table$PValue, method="BH")
sink("N:/temp/edgeR-72h.txt")
topTags(lrt,n=sum(FDR < 0.05))
sink()
#
#get list of FDR<0.05 genes treatment
lrt <- glmLRT(fit,coef=,4)
FDR <- p.adjust(lrt$table$PValue, method="BH")
sink("N:/temp/edgeR-Treat.txt")
topTags(lrt,n=sum(FDR < 0.05))
sink()
#
#========================================DEG PLOTS (similar in purpose to volcano plots=================================
#obtain DEG plots
lrt <- glmLRT(fit,coef=,4)
FDR <- p.adjust(lrt$table$PValue, method="BH")
sum(FDR < 0.05)
top <- rownames(topTags(lrt))
cpm(y)[top,]
summary(dt <- decideTestsDGE(lrt))
isDE <- as.logical(dt)
DEnames <- rownames(y)[isDE]
jpeg("N:/temp/DEG-treat.jpg")
plotSmear(lrt, de.tags=DEnames,main="Treatment")
abline(h=c(-1,1), col="blue")
dev.off()
#time 48h
lrt <- glmLRT(fit,coef=,2)
FDR <- p.adjust(lrt$table$PValue, method="BH")
sum(FDR < 0.05)
top <- rownames(topTags(lrt))
cpm(y)[top,]
summary(dt <- decideTestsDGE(lrt))
isDE <- as.logical(dt)
DEnames <- rownames(y)[isDE]
jpeg("N:/temp/DEG-48h.jpg")
plotSmear(lrt, de.tags=DEnames,main="48 h")
abline(h=c(-1,1), col="blue")
dev.off()
#time 72h
lrt <- glmLRT(fit,coef=,3)
FDR <- p.adjust(lrt$table$PValue, method="BH")
sum(FDR < 0.05)
top <- rownames(topTags(lrt))
cpm(y)[top,]
summary(dt <- decideTestsDGE(lrt))
isDE <- as.logical(dt)
DEnames <- rownames(y)[isDE]
jpeg("N:/temp/DEG-72h.jpg")
plotSmear(lrt, de.tags=DEnames,main="72 h")
abline(h=c(-1,1), col="blue")
dev.off()