I am analyzing a time course RNA-seq dataset (3 replicates) with 5 timepoints (0h, 1h, 2h, 6h, 24h) and 2 treatment conditions (drug1 and drug2). I want to find out the changes in gene expression patterns across 0-24 hours upon drug treatment. Below is the code I used. My questions:
Why are the logFC values all negative? Am I doing something wrong?
library(edgeR)
library(limma)
x <- read.csv("sample_matrix_rawcount_Tg.csv", header=T, stringsAsFactors=FALSE, row.names=1)
x <- data.matrix(x)
x
Baseline.0h_1 DMSO.1h_1 Tg.1h_1 DMSO.2h_1 Tg.2h_1 DMSO.6h_1 Tg.6h_1
Xkr4 680 650 751 783 542 886 1183
Rp1 2 9 8 22 22 0 1
Sox17 25 40 42 11 53 3 1
...
DMSO.24h_1 Tg.24h_1 Baseline.0h_2 DMSO.1h_2 Tg.1h_2 DMSO.2h_2 Tg.2h_2
Xkr4 1227 1885 1021 1000 477 706 505
Rp1 0 0 0 0 29 0 21
Sox17 0 9 0 0 73 0 14
...
DMSO.6h_2 Tg.6h_2 DMSO.24h_2 Tg.24h_2 Baseline.0h_3 DMSO.1h_3 Tg.1h_3
Xkr4 1071 337 990 1287 930 998 754
Rp1 1 13 0 0 0 0 3
Sox17 1 67 0 0 0 0 44
...
DMSO.2h_3 Tg.2h_3 DMSO.6h_3 Tg.6h_3 DMSO.24h_3 Tg.24h_3
Xkr4 784 566 1125 365 1088 1585
Rp1 0 28 0 16 0 0
Sox17 0 73 0 31 0 0
time<-factor(c("0h","1h", "1h",
"2h", "2h",
"6h", "6h",
"24h","24h",
"0h", "1h", "1h",
"2h", "2h",
"6h", "6h",
"24h","24h",
"0h","1h", "1h",
"2h", "2h",
"6h", "6h",
"24h","24h"))
y <- DGEList(counts=x[,1:27], genes = x[row.names(x)], group = time) #read counts.csv and make table
y$samples
levels(y$samples$group)
dim(y)
[1] 24421 27
head(y$counts)
head(cpm(y))
apply(y$counts, 1, sum) # total gene counts per sample
keep <- rowSums(cpm(y)>100) >= 1
y <- y[keep,]
dim(y)
[1] 3302 27
y$samples$lib.size <- colSums(y$counts)
y$samples
y <- calcNormFactors(y)
y
plotMDS(y, method="bcv", col=as.numeric(y$samples$group))
legend("bottomleft", as.character(unique(y$samples$group)), col=1:3, pch=20)
design <- model.matrix(~0+time, data = y$samples)
colnames(design) <- levels(time)
rownames(design) <- colnames(y)
design
0h 1h 24h 2h 6h
Baseline.0h_1 1 0 0 0 0
DMSO.1h_1 0 1 0 0 0
Tg.1h_1 0 1 0 0 0
DMSO.2h_1 0 0 0 1 0
Tg.2h_1 0 0 0 1 0
DMSO.6h_1 0 0 0 0 1
Tg.6h_1 0 0 0 0 1
DMSO.24h_1 0 0 1 0 0
Tg.24h_1 0 0 1 0 0
Baseline.0h_2 1 0 0 0 0
DMSO.1h_2 0 1 0 0 0
Tg.1h_2 0 1 0 0 0
DMSO.2h_2 0 0 0 1 0
Tg.2h_2 0 0 0 1 0
DMSO.6h_2 0 0 0 0 1
Tg.6h_2 0 0 0 0 1
DMSO.24h_2 0 0 1 0 0
Tg.24h_2 0 0 1 0 0
Baseline.0h_3 1 0 0 0 0
DMSO.1h_3 0 1 0 0 0
Tg.1h_3 0 1 0 0 0
DMSO.2h_3 0 0 0 1 0
Tg.2h_3 0 0 0 1 0
DMSO.6h_3 0 0 0 0 1
Tg.6h_3 0 0 0 0 1
DMSO.24h_3 0 0 1 0 0
Tg.24h_3 0 0 1 0 0
attr(,"assign")
[1] 1 1 1 1 1
attr(,"contrasts")
attr(,"contrasts")$time
[1] "contr.treatment"
z <- estimateDisp(y,design, robust = TRUE)
sqrt(z$common.dispersion)
plotBCV(z)
plotMDS(z, labels = time)
fit <- glmQLFit(z,design, robust = TRUE)
plotQLDisp(fit)
qlf <- glmQLFTest(fit,coef=1:5)
summary(decideTests(qlf))
6h-2h-24h-1h-0h
NotSig 0
Sig 3302
topTags(qlf, n=1000000)
tab2 <- as.data.frame(topTags(qlf, n=1000000))
Coefficient: 0h 1h 24h 2h 6h
genes logFC.0h logFC.1h logFC.24h logFC.2h logFC.6h logCPM F
Ints7 NA -12.26534 -12.27342 -12.35646 -12.28617 -12.25947 7.641854 43991.27
Cs NA -11.58730 -11.55527 -11.53684 -11.59889 -11.56937 8.364561 42149.50
Yme1l1 NA -11.34767 -11.45401 -11.35738 -11.39468 -11.41588 8.533332 40818.42
Elavl1 NA -11.65959 -11.65544 -11.69609 -11.68613 -11.60697 8.271334 40671.67
Pdzd8 NA -11.21080 -11.16845 -11.08415 -11.22524 -11.15235 8.769288 40341.18
Rab14 NA -11.63948 -11.69195 -11.65178 -11.68748 -11.62025 8.271978 39723.03
Ppp6r3 NA -11.48140 -11.49594 -11.50953 -11.50732 -11.40042 8.453886 38766.01
Rnf14 NA -11.90126 -11.87093 -11.85145 -11.84206 -11.89443 8.063408 37941.80
Rabl6 NA -12.42685 -12.48067 -12.42238 -12.39573 -12.36443 7.515743 37101.46
Fam126b NA -12.78592 -12.73044 -12.72118 -12.61550 -12.75967 7.217930 36670.18
Csnk2a1 NA -11.58646 -11.56253 -11.53919 -11.58012 -11.58708 8.362689 36437.87
Ube2i NA -11.95506 -11.96058 -11.93990 -12.06529 -12.02329 7.940364 36216.62
sessionInfo( )
R version 3.6.3 (2020-02-29)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 19042)
Matrix products: default
locale:
[1] LC_COLLATE=English_United States.1252 LC_CTYPE=English_United States.1252
[3] LC_MONETARY=English_United States.1252 LC_NUMERIC=C
[5] LC_TIME=English_United States.1252
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] RColorBrewer_1.1-2 edgeR_3.28.1 limma_3.42.2
loaded via a namespace (and not attached):
[1] compiler_3.6.3 tools_3.6.3 Rcpp_1.0.7 splines_3.6.3 grid_3.6.3
[6] locfit_1.5-9.4 statmod_1.4.36 lattice_0.20-41
Thanks Gordon Smyth , that helped. Also how do I put 24h at the end? I want to plot the observed and fitted values for each gene across timepoints as shown in section 4.8 of the manual. Should I use relevel
time <- relevel(time, ref="0h")
?The order of the factor levels makes no difference to the analysis. But you can order the time points if you wish by setting
See
?factor
. Usingrelevel
will do nothing because0h
is already the first level.