I am analyzing a time course RNA-seq dataset (3 replicates) with 5 timepoints (0h, 1h, 2h, 6h, 24h) and 2 treatment conditions (drug1 and drug2). I want to find out the changes in gene expression patterns across 0-24 hours upon drug treatment. Below is the code I used. My questions:
Why are the logFC values all negative? Am I doing something wrong?
library(edgeR)
library(limma)
x <- read.csv("sample_matrix_rawcount_Tg.csv", header=T, stringsAsFactors=FALSE, row.names=1)
x <- data.matrix(x)
x
                      Baseline.0h_1 DMSO.1h_1 Tg.1h_1 DMSO.2h_1 Tg.2h_1 DMSO.6h_1 Tg.6h_1
Xkr4                          680       650     751       783     542       886    1183
Rp1                             2         9       8        22      22         0       1
Sox17                          25        40      42        11      53         3       1
...
                   DMSO.24h_1 Tg.24h_1 Baseline.0h_2 DMSO.1h_2 Tg.1h_2 DMSO.2h_2 Tg.2h_2
Xkr4                      1227     1885          1021      1000     477       706     505
Rp1                          0        0             0         0      29         0      21
Sox17                        0        9             0         0      73         0      14
...
                    DMSO.6h_2 Tg.6h_2 DMSO.24h_2 Tg.24h_2 Baseline.0h_3 DMSO.1h_3 Tg.1h_3
Xkr4                     1071     337        990     1287           930       998     754
Rp1                         1      13          0        0             0         0       3
Sox17                       1      67          0        0             0         0      44
...
                    DMSO.2h_3 Tg.2h_3 DMSO.6h_3 Tg.6h_3 DMSO.24h_3 Tg.24h_3
Xkr4                      784     566      1125     365       1088     1585
Rp1                         0      28         0      16          0        0
Sox17                       0      73         0      31          0        0
time<-factor(c("0h","1h", "1h",
               "2h", "2h",
               "6h", "6h",
               "24h","24h",
               "0h", "1h", "1h",
               "2h", "2h",
               "6h", "6h",
               "24h","24h",
               "0h","1h", "1h",
               "2h", "2h",
               "6h", "6h",
               "24h","24h"))
y <- DGEList(counts=x[,1:27], genes = x[row.names(x)], group = time) #read counts.csv and make table 
y$samples
levels(y$samples$group)
dim(y)
   [1] 24421    27
head(y$counts)
head(cpm(y))
apply(y$counts, 1, sum) # total gene counts per sample
keep <- rowSums(cpm(y)>100) >= 1
y <- y[keep,]
dim(y)
     [1] 3302   27
y$samples$lib.size <- colSums(y$counts)
y$samples
y <- calcNormFactors(y)
y
plotMDS(y, method="bcv", col=as.numeric(y$samples$group))
legend("bottomleft", as.character(unique(y$samples$group)), col=1:3, pch=20)
design <- model.matrix(~0+time, data = y$samples) 
colnames(design) <- levels(time)
rownames(design) <- colnames(y)
design
                0h 1h 24h 2h 6h
Baseline.0h_1  1  0   0  0  0
DMSO.1h_1      0  1   0  0  0
Tg.1h_1        0  1   0  0  0
DMSO.2h_1      0  0   0  1  0
Tg.2h_1        0  0   0  1  0
DMSO.6h_1      0  0   0  0  1
Tg.6h_1        0  0   0  0  1
DMSO.24h_1     0  0   1  0  0
Tg.24h_1       0  0   1  0  0
Baseline.0h_2  1  0   0  0  0
DMSO.1h_2      0  1   0  0  0
Tg.1h_2        0  1   0  0  0
DMSO.2h_2      0  0   0  1  0
Tg.2h_2        0  0   0  1  0
DMSO.6h_2      0  0   0  0  1
Tg.6h_2        0  0   0  0  1
DMSO.24h_2     0  0   1  0  0
Tg.24h_2       0  0   1  0  0
Baseline.0h_3  1  0   0  0  0
DMSO.1h_3      0  1   0  0  0
Tg.1h_3        0  1   0  0  0
DMSO.2h_3      0  0   0  1  0
Tg.2h_3        0  0   0  1  0
DMSO.6h_3      0  0   0  0  1
Tg.6h_3        0  0   0  0  1
DMSO.24h_3     0  0   1  0  0
Tg.24h_3       0  0   1  0  0
attr(,"assign")
[1] 1 1 1 1 1
attr(,"contrasts")
attr(,"contrasts")$time
[1] "contr.treatment"
z <- estimateDisp(y,design, robust = TRUE)
sqrt(z$common.dispersion)
plotBCV(z)
plotMDS(z, labels = time)
fit <- glmQLFit(z,design, robust = TRUE)
plotQLDisp(fit)
qlf <- glmQLFTest(fit,coef=1:5)
summary(decideTests(qlf))
6h-2h-24h-1h-0h
NotSig               0
Sig               3302
topTags(qlf, n=1000000)
tab2 <- as.data.frame(topTags(qlf, n=1000000)) 
Coefficient:  0h 1h 24h 2h 6h 
              genes  logFC.0h  logFC.1h logFC.24h  logFC.2h  logFC.6h   logCPM        F
Ints7            NA -12.26534 -12.27342 -12.35646 -12.28617 -12.25947 7.641854 43991.27
Cs               NA -11.58730 -11.55527 -11.53684 -11.59889 -11.56937 8.364561 42149.50
Yme1l1           NA -11.34767 -11.45401 -11.35738 -11.39468 -11.41588 8.533332 40818.42
Elavl1           NA -11.65959 -11.65544 -11.69609 -11.68613 -11.60697 8.271334 40671.67
Pdzd8            NA -11.21080 -11.16845 -11.08415 -11.22524 -11.15235 8.769288 40341.18
Rab14            NA -11.63948 -11.69195 -11.65178 -11.68748 -11.62025 8.271978 39723.03
Ppp6r3           NA -11.48140 -11.49594 -11.50953 -11.50732 -11.40042 8.453886 38766.01
Rnf14            NA -11.90126 -11.87093 -11.85145 -11.84206 -11.89443 8.063408 37941.80
Rabl6            NA -12.42685 -12.48067 -12.42238 -12.39573 -12.36443 7.515743 37101.46
Fam126b          NA -12.78592 -12.73044 -12.72118 -12.61550 -12.75967 7.217930 36670.18
Csnk2a1          NA -11.58646 -11.56253 -11.53919 -11.58012 -11.58708 8.362689 36437.87
Ube2i            NA -11.95506 -11.96058 -11.93990 -12.06529 -12.02329 7.940364 36216.62
sessionInfo( )
R version 3.6.3 (2020-02-29)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 19042)
Matrix products: default
locale:
[1] LC_COLLATE=English_United States.1252  LC_CTYPE=English_United States.1252   
[3] LC_MONETARY=English_United States.1252 LC_NUMERIC=C                          
[5] LC_TIME=English_United States.1252    
attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     
other attached packages:
[1] RColorBrewer_1.1-2 edgeR_3.28.1       limma_3.42.2      
loaded via a namespace (and not attached):
[1] compiler_3.6.3  tools_3.6.3     Rcpp_1.0.7      splines_3.6.3   grid_3.6.3     
[6] locfit_1.5-9.4  statmod_1.4.36  lattice_0.20-41
                    
                
                
Thanks Gordon Smyth , that helped. Also how do I put 24h at the end? I want to plot the observed and fitted values for each gene across timepoints as shown in section 4.8 of the manual. Should I use relevel
time <- relevel(time, ref="0h")?The order of the factor levels makes no difference to the analysis. But you can order the time points if you wish by setting
See
?factor. Usingrelevelwill do nothing because0his already the first level.