Dear edgeR users,
I am not an experienced R user. I ran the edgeR for the TCGA RNAseq data using raw count from the Rsubread featureCounts and the TCGA miRNAseq data using raw count from TCGA level3 data to identify differential expressed genes, see the codes below; now I want to examine miRNA and mRNA expresion interaction, Should I use the Count per Million (CPM) from edgeR as input data for the interaction prediction?
thank you very much!!
Yuanchun Ding
---------------------------------------------------------------------------------------------------------------------------------
#loading edgeR, limma and splines;
library(edgeR)
library(limma)
library(splines)
#create treatment factor with old group as reference group and then create batchid factor
Treat <- factor(group$agegroup)
Treat <- relevel(Treat, ref="old")
batch <- factor(group$batchid)
#filter genes to include genes with at least two counts per million in at least three samples
keep=rowSums(cpm(counts) >2) >= 3
counts_final=counts[keep,]
table(keep)
#create a DGEList and apply TMM normalization
y=DGEList(counts=counts_final, group=Treat)
y=calcNormFactors(y)
#to create design matrix using batch as a block factor,so compare young to old within each batch
design=model.matrix(~batch + Treat)
rownames(design)=colnames(y)
# to estimate dispersion and plot BCV
y <- estimateGLMCommonDisp(y, design)
y <- estimateGLMTrendedDisp(y, design)
y <- estimateGLMTagwiseDisp(y, design)
plotBCV(y)
# to detrmine differential expresssed genes; by default, the test is for the last
# coefficient in the desgin matrix, which is the young to old treatment effect.
fit=glmFit(y, design)
lrt=glmLRT(fit)
DEGs_all=topTags(lrt,n=15223)
write.csv(DEGs_all, "DEGs_all_052715.csv")
#subset genes with FDR < 0.05 and abs(logFC)>1
DEGs_FDR05 = subset(as.data.frame(DEGs_all), FDR < 0.05)
DEGs_FDR05_logFC1=subset(DEGs_FDR05, abs(logFC)>1)
genes_FDR05=rownames(DEGs_FDR05)
write.csv(DEGs_FDR05_logFC1, "DEGs_FDR05_FC2.csv")
#summarize total nubmer of differential expressed genes at 5% FDR
dt=decideTestsDGE(lrt)
summary(dt)
#to pick out DEGs
isDE=as.logical(dt)
DEnames=rownames(y)[isDE]
# to plot all the logFCs against average count size, highlighting DEGs
plotSmear(lrt, de.tags=DEnames)
abline(h=c(-1,1), col="blue")
# to export individual counts-per-million for all 15223 genes
top15223=rownames(topTags(lrt, n=15223))
CPM_15223genes=cpm(y)[top15223,]
write.csv(CPM_15223genes_trans, "CPM_15223genes.csv")
--------------------------------------------------------------------------------------------------------------------------------------