Entering edit mode
Hi guys,
I would you to revise my edgeR code since it's possible I missed
something
important because being quite new on this.
Thanks, Bernardo
P.S. This questions has also been posted in Biostar forum:
https://www.biostars.org/p/99849/
############################################################
#htseq-count stats
############################################################
# rRNA and tRNA will be discarded in counts file because the arbitrary
mapped reads to these regions
# NOTE: minimun alignment quality is set to 10
# '-t CDS -i ID' will exclude rRNA and tRNA. Also 'Parent' will give
the
correct locus tag name to each 'feature' in count table.
python -m HTSeq.scripts.count -m intersection-nonempty -a 10 -t CDS -i
ID
14.sam HS372.gff > 14.counts
python -m HTSeq.scripts.count -m intersection-nonempty -a 10 -t CDS -i
ID
15.sam HS372.gff > 15.counts
python -m HTSeq.scripts.count -m intersection-nonempty -a 10 -t CDS -i
ID
16.sam HS372.gff > 16.counts
#Last lines from .counts files
#14.counts
__no_feature 271363
__ambiguous 6
__too_low_aQual 137308
__not_aligned 133836
__alignment_not_unique 0
#15.counts
__no_feature 346247
__ambiguous 3
__too_low_aQual 148014
__not_aligned 135920
__alignment_not_unique 0
#16.counts
__no_feature 1067474
__ambiguous 0
__too_low_aQual 136484
__not_aligned 110488
__alignment_not_unique 0
############################################################
#edgeR
############################################################
#R code
library(edgeR)
library(WriteXLS)
dir () # The tag counts for the two individual libraries are stored in
two
separate plain text files. In each file, the tag IDs and counts for
each
tag are provided in a table
targets <- read.delim("targets.txt", stringsAsFactors = FALSE)
targets
# files group description
#1 14.counts biofilm biofilm F9
#2 15.counts planktonic planktonic F9
#3 16.counts stationary stationary F9
RG <- readDGE(targets)
colnames(RG) <- c("biofilm","planktonic","stationary")
RG
dim(RG)
#filter low expressed transcripts
keep <- rowSums(cpm(RG)>1) > 1 #we keep genes that achieve at least
one
count per million (cpm) in at least TWO samples
RG <- RG[keep,]
dim(RG)
RG$samples$lib.size <- colSums(RG$counts) # After filtering, it is a
good
idea to reset the library sizes:
RG$samples
#Normalization
RG <- calcNormFactors(RG) # Apply TMM normalization
RG$samples # se manual
RG
############################################################
#Bio_vs_Plank
############################################################
bcv <- 0.2 # Assume a low BCV value of 0.2. The BCV (square root of
the
common dispersion) here is 20%, stilgthly higher than a typical size
for a
laboratory experiment with a cell line or a model organism.
et <- exactTest(RG, pair=c("planktonic","biofilm"),dispersion=bcv^2) #
exactTest. dispersion = 0.04
et
class(et)
top <- topTags(et) # Top ten differentially expressed tags with their
p-values
top
class(top)
cpm(RG)[rownames(top), ] # Check the individual cpm values for the top
ten
genes
summary(de <- decideTestsDGE(et, p=0.05, adjust="BH")) # The total
number
of differentiallly expressed genes at FDR< 0.05
# Here the entries for -1, 0 and 1 are for down-regulated,
non-differentially expressed and up-regulated tags respectively.
#-1 54
#0 2153
#1 52
detags <- rownames(RG)[as.logical(de)] # detags contains the DE genes
at 5%
FDR
head (detags)
plotSmear(et, de.tags=detags, ylim=c(-7,7), xlim=c(0,15), cex.lab=1.4,
cex.axis=1) # plot all genes and highlight DE genes at 5% FDR
abline(h=c(-1, 1), col="blue") # The blue lines indicate 2-fold
changes.
title("Biofilm vs planktonic")
dev.copy2pdf(file = "Figure_1.pdf") #Save as .pdf##
# NOTE -> adjust 'n' depending on the available number of genes
Bio_vs_Plank <- topTags(et, n=2259, adjust.method="BH",
sort.by="logFC")
# export excel file
x <- Bio_vs_Plank$table
WriteXLS("x","Bio_vs_Plank.xls", row.names = TRUE, col.names = TRUE)
# export genes for TopGO. DEG_up_pvalue_0.05
x.df <- Bio_vs_Plank$table
x.sub <- subset(x.df, logFC > 0 & PValue < 0.05)
x.sub
DEG_up_Pvalue_005 <- rownames(x.sub)
write.table(DEG_up_Pvalue_005, "Bio_vs_Plank_DEG_up_pvalue_005.txt",
quote
= FALSE, row.names = FALSE, col.names = FALSE)
# export genes for TopGO. DEG_down_pvalue_0.05
x.df <- Bio_vs_Plank$table
x.sub <- subset(x.df, logFC < 0 & PValue < 0.05)
x.sub
DEG_down_Pvalue_005 <- rownames(x.sub)
write.table(DEG_down_Pvalue_005,
"Bio_vs_Plank_DEG_down_pvalue_005.txt",
quote = FALSE, row.names = FALSE, col.names = FALSE)
# export reference gene set
reference_set <- rownames(RG$counts)
write.table(reference_set, "reference_set.txt", quote = FALSE,
row.names =
FALSE, col.names = FALSE)
--
*Bernardo Bello OrtÃ*
PhD student
CReSA-IRTA
Campus de Bellaterra-Universitat Autònoma de Barcelona
Edifici CReSA
08193 Bellaterra (Barcelona, Spain)
Tel.: 647 42 52 63 *www.cresa.es <http: www.cresa.es=""> *
[[alternative HTML version deleted]]