I am trying to find out the differentially expressed gene in a RNAseq experiment without replicate. The first R script is inspired by this post on seqanswer and other script I always use for my other experiments with replicates. When I run first script I got 800 genes while second script is giving me the more than 3000 genes. Can you tell me that which is script should I use?
In first script I am getting the avgLogExpr and rLogFC . Can you tell me please that which one should be considered.
First script
# This R script will analyzed data when there is no replicate
# There script is inspired by this post
#http://seqanswers.com/forums/showthread.php?t=31036
#install.packages("DESeq2")
## try http:// if https:// URLs are not supported
#source("https://bioconductor.org/biocLite.R")
#biocLite("DESeq2")
# Set working directory
setwd("/Users/sanjay/Desktop/deseq");
getwd();
# Counts-Matrix
raw_count <- read.delim("count.txt", header=TRUE, row.names=1)
countdata1 <- round(raw_count)
#write.csv(countdata1, "final_filtered_data - Copy - Copy.csv")
head(countdata1)
# Remove all gene which has 0 value in all sample
all <- apply(countdata1, 1, function(x) all(x==0) )
newdata <- countdata1[!all,]
write.csv(newdata, file = "count_0_filter.csv")
head (newdata)
# remove uninformative genes keep only genes that are expressed in at least 200 count in 20 samples
dat <- newdata[rowSums(newdata > 20) >= 1,]
write.csv(dat, file = "rpkm.csv_0_filter_atleat_200_count.csv")
head (dat)
final_count <- read.csv("rpkm.csv_0_filter_atleat_200_count.csv", header=TRUE, row.names=1)
library(DESeq2)
# Convert to matrix
countdata <- as.matrix(final_count)
head(countdata)
(condition <- factor(c("control", "1h")))
(condition <- factor(c("control", "1h")))
# Create a coldata frame and instantiate the DESeqDataSet. See ?DESeqDataSetFromMatrix
(coldata <- data.frame(row.names=colnames(countdata), condition))
dds <- DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~condition)
dds
#set control as reference level
dds$condition <- relevel(dds$condition, "control")
dds <- DESeq(dds)
# Regularized log transformation for clustering/heatmaps, etc
rld <- rlogTransformation(dds)
head(assay(rld))
hist(assay(rld))
write.csv(resdata, file="rld.csv")
res <- data.frame(
assay(rld),
avgLogExpr = ( assay(rld)[,2] + assay(rld)[,1] ) / 2,
rLogFC = assay(rld)[,2] - assay(rld)[,1] )
head( res[ order(res$rLogFC), ] )
write.csv(res, file="DEG.xlsx")
Second script
# This R script will analyzed data when there is no replicate
#install.packages("DESeq2")
## try http:// if https:// URLs are not supported
#source("https://bioconductor.org/biocLite.R")
#biocLite("DESeq2")
# Set working directory
#setwd("/Users/Sanjay/Desktop/mirna");
#getwd();
# Set working directory
setwd("/Users/Sanjay/Desktop/deseq");
getwd();
# Counts-Matrix
raw_count <- read.delim("count.txt", header=TRUE, row.names=1)
countdata1 <- round(raw_count)
head(countdata1)
# Remove all gene which has 0 value in all sample
all <- apply(countdata1, 1, function(x) all(x==0) )
newdata <- countdata1[!all,]
write.csv(newdata, file = "count_0_filter.csv")
head (newdata)
# remove uninformative genes keep only genes that are expressed in at least 50 count in 2 samples
dat <- newdata[rowSums(newdata > 50) >= 2,]
write.csv(dat, file = "rpkm.csv_0_filter_atleat_50_count.csv")
head (dat)
final_count <- read.csv("rpkm.csv_0_filter_atleat_50_count.csv", header=TRUE, row.names=1)
library(DESeq2)
# Convert to matrix
countdata <- as.matrix(final_count)
head(countdata)
(condition <- factor(c("NJ8", "NJ9")))
library(DESeq2)
# Create a coldata frame and instantiate the DESeqDataSet. See ?DESeqDataSetFromMatrix
(coldata <- data.frame(row.names=colnames(countdata), condition))
dds <- DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~condition)
dds
#set NJ8 as reference level
dds$condition <- relevel(dds$condition, "NJ8")
dds <- DESeq(dds)
DESeq(dds)
# Regularized log transformation for clustering/heatmaps, etc
rld <- rlogTransformation(dds)
head(assay(rld))
hist(assay(rld))
write.csv(resdata, file="rld.csv")
# Get differential expression results
res <- results(dds)
## Order by adjusted fold change
res <- res[order(res$log2FoldChange), ]
## Merge with normalized count data
resdata <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)), by="row.names", sort=FALSE)
names(resdata)[1] <- "Gene"
head(resdata)
## Write results
write.csv(resdata, file="diffexpr-results.csv")
## filter result by log2foldchange i.e. set to 1
dat.log2FC.twofold = res[abs(res$log2FoldChange) >= 1.0,]
write.csv(dat.log2FC.twofold, file="diffexpr-results_filter.csv")
## Volcano plot with "significant" genes labeled
volcanoplot <- function (res, lfcthresh=2, sigthresh=0.05, main="Volcano Plot", legendpos="bottomright", labelsig=TRUE, textcx=1, ...) {
with(res, plot(log2FoldChange, -log10(pvalue), pch=20, main=main, ...))
with(subset(res, padj<sigthresh ), points(log2FoldChange, -log10(pvalue), pch=20, col="red", ...))
with(subset(res, abs(log2FoldChange)>lfcthresh), points(log2FoldChange, -log10(pvalue), pch=20, col="orange", ...))
with(subset(res, padj<sigthresh & abs(log2FoldChange)>lfcthresh), points(log2FoldChange, -log10(pvalue), pch=20, col="green", ...))
if (labelsig) {
require(calibrate)
with(subset(res, padj<sigthresh & abs(log2FoldChange)>lfcthresh), textxy(log2FoldChange, -log10(pvalue), labs=Gene, cex=textcx, ...))
}
legend(legendpos, xjust=1, yjust=1, legend=c(paste("FDR<",sigthresh,sep=""), paste("|LogFC|>",lfcthresh,sep=""), "both"), pch=20, col=c("red","orange","green"))
}
png("diffexpr-volcanoplot.png", 1200, 1000, pointsize=20)
volcanoplot(resdata, lfcthresh=1, sigthresh=0.05, textcx=.8, xlim=c(-2.3, 2))
dev.off()