Question

Deseq2 for RNAseq experiments without replicates

0

Entering edit mode

prp291 • 0

@prp291-9622

Last seen 6.2 years ago

I am trying to find out the differentially expressed gene in a RNAseq experiment without replicate. The first R script is inspired by this post on seqanswer and other script I always use for my other experiments with replicates. When I run first script I got 800 genes while second script is giving me the more than 3000 genes. Can you tell me that which is script should I use?

In first script I am getting the avgLogExpr and rLogFC . Can you tell me please that which one should be considered.

First script

# This R script will analyzed data when there is no replicate
# There script is inspired by this post
#http://seqanswers.com/forums/showthread.php?t=31036

#install.packages("DESeq2")
## try http:// if https:// URLs are not supported
#source("https://bioconductor.org/biocLite.R")
#biocLite("DESeq2")

# Set working directory
setwd("/Users/sanjay/Desktop/deseq");
getwd();

# Counts-Matrix
raw_count <- read.delim("count.txt", header=TRUE, row.names=1)
countdata1 <- round(raw_count)

#write.csv(countdata1, "final_filtered_data - Copy - Copy.csv")
head(countdata1)

# Remove all gene which has 0 value in all sample
all <- apply(countdata1, 1, function(x) all(x==0) )
newdata <- countdata1[!all,]
write.csv(newdata, file = "count_0_filter.csv")
head (newdata)

# remove uninformative genes keep only genes that are expressed in at least 200 count in 20 samples
dat <- newdata[rowSums(newdata > 20) >= 1,]

write.csv(dat, file = "rpkm.csv_0_filter_atleat_200_count.csv")
head (dat)

final_count <- read.csv("rpkm.csv_0_filter_atleat_200_count.csv", header=TRUE, row.names=1)

library(DESeq2)

# Convert to matrix
countdata <- as.matrix(final_count)
head(countdata)
(condition <- factor(c("control", "1h")))

(condition <- factor(c("control", "1h")))
# Create a coldata frame and instantiate the DESeqDataSet. See ?DESeqDataSetFromMatrix
(coldata <- data.frame(row.names=colnames(countdata), condition))

dds <- DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~condition)
dds
#set control as reference level
dds$condition <- relevel(dds$condition, "control")
dds <- DESeq(dds)

# Regularized log transformation for clustering/heatmaps, etc
rld <- rlogTransformation(dds)
head(assay(rld))
hist(assay(rld))
write.csv(resdata, file="rld.csv")

res <- data.frame(
assay(rld),
avgLogExpr = ( assay(rld)[,2] + assay(rld)[,1] ) / 2,
rLogFC = assay(rld)[,2] - assay(rld)[,1] )

head( res[ order(res$rLogFC), ] )
write.csv(res, file="DEG.xlsx")

Second script

# This R script will analyzed data when there is no replicate

#install.packages("DESeq2")
## try http:// if https:// URLs are not supported
#source("https://bioconductor.org/biocLite.R")
#biocLite("DESeq2")

# Set working directory
#setwd("/Users/Sanjay/Desktop/mirna");
#getwd();

# Set working directory
setwd("/Users/Sanjay/Desktop/deseq");
getwd();

# Counts-Matrix
raw_count <- read.delim("count.txt", header=TRUE, row.names=1)
countdata1 <- round(raw_count)
head(countdata1)

# Remove all gene which has 0 value in all sample
all <- apply(countdata1, 1, function(x) all(x==0) )
newdata <- countdata1[!all,]
write.csv(newdata, file = "count_0_filter.csv")
head (newdata)

# remove uninformative genes keep only genes that are expressed in at least 50 count in 2 samples
dat <- newdata[rowSums(newdata > 50) >= 2,]
write.csv(dat, file = "rpkm.csv_0_filter_atleat_50_count.csv")
head (dat)

final_count <- read.csv("rpkm.csv_0_filter_atleat_50_count.csv", header=TRUE, row.names=1)

library(DESeq2)

# Convert to matrix
countdata <- as.matrix(final_count)
head(countdata)
(condition <- factor(c("NJ8", "NJ9")))

library(DESeq2)

# Create a coldata frame and instantiate the DESeqDataSet. See ?DESeqDataSetFromMatrix
(coldata <- data.frame(row.names=colnames(countdata), condition))

dds <- DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~condition)
dds

#set NJ8 as reference level
dds$condition <- relevel(dds$condition, "NJ8")
dds <- DESeq(dds)
DESeq(dds)

# Regularized log transformation for clustering/heatmaps, etc
rld <- rlogTransformation(dds)
head(assay(rld))
hist(assay(rld))
write.csv(resdata, file="rld.csv")

# Get differential expression results
res <- results(dds)
## Order by adjusted fold change
res <- res[order(res$log2FoldChange), ]
## Merge with normalized count data
resdata <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)), by="row.names", sort=FALSE)
names(resdata)[1] <- "Gene"
head(resdata)
## Write results
write.csv(resdata, file="diffexpr-results.csv")

## filter result by log2foldchange i.e. set to 1
dat.log2FC.twofold = res[abs(res$log2FoldChange) >= 1.0,]
write.csv(dat.log2FC.twofold, file="diffexpr-results_filter.csv")

## Volcano plot with "significant" genes labeled
volcanoplot <- function (res, lfcthresh=2, sigthresh=0.05, main="Volcano Plot", legendpos="bottomright", labelsig=TRUE, textcx=1, ...) {
with(res, plot(log2FoldChange, -log10(pvalue), pch=20, main=main, ...))
with(subset(res, padj<sigthresh ), points(log2FoldChange, -log10(pvalue), pch=20, col="red", ...))
with(subset(res, abs(log2FoldChange)>lfcthresh), points(log2FoldChange, -log10(pvalue), pch=20, col="orange", ...))
with(subset(res, padj<sigthresh & abs(log2FoldChange)>lfcthresh), points(log2FoldChange, -log10(pvalue), pch=20, col="green", ...))
if (labelsig) {
require(calibrate)
with(subset(res, padj<sigthresh & abs(log2FoldChange)>lfcthresh), textxy(log2FoldChange, -log10(pvalue), labs=Gene, cex=textcx, ...))
}
legend(legendpos, xjust=1, yjust=1, legend=c(paste("FDR<",sigthresh,sep=""), paste("|LogFC|>",lfcthresh,sep=""), "both"), pch=20, col=c("red","orange","green"))
}
png("diffexpr-volcanoplot.png", 1200, 1000, pointsize=20)
volcanoplot(resdata, lfcthresh=1, sigthresh=0.05, textcx=.8, xlim=c(-2.3, 2))
dev.off()

rnaseq deseq2 • 12k views

ADD COMMENT • link updated 8.2 years ago by Michael Love 41k • written 8.2 years ago by prp291 • 0

score 1 · Answer 1 · 2016-01-29

For an experiment without replicates, you should just run DESeq() as normal.

I can't easily see the difference between the two scripts you posted. And make sure to read the section of the help file ?DESeq which starts "Experiments without replicates...". It's important to remember that this kind of analysis is solely for exploratory purposes, and that biological replicates are needed for a proper analysis.