I have eleven viable samples (there were supposed to be 3 replicates of each). I am trying to get samples MOG3, MOG4, MOG5 to cluster together and also MOGSP2 and MOGSP_3. I realise that the latter group is missing a replicate so I used dropEmptyLevels but the pheatmap still doesn't cluster the MOG samples together. How do you suggest I amend this issue? Below is my code. I would be truly appreciative of any feedback. I have tremendous respect for the community and its support.
#load count matrix
count <- read.csv("~/Documents/DrKwan/Workshop/InputforDESeq.csv")
#create data frame
countData <- as.data.frame(count)
countData <- countData[!duplicated(countData[,c("Gene_Symbol")]),]
row.names(countData) <- countData$Gene_Symbol
countData <- countData[,c(7:17)]
#create column data
colData <- DataFrame(
cell = c("SPtetramerCD8TCell", "SPtetramerCD8TCell", "SPtetramerCD8TCell", "SPtetramerCD8TCell", "SPtetramerCD8TCell", "Ly49NCD8TCell", "Ly49PCD8TCell", "Ly49NCD8TCell", "Ly49PCD8TCell", "Ly49NCD8TCell", "Ly49PCD8TCell"),
treatment = c("MOGSP", "MOGSP", "MOG", "MOG", "MOG", "MOGSP", "MOGSP", "MOGSP", "MOGSP", "MOGSP", "MOGSP"),
row.names = colnames(countData)
)
colData
#DESeq2
dds <- DESeqDataSetFromMatrix(countData=countData,
colData=colData,
design= ~ treatment + cell)
rld <- rlog(dds, blind = FALSE)
vsd <- vst(dds, blind = FALSE)
dds <- estimateSizeFactors(dds)
dds$cell <- dropEmptyLevels(dds$cell)
dds$treatment <- dropEmptyLevels(dds$treatment)
slog <- log2(counts(dds, normalized=TRUE)+1)
#remove genes with low counts
keepCounts <- rowSums(counts(dds)) >= 10
dds <- dds[keepCounts,]
#DESeq2
dds <- DESeq(dds)
res <- results(dds)
res
#p-values and adjusted p-values
resOrdered <- res[order(res$pvalue),]
summary(res)
#Benjamini-Hochberg two tailed adjusted P <0.005 from Wald's test
#Log2 Fold Change >0.75
res005 <- results(dds, alpha=0.005, lfcThreshold = 0.75)
summary(res005)
sum(res005$padj < 0.005, na.rm=TRUE)
#more information on results columns
mcols(res)$description
#multi-factor designs
#create a copy of the DESeqDataSet so that we can rerun the analysis using a multi-factor design
ddsMF <- dds
#change the levels of "cell"
levels(ddsMF$cell)
levels(ddsMF$cell) <- sub("-.*", "", levels(ddsMF$cell))
levels(ddsMF$cell)
#rerun DESeq with "treatment"
design(ddsMF) <- formula(~ treatment + cell)
ddsMF <- DESeq(ddsMF)
resMF <- results(ddsMF)
head(resMF)
#multiType Results
resMFType <- results(ddsMF,
contrast=c("cell", "SPtetramerCD8TCell", "Ly49NCD8TCell"))
head(resMFType)
#If the variable is continuous or an interaction term
#then the results can be extracted using the `name` argument to *results*,
#where the name is one of elements returned by `resultsNames(dds)`.
#transform count data
vsd <- vst(dds, blind=FALSE)
rld <- rlog(dds, blind=FALSE)
head(assay(vsd), 3)
#Wald test individual steps
dds <- estimateSizeFactors(dds)
dds <- estimateDispersions(dds)
dds <- nbinomWaldTest(dds)
#Interactions
#If the comparisons of interest are, for example, the effect of a condition for different sets of samples, a simpler approach than adding interaction terms explicitly to the design formula is to perform the following steps:
##combine the factors of interest into a single factor with all combinations of the original factors
##change the design to include just this factor, e.g. ~ group
dds$group <- factor(paste0(dds$treatment, dds$cell))
design(dds) <- ~ group
dds <- DESeq(dds)
resultsNames(dds)
resCellTreatment <- results(dds, contrast = c("group", "SPtetramerCD8TCell", "Ly49NCD8TCell"))
#remove genes with low counts
keepCounts <- rowSums(counts(dds)) >= 10
dds <- dds[keepCounts,]
#DESeq2
dds <- DESeq(dds)
resCellTreatment <- results(dds)
resCellTreatment
#p-values and adjusted p-values
resOrderedCellTreatment <- resCellTreatment[order(resCellTreatment$pvalue),]
summary(resCellTreatment)
#Benjamini-Hochberg two tailed adjusted P <0.005 from Wald's test
#Log2 Fold Change >0.75
res005CellTreatment <- results(dds, alpha=0.005, lfcThreshold = 0.75)
summary(res005CellTreatment)
sum(res005CellTreatment$padj < 0.005, na.rm=TRUE)
#more information on results columns
mcols(res)$description
df <- bind_rows(
as_data_frame(slog[,1:2]) %>%
mutate(transformation = "log2(x + 1)"),
as_data_frame(assay(rld)[, 1:2]) %>% mutate(transformation = "rlog"),
as_data_frame(assay(vsd)[, 1:2]) %>% mutate(transformation = "vst"))
colnames(df)[1:2] <- c("x", "y")
ggplot(df, aes(x = x, y = y)) + geom_hex(bins = 80) +
coord_fixed() + facet_grid( . ~ transformation)
dist2 <- function(x, ...) # distance function = 1-PCC (Pearson's correlation coefficient)
as.dist(1-cor(t(x), method="pearson"))
fit = cmdscale( dist2(t(assay(rld))) , eig=T, k=2)
mdsData <- as.data.frame(fit$points[,1:2]);
mdsData <- cbind(mdsData,detectGroups(colnames(assay(rld))) )
colnames(mdsData) = c("x1", "x2", "Type")
library(gplots)
hclust2 <- function(x, method="ward.D2", ...) # average linkage in hierarchical clustering
hclust(x, method=method, ...)
n=100 # number of top genes by standard deviation
x = assay(rld)
if(n>dim(x)[1]) n = dim(x)[1] # max as data
x = x[order(apply(x,1,sd),decreasing=TRUE),] # sort genes by standard deviation
x = x[1:n,] # only keep the n genes
# this will cutoff very large values, which could skew the color
x=as.matrix(x[1:n,])-apply(x[1:n,],1,mean)
cutoff = median(unlist(x)) + 4*sd (unlist(x))
x[x>cutoff] <- cutoff
cutoff = median(unlist(x)) - 4*sd (unlist(x))
x[x< cutoff] <- cutoff
groups = detectGroups(colnames(x) )
groups.colors = rainbow(length(unique(groups) ) )
#pretty heat map
pheatmap(x, fontsize_row = 3)
Dear Dr. Kevin,
Thank you for your feedback. I took your advice and tried to clean up my code which was too convoluted because I was trying to get all bits and parts from the vignette. I removed slog as suggested. Is the expression values as they should be now as x = assay(rld)?
Please accept my apologies.
Hey again, I am not sure about what you would like me to comment. The underlying issue is that you expected that your sample groups would segregate via clustering by simply selecting the top 100 genes based on standard deviation, which is not a realistic expectation to have.
You may need to clarify what is the aim of your work (not necessarily by replying to this thread). For example, you have
treatment
andcell
, but what is the hypothesis here?You already performed a differential expression analysis via the following command:
; however, you are running this command 'blind' and not specifying which groups you want to compare.
There is also some redundancy in your code, because you do not have to run these commands:
These three commands are automatically invoked via
DESeq()
, which you have already run - see HERE.You also load gplots but never use it, and define
hclust2
anddist2
, but never use these.It may help to take a look at the Quick Start, followed by Contrasts, followed by Extracting transformed values.
Usually, after we derive differentially expressed genes, we then subset our expression matrix prior to clustering. In this case, yes, we would use
x = assay(rld)
orx = assay(vsd)
, and then subsetx
for the differentially expressed genes before runningpheatmap()
.Trust that this helps.
My main issue with the analysis is that you are trying to make the samples cluster together. That is not how clustering works. If it does not cluster together, then maybe that is the reality.
That being said, you should subset your count matrix (after
rlog
orvst
transformation) for the differentialy expressed genes for plotting the heatmap.