I can't create a heatmap for the top 10 differentially expressed genes. I tried following other examples online (like the one in the DESeq2 workflow), but non seem to work with my data. Does anyone have advise on how to plot a heatmap?
Here is the error message:
> DEgenes <- mat[idx,] #subset mtx by only the set of DE gns abv
Error in mat[idx, ] : subscript out of bounds
Here is the code causing the error:
#BiocManager::install("DESeq2")
library(DESeq2)
#BiocManager::install("EnsDb.Hsapiens.v86")
library(EnsDb.Hsapiens.v86)
#BiocManager::install("EnhancedVolcano")
library(EnhancedVolcano)
#BiocManager::install("pheatmap")
library(pheatmap)
#BiocManager::install("org.Hs.eg.db")
library(org.Hs.eg.db)
#install.packages("tidyverse")
library(tidyverse)
#install.packages("recount")
library(recount)
#install.packages("RColorBrewer")
library(RColorBrewer)
#install.packages("DT")
library(DT)
#list of articles to extract data from
project_info <- abstract_search(query = "Ewing sarcoma")
# View the available projects and select one
selected_study <- "SRP028344" #"Gene regulation by EWS/ERG in Ewing sarcoma."
# Download the study data
download_study(selected_study) #creates an object with a class: RangedSummarizedExperiment
#load the data by specifying sub-folder
load("SRP028344/rse_gene.Rdata")
# Examine the read count matrix
cts <- assay(rse_gene) #gn counts within each smpl
#View(cts)
#editing condition column/colData
rse_gene$condition <- c(rep("untrtd", 3), rep("trtd",4))
#sample metadata (colData) - shows condition coloumn that was added above
cd <- as.data.frame(colData(rse_gene))
View(cd)
#confirm that row names in the colData (cd) matches the column names in cts (the assay of the data)
all(colnames(cts) %in% rownames(cd)) #TRUE
#confirm they have the same order
all(colnames(cts) == rownames(cd)) #TRUE
#!!!!!!!!!!!!!!!!!!!!!!!
#CREATE THE DESEQ OBJECT
#!!!!!!!!!!!!!!!!!!!!!!!
#make the dds
dds <- DESeqDataSet(rse_gene, design = ~condition)
#set the factor level
#compare the other levels to the untreated sample
dds$condition <- relevel(dds$condition, ref = "untrtd")
#creat DESe2 object
dds <- DESeq(dds)
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
#QUALITY CONTROL: RMV LOW GN CNTS
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
#remove rows with low gene counts
keep <- as.data.frame(counts(dds)) %>%
rowSums(counts(dds, normalized=TRUE)) >= 10
dds <- dds[keep,]
#!!!!!!!!!!!!!!!!!!!!!!!!
#QUALITY CONTROL: NRMLZTN
#!!!!!!!!!!!!!!!!!!!!!!!!
#calculate normalized counts
ddsNorm <- estimateSizeFactors(dds)
#view the size factors used for normalization
sizeFactors(ddsNorm)
#extract normalized counts
normalizedCounts <- counts(ddsNorm, normalized = TRUE)
#view(normalizedCounts)
#variance stabilizing transformation
ddsNormVst <- vst(ddsNorm, blind = TRUE)
#extract VST-transformed nrmlzd cnts as a mtx from the vsd bjct [use assay()]
mtx_vst_ddsNorm<- assay(ddsNormVst)
#compute the prws crltn vlus btwn each pair of samples using the cor() fx
cor_mtx_vst_ddsNorm <- cor(mtx_vst_ddsNorm)
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
#QUALITY CONTROL: PCA PLOT, CLSTRNG HTMP
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
#create rlog for PCA
rld <- rlog(dds)
#plot the PCA
plotPCA(rld)
#!!!!!!!!!!!!!!!
#summary/results
#!!!!!!!!!!!!!!!
#extract the results
res <- results(dds,
contrast = c("condition", #condition factor
"trtd", #lvl/fctr to cmpr
"untrtd"), #base level
alpha = 0.05)
#confirm the name and number of the coefficients
resultsNames(dds) # lists the coefficients
summary(res)
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
#VISUALIZING RESULTS: VOLCANO PLOTS
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
#shrink data
resNorm <- lfcShrink(dds = dds,
res = res,
type = "normal",
coef = 2)
#make a dataframe of the results to view them
resdf <- as.data.frame(resNorm)
#extract gene symbol from EnsDb.Hsapiens.v86
ens2sym <- AnnotationDbi::select(EnsDb.Hsapiens.v86,
keys = keys(EnsDb.Hsapiens.v86), #this code shows that ur selecting from this database (EnsD, using the key(EnsDb) to attach the symbol)
columns = c("SYMBOL"))
#join ens2sym to resdf by shared column (GENEID)
#want to add the symbol coloum in ens2sym to resdf
resdfsym <- resdf %>%
rownames_to_column() %>%
mutate(GENEID = gsub(rowname, pattern = "\\..+", replacement = "")) %>% #rmvng the version at the end of the rowname
inner_join(y = ens2sym, by = "GENEID") %>% #only keeps geneID/symbol found in both dfs
dplyr::select(-rowname)
#create the volcano plot
EnhancedVolcano(resdfsym, lab = resdfsym$SYMBOL, pCutoff = 1e-100,
FCcutoff = 3,
x = "log2FoldChange",
y = "padj")
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
#TOP-10 OVER EXPRESSED DEGS HEATMAP
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
#first, #subset nrmlzd counts to sgnfcnt genes
#800 less genes than resdfsym
sigres <- resdfsym[ #subset nrmlzd counts to sgnfcnt genes
which(resdfsym$padj < 0.01 & abs(resdfsym$log2FoldChange) >= 1 &
resdfsym$baseMean >= 20), ]
sigres <- resdfsym %>%
dplyr::select(resdfsym$padj < 0.01)
sigres
#get normalized counts from dds object
#only want counts with matching geneIDs to our significant data frame
counts(dds, normalized = TRUE)[]
#!!!!!!!!
slct <- order(rowMeans(
counts(dds,
normalized = TRUE)),
decreasing = TRUE) [1:20]
df <- as.data.frame(colData(dds)[,c("condition","type")])
df <- as.data.frame(colData(dds)[,c("condition"), "type"])
rownames(df) <- colnames(rld)
pheatmap(assay(rld)[slct,], cluster_rows=FALSE, show_rownames=FALSE,
cluster_cols=FALSE, annotation_col=df)
mat <- assay(rld) #extract mtx of nrmlzd counts
idx <- sigres$ENSEMBL #extract geneID of DE gns
DEgenes <- mat[idx,] #subset mtx by only the set of DE gns abv
rld <- na.omit(rld)
annotation <- as.data.frame(colData(rld)[, c("run","condition")]) #xtrct some dat from rld cndtns
pheatmap(DEgenes, scale = "row", show_rownames = FALSE, clustering_distance_rows = "correlation", annotation_col = annotation, main="Differentially Expressed genes")
#!!!!!!!!!!!!!!!!!
## add gene annotation to results table
# subset genes
# plot heatmap of top 20 DE genes
orderedSig <- resdfsym[order(resdfsym$padj), ] #top 20 gns r the gns w lowest adjusted p value -> most sgnfcntly dfly xprsd
#can also extract top 20 gns w lrgst or smllst log2fc
id1 <- orderedSig$GENEID #can add names since it's just top 20
id2 <- orderedSig$SYMBOL #this is the name
topDE <- mat[id1,]
rownames(topDE) <- id2 #rmv ensemble iDs from row name
top20DE <- head(topDE, n=20) #take top 20 gns
#plot data now
pheatmap(top20DE, scale = "row", clustering_distance_rows = "correlation", annotation_col = annotation, main="Top 20 Differentially Expressed genes")