Hi I am currently working on an amplicon sequencing dataset and using DESeq2 to identify differentially abundant taxa across my sample groups. I have a question regarding data aggregation while running the analysis. In my dataset, several ASVs belong to the same genus but are not identified at the species level (e.g., Acinetobacter sp1, Acinetobacter sp2, etc.). I am considering summing the abundances of ASVs belonging to the same genus to visualize higher-level taxonomic trends; however, I am not sure if I can do that. Would you recommend performing this type of aggregation (collapsing counts to the genus level) after running DESeq2 for visualization? Code should be placed in three backticks as shown below
ps_filtered2@otu_table@.Data <- as.matrix(ps_filtered2@otu_table@.Data)+1
dds.data = phyloseq_to_deseq2( ps_filtered2, ~Sample_type)
dds = DESeq(dds.data)
# Extract results
res <- results(dds)
res_ordered <- res[order(res$padj, na.last = NA), ]
# Visualize Differential Abundance
sig_taxa <- res_ordered[which(res_ordered$padj < 0.05), ]
sig_taxa$Taxon <- rownames(sig_taxa)
ggplot(sig_taxa, aes(x = reorder(Taxon, log2FoldChange), y = log2FoldChange, fill = log2FoldChange > 0)) +
geom_bar(stat = "identity") +
coord_flip() +
theme_minimal() +
labs(title = "Differentially Abundant Taxa", x = "Taxa", y = "Log2 Fold Change")
##
#### Correcting after considering leaves, gut and faeces in each month as a potential confounder
dds2 = phyloseq_to_deseq2( ps_filtered2, ~Month + FPOM)
dds2 = DESeq(dds2)
resultsNames(dds2)
res2 = results(dds2, contrast=c("Month", "April", "May"))
res2 = res2[order(res2$padj, na.last=NA), ]
alpha = 0.05
sigtab2 = res2[which(res2$padj < alpha), ]
sigtab2 = cbind(as(sigtab2, "data.frame"), as(tax_table(ps_filtered2)[rownames(sigtab2), ], "matrix"))
head(sigtab2)
plotMA(res, main="Difference vs. Average")
legend("bottomright", legend="differentially abundant", lty=-1, pch=1, col="red", bty='n')
# Heatmap of differentially abundant taxa at genus level
library(pheatmap)
##correct way
topN <- 100 ### because this represents 80 % of the taxa present in our samples
sigtab2$abs_log2FC <- abs(sigtab2$log2FoldChange)
# Select top N by absolute log2 fold change
top_genera_tab <- sigtab2[order(-sigtab2$abs_log2FC), ][1:topN, ]
top_taxa_names <- rownames(top_genera_tab)
# Extract Genus names
tax <- tax_table(ps_filtered2)
genus_names <- as.character(tax[top_taxa_names, "Genus"])
genus_names[is.na(genus_names)] <- "Unclassified_Genus"
# Get VST matrix
vsd <- varianceStabilizingTransformation(dds2, blind = FALSE)
vsd_mat <- assay(vsd)
# Subset to top taxa
vsd_top <- vsd_mat[top_taxa_names, ]
rownames(vsd_top) <- genus_names # Set rownames to genus only
# Collapse by genus: sum across rows with same genus, I am not sure if I can do like this or not
vsd_top_collapsed <- rowsum(vsd_top, group = genus_names)
# Optional: z-score across samples
vsd_top_scaled <- t(scale(t(vsd_top_collapsed)))
vsd_top_scaled <- vsd_top_scaled[complete.cases(vsd_top_scaled), ]
# Prepare grouping variable
sample_data_df <- as.data.frame(colData(vsd))
grouping_var <- sample_data_df$FPOM # replace with your grouping variable if needed
# Aggregate mean abundance per genus per group
agg_means <- t(apply(vsd_top_collapsed, 1, function(x) tapply(x, grouping_var, mean)))
# Optional: scale again for better contrast in heatmap
agg_means_scaled <- t(scale(t(agg_means)))
agg_means_scaled <- agg_means_scaled[complete.cases(agg_means_scaled), ]
# Plot heatmap
pheatmap(agg_means_scaled,
cluster_rows = TRUE,
cluster_cols = TRUE,
show_rownames = TRUE,
show_colnames = TRUE,
fontsize_row = 10,
fontsize_col = 10,
angle_col = "0",
main = "",
color = colorRampPalette(c("orange", "gray90", "navyblue"))(100))