I wonder why I get 4 DEGs with EdgeR glm approach and 55 DEGs with DESeq2.
---
title: "EdegR glm approach"
format: html
editor: visual
---
1. Load necessary libraries
{r}
library(edgeR)
library(tidyverse)
library(readxl)
library(writexl)
2. Read the data
{r}
# set the work directory
setwd("C:/Users/catel/Documents/DBT_DEG/Analysis_CC")
# read the phenotype features
data <- read.csv("C:/Users/catel/Documents/DBT_DEG/Analysis_CC/Data/Case_Control_Phenotype_Features_NA_BMI_excluded_cellCounts.csv", header = TRUE)
# group <- factor(data$T2D, levels = c("NO", "YES")) # group is a vector containing the grouping factor information where 'YES' indicates T2D patients and 'NO' indicates non-diabetics
group <- as.factor(data$T2D)
batch <-as.factor(data$batch)
age <-as.numeric(data$Age_exact)
sex <-as.factor(data$Sex)
BMI <-as.numeric(gsub(",",".",data$BMI))
#reading the counts from a file, the counts are stored in a tab-delimited txt file, with gene symbols (ENSGid) in a column called ensembl_gene_id
x <-read.delim("C:/Users/catel/Documents/DBT_DEG/Analysis_CC/Data/case_control_gene_counts_filtered_gene_name_type_chr_NA_BMI_excluded_cellCounts.txt", header = TRUE)
head(x)
length(x)
length(group)
head(group)
print(group)
3. The DGEList data class
{r}
# The main components of an DGEList object are a matrix counts containing the integer counts, a data.frame group containing information about the samples or libraries
y <- DGEList(counts=x,group=group)
head(y)
dim(y)
# The data.frame group contains a column lib.size for the library size or sequencing depth for each sample
4. Normalization
{r}
# Normalize the library sizes
# y <- normLibSizes(y, method = "TMM")
# y$samples
5. Filtration
{r}
# this calculates number of genes that fulfill the conditions of cpm >=1 and >= 80% of samples
keep <- filterByExpr(y, group = group, min.prop = 0.8, min.count = 1)
table(keep)
# Apply filtration, add the logical vectors and sort out everything that is false
y <- y[keep, , keep.lib.sizes=FALSE]
6. Recalculate the effective library size
{r}
#Recalculate library sizes after values are filtered out
y <- normLibSizes(y, method = "TMM")
y$samples
7. Estimating dispersion
{r}
#Estimate the dispersion to model the biological and technical variability in your data
#design <- model.matrix(~group + sex + age + BMI + cd8t + cd4t + nk + bcell + mono + gran)
design <- model.matrix(~ group + batch + age + sex + BMI)
y <- estimateDisp(y, design)
head(y)
dim(y)
8. Testing for DE genes: quasi-likelihood method
{r}
# To perform quasi-likelihood F-tests:
# Fit the model using glmQLFit()
fit <- glmQLFit(y,design)
# To find the coefficients in the model, extract coefficients from the model
# coefficients <- coef(fit)
# The coefficient group is in the second column of the model matrix
# Perform the quasi-likelihood F-test for the coefficient "group"
qlf <- glmQLFTest(fit,coef=2)
# Specify the file path for saving the results CSV file
result_file <- file.path("C:/Users/catel/Documents/DBT_DEG/Analysis_CC/Results/GLM_approach_Age_sex_BMI_cellCounts_filtrerGene", "glmQTFTest_results.csv")
# Save the results to a CSV file
write.csv(as.data.frame(qlf), file = result_file, row.names = FALSE)
9 . FDR
{r}
# Extract the p-values from the results
p_values <- qlf$table$PValue
head(p_values)
# Calculate the FDR values using the Benjamini-Hochberg procedure
FDR_values <- p.adjust(p_values, method = "BH")
# Create a data frame with the FDR values
FDR_results <- data.frame(FDR = FDR_values)
# Combine the FDR results with all other results
all_results_with_FDR <- cbind(qlf, FDR_results)
# Save the combined results to a CSV file
result_file <- file.path("C:/Users/catel/Documents/DBT_DEG/Analysis_CC/Results/GLM_approach_Age_sex_BMI_cellCounts_filtrerGene", "glmQLFTest_results_with_FDR.csv")
write.csv(all_results_with_FDR, file = result_file, row.names = FALSE)
# Specify the file path for saving the FDR results CSV file
FDR_file <- file.path("C:/Users/catel/Documents/DBT_DEG/Analysis_CC/Results/GLM_approach_Age_sex_BMI_cellCounts_filtrerGene", "FDR_results.csv")
# Save the FDR results to a CSV file
write.csv(FDR_results, file = FDR_file, row.names = FALSE)
---
title: "DeSEQ2"
format: html
editor: visual
---
from the manual: https://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html
Library
Load necessary libraries
{r}
library(tidyr)
library(DESeq2)
library(readxl)
library(writexl)
library(edgeR)
The datasets
{r}
coldata <- read.csv("C:/Users/catel/Documents/DBT_DEG/Analysis_CC/Data/Case_Control_Phenotype_Features_NA_BMI_excluded_cellCounts.csv", header = TRUE)
# I removed the ENSGids that resulted to be replicates when the ENSGid is "redone" without the numbers and letters after the '.'. I removed 33 ENSGid and their counts where 0. In the original gene counts file for cc there are the removed ENSGids.
Extract ENSG IDs
{r}
# Read my count data file
gene_counts <- read.csv("C:/Users/catel/Documents/DBT_DEG/Analysis_CC/Data/case_control_gene_counts_filtered_gene_name_type_chr_BMI_NA_excluded_cellCounts_removed_duplicates.csv", header = TRUE)
# Extract the ENSG IDs before the dot and remove any digits after the dot
gene_counts$Geneid <- gsub("\\..*", "", gene_counts$Geneid)
dim(gene_counts)
# Print the modified dataframe
print(gene_counts)
Variables as factors
{r}
# Setting the factor levels
T2D <- as.factor(coldata$T2D)
batch <- as.factor(coldata$batch)
Age <- as.numeric(coldata$Age_exact)
Gender <- as.numeric(coldata$Gender) #Sex
Equal number of columns between datasets
Match the number of columns between cts and coldata
{r}
# Convert the data.frame to a matrix, with gene IDs as row names:
# DESeqDataSetFromMatrix function can only handle numeric values so first column must be removed
cts <- as.matrix(gene_counts[, -1]) # Remove the first column (Geneid's) before conversion
# Set the gene IDs as row names, so coldata and cts have the same number of column
rownames(cts) <- gene_counts$Geneid
# Check dimensions
head(cts)
dim(cts)
dim(coldata)
# Find duplicate row names
# dup_rows <- rownames(cts)[duplicated(rownames(cts))]
# Print duplicate row names
# print(dup_rows)
Factor Level
{r}
# Setting the factor levels
coldata$T2D <- factor(coldata$T2D, levels = c("NO","YES"))
The DESeqDataSet
{r}
# Create DESeq2 dataset
dds <- DESeqDataSetFromMatrix(countData = cts,
colData = coldata,
design= ~ batch + Gender + Age_exact + T2D)
# Check DESeq2 dataset
head(dds)
Pre-filtration
{r}
keep <- filterByExpr(dds, group = T2D, min.prop = 0.8, min.count = 1)
dds <- dds[keep,]
dim(dds)
DESeq analysis
{r}
# Run differential expression analysis
dds <- DESeq(dds)
# Message: 1 rows did not converge in beta, labelled in mcols(object)$betaConv. Use larger maxit argument with nbinomWaldTest
# Run differential expression analysis with increased maxit
# results <- nbinomWaldTest(dds, maxit = 1000)
#same results as running the script without nbinomWaldTest
Results of differential expression testing
{r}
# Define the directory where you want to save the results
results_dir <- "C:/Users/catel/Documents/DBT_DEG/Analysis_CC/Results/DESeq2/"
# Extract results of differential expression testing
deseq_results <- results(dds)
# Define the file path for saving the results CSV file
results_file <- paste0(results_dir, "deseq2_results.csv")
# Write the results to a CSV file
write.csv(as.data.frame(deseq_results), file = results_file)
# Print a message indicating successful saving
cat("DESeq2 results saved to:", results_file, "\n")
Because the two packages use different tests to determine significance. The quasi-likelihood F-test that you are using with
edgeRis more conservative than the Wald test you are using withDGESeq2.Agree with James
There are many posts on this topic on this site.