I wonder why I get 4 DEGs with EdgeR glm approach and 55 DEGs with DESeq2.
---
title: "EdegR glm approach"
format: html
editor: visual
---
1. Load necessary libraries
{r}
library(edgeR)
library(tidyverse)
library(readxl)
library(writexl)
2. Read the data
{r}
# set the work directory
setwd("C:/Users/catel/Documents/DBT_DEG/Analysis_CC")
# read the phenotype features
data <- read.csv("C:/Users/catel/Documents/DBT_DEG/Analysis_CC/Data/Case_Control_Phenotype_Features_NA_BMI_excluded_cellCounts.csv", header = TRUE)
# group <- factor(data$T2D, levels = c("NO", "YES")) # group is a vector containing the grouping factor information where 'YES' indicates T2D patients and 'NO' indicates non-diabetics
group <- as.factor(data$T2D)
batch <-as.factor(data$batch)
age <-as.numeric(data$Age_exact)
sex <-as.factor(data$Sex)
BMI <-as.numeric(gsub(",",".",data$BMI))
#reading the counts from a file, the counts are stored in a tab-delimited txt file, with gene symbols (ENSGid) in a column called ensembl_gene_id
x <-read.delim("C:/Users/catel/Documents/DBT_DEG/Analysis_CC/Data/case_control_gene_counts_filtered_gene_name_type_chr_NA_BMI_excluded_cellCounts.txt", header = TRUE)
head(x)
length(x)
length(group)
head(group)
print(group)
3. The DGEList data class
{r}
# The main components of an DGEList object are a matrix counts containing the integer counts, a data.frame group containing information about the samples or libraries
y <- DGEList(counts=x,group=group)
head(y)
dim(y)
# The data.frame group contains a column lib.size for the library size or sequencing depth for each sample
4. Normalization
{r}
# Normalize the library sizes
# y <- normLibSizes(y, method = "TMM")
# y$samples
5. Filtration
{r}
# this calculates number of genes that fulfill the conditions of cpm >=1 and >= 80% of samples
keep <- filterByExpr(y, group = group, min.prop = 0.8, min.count = 1)
table(keep)
# Apply filtration, add the logical vectors and sort out everything that is false
y <- y[keep, , keep.lib.sizes=FALSE]
6. Recalculate the effective library size
{r}
#Recalculate library sizes after values are filtered out
y <- normLibSizes(y, method = "TMM")
y$samples
7. Estimating dispersion
{r}
#Estimate the dispersion to model the biological and technical variability in your data
#design <- model.matrix(~group + sex + age + BMI + cd8t + cd4t + nk + bcell + mono + gran)
design <- model.matrix(~ group + batch + age + sex + BMI)
y <- estimateDisp(y, design)
head(y)
dim(y)
8. Testing for DE genes: quasi-likelihood method
{r}
# To perform quasi-likelihood F-tests:
# Fit the model using glmQLFit()
fit <- glmQLFit(y,design)
# To find the coefficients in the model, extract coefficients from the model
# coefficients <- coef(fit)
# The coefficient group is in the second column of the model matrix
# Perform the quasi-likelihood F-test for the coefficient "group"
qlf <- glmQLFTest(fit,coef=2)
# Specify the file path for saving the results CSV file
result_file <- file.path("C:/Users/catel/Documents/DBT_DEG/Analysis_CC/Results/GLM_approach_Age_sex_BMI_cellCounts_filtrerGene", "glmQTFTest_results.csv")
# Save the results to a CSV file
write.csv(as.data.frame(qlf), file = result_file, row.names = FALSE)
9 . FDR
{r}
# Extract the p-values from the results
p_values <- qlf$table$PValue
head(p_values)
# Calculate the FDR values using the Benjamini-Hochberg procedure
FDR_values <- p.adjust(p_values, method = "BH")
# Create a data frame with the FDR values
FDR_results <- data.frame(FDR = FDR_values)
# Combine the FDR results with all other results
all_results_with_FDR <- cbind(qlf, FDR_results)
# Save the combined results to a CSV file
result_file <- file.path("C:/Users/catel/Documents/DBT_DEG/Analysis_CC/Results/GLM_approach_Age_sex_BMI_cellCounts_filtrerGene", "glmQLFTest_results_with_FDR.csv")
write.csv(all_results_with_FDR, file = result_file, row.names = FALSE)
# Specify the file path for saving the FDR results CSV file
FDR_file <- file.path("C:/Users/catel/Documents/DBT_DEG/Analysis_CC/Results/GLM_approach_Age_sex_BMI_cellCounts_filtrerGene", "FDR_results.csv")
# Save the FDR results to a CSV file
write.csv(FDR_results, file = FDR_file, row.names = FALSE)
---
title: "DeSEQ2"
format: html
editor: visual
---
from the manual: https://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html
Library
Load necessary libraries
{r}
library(tidyr)
library(DESeq2)
library(readxl)
library(writexl)
library(edgeR)
The datasets
{r}
coldata <- read.csv("C:/Users/catel/Documents/DBT_DEG/Analysis_CC/Data/Case_Control_Phenotype_Features_NA_BMI_excluded_cellCounts.csv", header = TRUE)
# I removed the ENSGids that resulted to be replicates when the ENSGid is "redone" without the numbers and letters after the '.'. I removed 33 ENSGid and their counts where 0. In the original gene counts file for cc there are the removed ENSGids.
Extract ENSG IDs
{r}
# Read my count data file
gene_counts <- read.csv("C:/Users/catel/Documents/DBT_DEG/Analysis_CC/Data/case_control_gene_counts_filtered_gene_name_type_chr_BMI_NA_excluded_cellCounts_removed_duplicates.csv", header = TRUE)
# Extract the ENSG IDs before the dot and remove any digits after the dot
gene_counts$Geneid <- gsub("\\..*", "", gene_counts$Geneid)
dim(gene_counts)
# Print the modified dataframe
print(gene_counts)
Variables as factors
{r}
# Setting the factor levels
T2D <- as.factor(coldata$T2D)
batch <- as.factor(coldata$batch)
Age <- as.numeric(coldata$Age_exact)
Gender <- as.numeric(coldata$Gender) #Sex
Equal number of columns between datasets
Match the number of columns between cts and coldata
{r}
# Convert the data.frame to a matrix, with gene IDs as row names:
# DESeqDataSetFromMatrix function can only handle numeric values so first column must be removed
cts <- as.matrix(gene_counts[, -1]) # Remove the first column (Geneid's) before conversion
# Set the gene IDs as row names, so coldata and cts have the same number of column
rownames(cts) <- gene_counts$Geneid
# Check dimensions
head(cts)
dim(cts)
dim(coldata)
# Find duplicate row names
# dup_rows <- rownames(cts)[duplicated(rownames(cts))]
# Print duplicate row names
# print(dup_rows)
Factor Level
{r}
# Setting the factor levels
coldata$T2D <- factor(coldata$T2D, levels = c("NO","YES"))
The DESeqDataSet
{r}
# Create DESeq2 dataset
dds <- DESeqDataSetFromMatrix(countData = cts,
colData = coldata,
design= ~ batch + Gender + Age_exact + T2D)
# Check DESeq2 dataset
head(dds)
Pre-filtration
{r}
keep <- filterByExpr(dds, group = T2D, min.prop = 0.8, min.count = 1)
dds <- dds[keep,]
dim(dds)
DESeq analysis
{r}
# Run differential expression analysis
dds <- DESeq(dds)
# Message: 1 rows did not converge in beta, labelled in mcols(object)$betaConv. Use larger maxit argument with nbinomWaldTest
# Run differential expression analysis with increased maxit
# results <- nbinomWaldTest(dds, maxit = 1000)
#same results as running the script without nbinomWaldTest
Results of differential expression testing
{r}
# Define the directory where you want to save the results
results_dir <- "C:/Users/catel/Documents/DBT_DEG/Analysis_CC/Results/DESeq2/"
# Extract results of differential expression testing
deseq_results <- results(dds)
# Define the file path for saving the results CSV file
results_file <- paste0(results_dir, "deseq2_results.csv")
# Write the results to a CSV file
write.csv(as.data.frame(deseq_results), file = results_file)
# Print a message indicating successful saving
cat("DESeq2 results saved to:", results_file, "\n")
Because the two packages use different tests to determine significance. The quasi-likelihood F-test that you are using with
edgeR
is more conservative than the Wald test you are using withDGESeq2
.Agree with James
There are many posts on this topic on this site.