Hi Mike,
I have two RNA seq samples. I quantified them both using salmon. I then want log2 fold change in expression at the gene level (forget p values and replicates for now). I compiled gene-level expression values using tximport. However, if I take the ratio of tpm values, I get a different result than if I take the ratio of DEseq-calculated normalized counts. The two ratios are perfectly correlated, but the tpm ratios are offset from 0 in a way that doesn’t make biological sense to me (see attached plot). Is there some kind of normalization that happens for tpm values that is causing this?
I'm sure I'm missing something here that is very obvious. Thanks for your help.
library(tidyverse)
library(DESeq2)
library(tximport)
library(biomaRt)
#Get tx/gene relationships
mart <- biomaRt::useMart("ENSEMBL_MART_ENSEMBL", dataset = "mmusculus_gene_ensembl", host='www.ensembl.org')
t2g <- biomaRt::getBM(attributes = c('ensembl_transcript_id', 'ensembl_gene_id', 'external_gene_name', 'refseq_mrna'), mart = mart)
t2g <- dplyr::rename(t2g, target_id= ensembl_transcript_id, ext_gene = external_gene_name)
ens2gene <- t2g[,c(2,3)]
colnames(ens2gene)[2] <- 'Gene'
ens2gene <- unique(ens2gene)
#tximport
base_dir <- '~/Documents/MIT/Localization/Fmr1Deletion/Fmr1Rescue/salmon'
sample_id <- c('CAD_FMR1KO_FMR1_neurite_rep1', 'CAD_FMR1KO_FMR1_neurite_rep2', 'CAD_FMR1KO_FMR1_neurite_rep3',
'CAD_FMR1KO_FMR1_soma_rep1', 'CAD_FMR1KO_FMR1_soma_rep2', 'CAD_FMR1KO_FMR1_soma_rep3',
'CAD_FMR1KO_GFP_neurite_rep1', 'CAD_FMR1KO_GFP_neurite_rep2', 'CAD_FMR1KO_GFP_neurite_rep3',
'CAD_FMR1KO_GFP_soma_rep1', 'CAD_FMR1KO_GFP_soma_rep2', 'CAD_FMR1KO_GFP_soma_rep3',
'CAD_FMR1KO_RGG_neurite_rep1', 'CAD_FMR1KO_RGG_neurite_rep2', 'CAD_FMR1KO_RGG_neurite_rep3',
'CAD_FMR1KO_RGG_soma_rep1', 'CAD_FMR1KO_RGG_soma_rep2', 'CAD_FMR1KO_RGG_soma_rep3',
'CAD_LoxP_GFP_neurite_rep1', 'CAD_LoxP_GFP_neurite_rep2', 'CAD_LoxP_GFP_neurite_rep3',
'CAD_LoxP_GFP_soma_rep1', 'CAD_LoxP_GFP_soma_rep2', 'CAD_LoxP_GFP_soma_rep3')
salm_dirs <- sapply(sample_id, function(id) file.path(base_dir, id, 'quant.sf'))
tx2gene <- t2g[,c(1,2)]
colnames(tx2gene) <- c('TXNAME', 'GENEID')
txi <- tximport(salm_dirs, type = 'salmon', tx2gene = tx2gene, dropInfReps = TRUE, countsFromAbundance = 'lengthScaledTPM')
tpms <- data.frame(txi$abundance)
#DESeq
samples <- data.frame(row.names = c('CAD_FMR1KO_FMR1_neurite_rep1', 'CAD_FMR1KO_FMR1_neurite_rep2', 'CAD_FMR1KO_FMR1_neurite_rep3',
'CAD_FMR1KO_FMR1_soma_rep1', 'CAD_FMR1KO_FMR1_soma_rep2', 'CAD_FMR1KO_FMR1_soma_rep3',
'CAD_FMR1KO_GFP_neurite_rep1', 'CAD_FMR1KO_GFP_neurite_rep2', 'CAD_FMR1KO_GFP_neurite_rep3',
'CAD_FMR1KO_GFP_soma_rep1', 'CAD_FMR1KO_GFP_soma_rep2', 'CAD_FMR1KO_GFP_soma_rep3',
'CAD_FMR1KO_RGG_neurite_rep1', 'CAD_FMR1KO_RGG_neurite_rep2', 'CAD_FMR1KO_RGG_neurite_rep3',
'CAD_FMR1KO_RGG_soma_rep1', 'CAD_FMR1KO_RGG_soma_rep2', 'CAD_FMR1KO_RGG_soma_rep3',
'CAD_LoxP_GFP_neurite_rep1', 'CAD_LoxP_GFP_neurite_rep2', 'CAD_LoxP_GFP_neurite_rep3',
'CAD_LoxP_GFP_soma_rep1', 'CAD_LoxP_GFP_soma_rep2', 'CAD_LoxP_GFP_soma_rep3'),
compartment = c(rep('Neurite', 3), rep('Soma', 3), rep('Neurite', 3), rep('Soma', 3), rep('Neurite', 3), rep('Soma', 3), rep('Neurite', 3), rep('Soma', 3)),
genotype = c(rep('FLrescue', 6), rep('GFPrescue', 6), rep('RGGrescue', 6), rep('WT', 6)))
ddsTxi <- DESeqDataSetFromTximport(txi, colData = samples, design = ~ compartment + genotype + compartment:genotype)
dds <- DESeq(ddsTxi)
normcounts <- counts(dds, normalized = TRUE)
#Compare expression of CAD_FMR1KO_FMR1_neurite_rep1 to CAD_FMR1KO_FMR1_soma_rep1 using both tpms and normalized counts
t.tpms <- tpms %>%
as.data.frame() %>%
rownames_to_column(., var = 'ensembl_gene_id') %>%
rowwise() %>%
mutate(., minexp = min(CAD_FMR1KO_FMR1_neurite_rep1, CAD_FMR1KO_FMR1_neurite_rep2, CAD_FMR1KO_FMR1_neurite_rep3,
CAD_FMR1KO_FMR1_soma_rep1, CAD_FMR1KO_FMR1_soma_rep2, CAD_FMR1KO_FMR1_soma_rep3,
CAD_FMR1KO_GFP_neurite_rep1, CAD_FMR1KO_GFP_neurite_rep2, CAD_FMR1KO_GFP_neurite_rep3,
CAD_FMR1KO_GFP_soma_rep1, CAD_FMR1KO_GFP_soma_rep2, CAD_FMR1KO_GFP_soma_rep3,
CAD_FMR1KO_RGG_neurite_rep1, CAD_FMR1KO_RGG_neurite_rep2, CAD_FMR1KO_RGG_neurite_rep3,
CAD_FMR1KO_RGG_soma_rep1, CAD_FMR1KO_RGG_soma_rep2, CAD_FMR1KO_RGG_soma_rep3,
CAD_LoxP_GFP_neurite_rep1, CAD_LoxP_GFP_neurite_rep2, CAD_LoxP_GFP_neurite_rep3,
CAD_LoxP_GFP_soma_rep1, CAD_LoxP_GFP_soma_rep2, CAD_LoxP_GFP_soma_rep3)) %>%
filter(., minexp >= 10) %>%
mutate(., ratio = log2(CAD_FMR1KO_FMR1_neurite_rep1 / CAD_FMR1KO_FMR1_soma_rep1)) %>%
dplyr::select(., c('ensembl_gene_id', 'ratio'))
t.normcounts <- normcounts %>%
as.data.frame() %>%
rownames_to_column(., var = 'ensembl_gene_id') %>%
rowwise() %>%
mutate(., minexp = min(CAD_FMR1KO_FMR1_neurite_rep1, CAD_FMR1KO_FMR1_neurite_rep2, CAD_FMR1KO_FMR1_neurite_rep3,
CAD_FMR1KO_FMR1_soma_rep1, CAD_FMR1KO_FMR1_soma_rep2, CAD_FMR1KO_FMR1_soma_rep3,
CAD_FMR1KO_GFP_neurite_rep1, CAD_FMR1KO_GFP_neurite_rep2, CAD_FMR1KO_GFP_neurite_rep3,
CAD_FMR1KO_GFP_soma_rep1, CAD_FMR1KO_GFP_soma_rep2, CAD_FMR1KO_GFP_soma_rep3,
CAD_FMR1KO_RGG_neurite_rep1, CAD_FMR1KO_RGG_neurite_rep2, CAD_FMR1KO_RGG_neurite_rep3,
CAD_FMR1KO_RGG_soma_rep1, CAD_FMR1KO_RGG_soma_rep2, CAD_FMR1KO_RGG_soma_rep3,
CAD_LoxP_GFP_neurite_rep1, CAD_LoxP_GFP_neurite_rep2, CAD_LoxP_GFP_neurite_rep3,
CAD_LoxP_GFP_soma_rep1, CAD_LoxP_GFP_soma_rep2, CAD_LoxP_GFP_soma_rep3)) %>%
filter(., minexp >= 100) %>%
mutate(., ratio = log2(CAD_FMR1KO_FMR1_neurite_rep1 / CAD_FMR1KO_FMR1_soma_rep1)) %>%
dplyr::select(., c('ensembl_gene_id', 'ratio'))
t <- inner_join(t.tpms, t.normcounts, by = 'ensembl_gene_id', suffix = c('.tpms', '.normcounts'))