Correlation of gene expression values using tpms and normalized counts
1
2
Entering edit mode
taliaferrojm ▴ 20
@taliaferrojm-20819
Last seen 5.7 years ago

Hi Mike,

I have two RNA seq samples. I quantified them both using salmon. I then want log2 fold change in expression at the gene level (forget p values and replicates for now). I compiled gene-level expression values using tximport. However, if I take the ratio of tpm values, I get a different result than if I take the ratio of DEseq-calculated normalized counts. The two ratios are perfectly correlated, but the tpm ratios are offset from 0 in a way that doesn’t make biological sense to me (see attached plot). Is there some kind of normalization that happens for tpm values that is causing this?

I'm sure I'm missing something here that is very obvious. Thanks for your help.

library(tidyverse)
library(DESeq2)
library(tximport)
library(biomaRt)


#Get tx/gene relationships
mart <- biomaRt::useMart("ENSEMBL_MART_ENSEMBL", dataset = "mmusculus_gene_ensembl", host='www.ensembl.org')
t2g <- biomaRt::getBM(attributes = c('ensembl_transcript_id', 'ensembl_gene_id', 'external_gene_name', 'refseq_mrna'), mart = mart)
t2g <- dplyr::rename(t2g, target_id= ensembl_transcript_id, ext_gene = external_gene_name)
ens2gene <- t2g[,c(2,3)]
colnames(ens2gene)[2] <- 'Gene'
ens2gene <- unique(ens2gene)

#tximport
base_dir <- '~/Documents/MIT/Localization/Fmr1Deletion/Fmr1Rescue/salmon'

sample_id <- c('CAD_FMR1KO_FMR1_neurite_rep1', 'CAD_FMR1KO_FMR1_neurite_rep2', 'CAD_FMR1KO_FMR1_neurite_rep3', 
               'CAD_FMR1KO_FMR1_soma_rep1', 'CAD_FMR1KO_FMR1_soma_rep2', 'CAD_FMR1KO_FMR1_soma_rep3', 
               'CAD_FMR1KO_GFP_neurite_rep1', 'CAD_FMR1KO_GFP_neurite_rep2', 'CAD_FMR1KO_GFP_neurite_rep3', 
               'CAD_FMR1KO_GFP_soma_rep1', 'CAD_FMR1KO_GFP_soma_rep2', 'CAD_FMR1KO_GFP_soma_rep3', 
               'CAD_FMR1KO_RGG_neurite_rep1', 'CAD_FMR1KO_RGG_neurite_rep2', 'CAD_FMR1KO_RGG_neurite_rep3', 
               'CAD_FMR1KO_RGG_soma_rep1', 'CAD_FMR1KO_RGG_soma_rep2', 'CAD_FMR1KO_RGG_soma_rep3', 
               'CAD_LoxP_GFP_neurite_rep1', 'CAD_LoxP_GFP_neurite_rep2', 'CAD_LoxP_GFP_neurite_rep3', 
               'CAD_LoxP_GFP_soma_rep1', 'CAD_LoxP_GFP_soma_rep2', 'CAD_LoxP_GFP_soma_rep3')
salm_dirs <- sapply(sample_id, function(id) file.path(base_dir, id, 'quant.sf'))
tx2gene <- t2g[,c(1,2)]
colnames(tx2gene) <- c('TXNAME', 'GENEID')

txi <- tximport(salm_dirs, type = 'salmon', tx2gene = tx2gene, dropInfReps = TRUE, countsFromAbundance = 'lengthScaledTPM')
tpms <- data.frame(txi$abundance)


#DESeq
samples <- data.frame(row.names = c('CAD_FMR1KO_FMR1_neurite_rep1', 'CAD_FMR1KO_FMR1_neurite_rep2', 'CAD_FMR1KO_FMR1_neurite_rep3', 
                                    'CAD_FMR1KO_FMR1_soma_rep1', 'CAD_FMR1KO_FMR1_soma_rep2', 'CAD_FMR1KO_FMR1_soma_rep3', 
                                    'CAD_FMR1KO_GFP_neurite_rep1', 'CAD_FMR1KO_GFP_neurite_rep2', 'CAD_FMR1KO_GFP_neurite_rep3', 
                                    'CAD_FMR1KO_GFP_soma_rep1', 'CAD_FMR1KO_GFP_soma_rep2', 'CAD_FMR1KO_GFP_soma_rep3', 
                                    'CAD_FMR1KO_RGG_neurite_rep1', 'CAD_FMR1KO_RGG_neurite_rep2', 'CAD_FMR1KO_RGG_neurite_rep3', 
                                    'CAD_FMR1KO_RGG_soma_rep1', 'CAD_FMR1KO_RGG_soma_rep2', 'CAD_FMR1KO_RGG_soma_rep3', 
                                    'CAD_LoxP_GFP_neurite_rep1', 'CAD_LoxP_GFP_neurite_rep2', 'CAD_LoxP_GFP_neurite_rep3', 
                                    'CAD_LoxP_GFP_soma_rep1', 'CAD_LoxP_GFP_soma_rep2', 'CAD_LoxP_GFP_soma_rep3'),
                      compartment = c(rep('Neurite', 3), rep('Soma', 3), rep('Neurite', 3), rep('Soma', 3), rep('Neurite', 3), rep('Soma', 3), rep('Neurite', 3), rep('Soma', 3)),
                      genotype = c(rep('FLrescue', 6), rep('GFPrescue', 6), rep('RGGrescue', 6), rep('WT', 6)))

ddsTxi <- DESeqDataSetFromTximport(txi, colData = samples, design = ~ compartment + genotype + compartment:genotype)
dds <- DESeq(ddsTxi)
normcounts <- counts(dds, normalized = TRUE)


#Compare expression of CAD_FMR1KO_FMR1_neurite_rep1 to CAD_FMR1KO_FMR1_soma_rep1 using both tpms and normalized counts
t.tpms <- tpms %>%
  as.data.frame() %>%
  rownames_to_column(., var = 'ensembl_gene_id') %>%
  rowwise() %>%
  mutate(., minexp = min(CAD_FMR1KO_FMR1_neurite_rep1, CAD_FMR1KO_FMR1_neurite_rep2, CAD_FMR1KO_FMR1_neurite_rep3, 
                                    CAD_FMR1KO_FMR1_soma_rep1, CAD_FMR1KO_FMR1_soma_rep2, CAD_FMR1KO_FMR1_soma_rep3, 
                                    CAD_FMR1KO_GFP_neurite_rep1, CAD_FMR1KO_GFP_neurite_rep2, CAD_FMR1KO_GFP_neurite_rep3, 
                                    CAD_FMR1KO_GFP_soma_rep1, CAD_FMR1KO_GFP_soma_rep2, CAD_FMR1KO_GFP_soma_rep3, 
                                    CAD_FMR1KO_RGG_neurite_rep1, CAD_FMR1KO_RGG_neurite_rep2, CAD_FMR1KO_RGG_neurite_rep3, 
                                    CAD_FMR1KO_RGG_soma_rep1, CAD_FMR1KO_RGG_soma_rep2, CAD_FMR1KO_RGG_soma_rep3, 
                                    CAD_LoxP_GFP_neurite_rep1, CAD_LoxP_GFP_neurite_rep2, CAD_LoxP_GFP_neurite_rep3, 
                                    CAD_LoxP_GFP_soma_rep1, CAD_LoxP_GFP_soma_rep2, CAD_LoxP_GFP_soma_rep3)) %>%
  filter(., minexp >= 10) %>%
  mutate(., ratio = log2(CAD_FMR1KO_FMR1_neurite_rep1 / CAD_FMR1KO_FMR1_soma_rep1)) %>%
  dplyr::select(., c('ensembl_gene_id', 'ratio'))

t.normcounts <- normcounts %>%
  as.data.frame() %>%
  rownames_to_column(., var = 'ensembl_gene_id') %>%
  rowwise() %>%
  mutate(., minexp = min(CAD_FMR1KO_FMR1_neurite_rep1, CAD_FMR1KO_FMR1_neurite_rep2, CAD_FMR1KO_FMR1_neurite_rep3, 
                                    CAD_FMR1KO_FMR1_soma_rep1, CAD_FMR1KO_FMR1_soma_rep2, CAD_FMR1KO_FMR1_soma_rep3, 
                                    CAD_FMR1KO_GFP_neurite_rep1, CAD_FMR1KO_GFP_neurite_rep2, CAD_FMR1KO_GFP_neurite_rep3, 
                                    CAD_FMR1KO_GFP_soma_rep1, CAD_FMR1KO_GFP_soma_rep2, CAD_FMR1KO_GFP_soma_rep3, 
                                    CAD_FMR1KO_RGG_neurite_rep1, CAD_FMR1KO_RGG_neurite_rep2, CAD_FMR1KO_RGG_neurite_rep3, 
                                    CAD_FMR1KO_RGG_soma_rep1, CAD_FMR1KO_RGG_soma_rep2, CAD_FMR1KO_RGG_soma_rep3, 
                                    CAD_LoxP_GFP_neurite_rep1, CAD_LoxP_GFP_neurite_rep2, CAD_LoxP_GFP_neurite_rep3, 
                                    CAD_LoxP_GFP_soma_rep1, CAD_LoxP_GFP_soma_rep2, CAD_LoxP_GFP_soma_rep3)) %>%
  filter(., minexp >= 100) %>%
  mutate(., ratio = log2(CAD_FMR1KO_FMR1_neurite_rep1 / CAD_FMR1KO_FMR1_soma_rep1)) %>%
  dplyr::select(., c('ensembl_gene_id', 'ratio'))

t <- inner_join(t.tpms, t.normcounts, by = 'ensembl_gene_id', suffix = c('.tpms', '.normcounts'))

enter image description here

tximport deseq2 • 1.3k views
ADD COMMENT
2
Entering edit mode
@mikelove
Last seen 1 day ago
United States

The TPM values are constrained such that the sum is constant. The normalized counts however do not have such a constraint. Instead the median ratio normalization attempts to center the distribution of log ratios on 0 (minimizing differential expression). Depending on the goals, robust normalization may make sense for your task.

ADD COMMENT

Login before adding your answer.

Traffic: 572 users visited in the last hour
Help About
FAQ
Access RSS
API
Stats

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.

Powered by the version 2.3.6