Question

Correlation of gene expression values using tpms and normalized counts

2

Entering edit mode

taliaferrojm ▴ 20

@taliaferrojm-20819

Last seen 4.9 years ago

Hi Mike,

I have two RNA seq samples. I quantified them both using salmon. I then want log2 fold change in expression at the gene level (forget p values and replicates for now). I compiled gene-level expression values using tximport. However, if I take the ratio of tpm values, I get a different result than if I take the ratio of DEseq-calculated normalized counts. The two ratios are perfectly correlated, but the tpm ratios are offset from 0 in a way that doesn’t make biological sense to me (see attached plot). Is there some kind of normalization that happens for tpm values that is causing this?

I'm sure I'm missing something here that is very obvious. Thanks for your help.

library(tidyverse)
library(DESeq2)
library(tximport)
library(biomaRt)


#Get tx/gene relationships
mart <- biomaRt::useMart("ENSEMBL_MART_ENSEMBL", dataset = "mmusculus_gene_ensembl", host='www.ensembl.org')
t2g <- biomaRt::getBM(attributes = c('ensembl_transcript_id', 'ensembl_gene_id', 'external_gene_name', 'refseq_mrna'), mart = mart)
t2g <- dplyr::rename(t2g, target_id= ensembl_transcript_id, ext_gene = external_gene_name)
ens2gene <- t2g[,c(2,3)]
colnames(ens2gene)[2] <- 'Gene'
ens2gene <- unique(ens2gene)

#tximport
base_dir <- '~/Documents/MIT/Localization/Fmr1Deletion/Fmr1Rescue/salmon'

sample_id <- c('CAD_FMR1KO_FMR1_neurite_rep1', 'CAD_FMR1KO_FMR1_neurite_rep2', 'CAD_FMR1KO_FMR1_neurite_rep3', 
               'CAD_FMR1KO_FMR1_soma_rep1', 'CAD_FMR1KO_FMR1_soma_rep2', 'CAD_FMR1KO_FMR1_soma_rep3', 
               'CAD_FMR1KO_GFP_neurite_rep1', 'CAD_FMR1KO_GFP_neurite_rep2', 'CAD_FMR1KO_GFP_neurite_rep3', 
               'CAD_FMR1KO_GFP_soma_rep1', 'CAD_FMR1KO_GFP_soma_rep2', 'CAD_FMR1KO_GFP_soma_rep3', 
               'CAD_FMR1KO_RGG_neurite_rep1', 'CAD_FMR1KO_RGG_neurite_rep2', 'CAD_FMR1KO_RGG_neurite_rep3', 
               'CAD_FMR1KO_RGG_soma_rep1', 'CAD_FMR1KO_RGG_soma_rep2', 'CAD_FMR1KO_RGG_soma_rep3', 
               'CAD_LoxP_GFP_neurite_rep1', 'CAD_LoxP_GFP_neurite_rep2', 'CAD_LoxP_GFP_neurite_rep3', 
               'CAD_LoxP_GFP_soma_rep1', 'CAD_LoxP_GFP_soma_rep2', 'CAD_LoxP_GFP_soma_rep3')
salm_dirs <- sapply(sample_id, function(id) file.path(base_dir, id, 'quant.sf'))
tx2gene <- t2g[,c(1,2)]
colnames(tx2gene) <- c('TXNAME', 'GENEID')

txi <- tximport(salm_dirs, type = 'salmon', tx2gene = tx2gene, dropInfReps = TRUE, countsFromAbundance = 'lengthScaledTPM')
tpms <- data.frame(txi$abundance)


#DESeq
samples <- data.frame(row.names = c('CAD_FMR1KO_FMR1_neurite_rep1', 'CAD_FMR1KO_FMR1_neurite_rep2', 'CAD_FMR1KO_FMR1_neurite_rep3', 
                                    'CAD_FMR1KO_FMR1_soma_rep1', 'CAD_FMR1KO_FMR1_soma_rep2', 'CAD_FMR1KO_FMR1_soma_rep3', 
                                    'CAD_FMR1KO_GFP_neurite_rep1', 'CAD_FMR1KO_GFP_neurite_rep2', 'CAD_FMR1KO_GFP_neurite_rep3', 
                                    'CAD_FMR1KO_GFP_soma_rep1', 'CAD_FMR1KO_GFP_soma_rep2', 'CAD_FMR1KO_GFP_soma_rep3', 
                                    'CAD_FMR1KO_RGG_neurite_rep1', 'CAD_FMR1KO_RGG_neurite_rep2', 'CAD_FMR1KO_RGG_neurite_rep3', 
                                    'CAD_FMR1KO_RGG_soma_rep1', 'CAD_FMR1KO_RGG_soma_rep2', 'CAD_FMR1KO_RGG_soma_rep3', 
                                    'CAD_LoxP_GFP_neurite_rep1', 'CAD_LoxP_GFP_neurite_rep2', 'CAD_LoxP_GFP_neurite_rep3', 
                                    'CAD_LoxP_GFP_soma_rep1', 'CAD_LoxP_GFP_soma_rep2', 'CAD_LoxP_GFP_soma_rep3'),
                      compartment = c(rep('Neurite', 3), rep('Soma', 3), rep('Neurite', 3), rep('Soma', 3), rep('Neurite', 3), rep('Soma', 3), rep('Neurite', 3), rep('Soma', 3)),
                      genotype = c(rep('FLrescue', 6), rep('GFPrescue', 6), rep('RGGrescue', 6), rep('WT', 6)))

ddsTxi <- DESeqDataSetFromTximport(txi, colData = samples, design = ~ compartment + genotype + compartment:genotype)
dds <- DESeq(ddsTxi)
normcounts <- counts(dds, normalized = TRUE)


#Compare expression of CAD_FMR1KO_FMR1_neurite_rep1 to CAD_FMR1KO_FMR1_soma_rep1 using both tpms and normalized counts
t.tpms <- tpms %>%
  as.data.frame() %>%
  rownames_to_column(., var = 'ensembl_gene_id') %>%
  rowwise() %>%
  mutate(., minexp = min(CAD_FMR1KO_FMR1_neurite_rep1, CAD_FMR1KO_FMR1_neurite_rep2, CAD_FMR1KO_FMR1_neurite_rep3, 
                                    CAD_FMR1KO_FMR1_soma_rep1, CAD_FMR1KO_FMR1_soma_rep2, CAD_FMR1KO_FMR1_soma_rep3, 
                                    CAD_FMR1KO_GFP_neurite_rep1, CAD_FMR1KO_GFP_neurite_rep2, CAD_FMR1KO_GFP_neurite_rep3, 
                                    CAD_FMR1KO_GFP_soma_rep1, CAD_FMR1KO_GFP_soma_rep2, CAD_FMR1KO_GFP_soma_rep3, 
                                    CAD_FMR1KO_RGG_neurite_rep1, CAD_FMR1KO_RGG_neurite_rep2, CAD_FMR1KO_RGG_neurite_rep3, 
                                    CAD_FMR1KO_RGG_soma_rep1, CAD_FMR1KO_RGG_soma_rep2, CAD_FMR1KO_RGG_soma_rep3, 
                                    CAD_LoxP_GFP_neurite_rep1, CAD_LoxP_GFP_neurite_rep2, CAD_LoxP_GFP_neurite_rep3, 
                                    CAD_LoxP_GFP_soma_rep1, CAD_LoxP_GFP_soma_rep2, CAD_LoxP_GFP_soma_rep3)) %>%
  filter(., minexp >= 10) %>%
  mutate(., ratio = log2(CAD_FMR1KO_FMR1_neurite_rep1 / CAD_FMR1KO_FMR1_soma_rep1)) %>%
  dplyr::select(., c('ensembl_gene_id', 'ratio'))

t.normcounts <- normcounts %>%
  as.data.frame() %>%
  rownames_to_column(., var = 'ensembl_gene_id') %>%
  rowwise() %>%
  mutate(., minexp = min(CAD_FMR1KO_FMR1_neurite_rep1, CAD_FMR1KO_FMR1_neurite_rep2, CAD_FMR1KO_FMR1_neurite_rep3, 
                                    CAD_FMR1KO_FMR1_soma_rep1, CAD_FMR1KO_FMR1_soma_rep2, CAD_FMR1KO_FMR1_soma_rep3, 
                                    CAD_FMR1KO_GFP_neurite_rep1, CAD_FMR1KO_GFP_neurite_rep2, CAD_FMR1KO_GFP_neurite_rep3, 
                                    CAD_FMR1KO_GFP_soma_rep1, CAD_FMR1KO_GFP_soma_rep2, CAD_FMR1KO_GFP_soma_rep3, 
                                    CAD_FMR1KO_RGG_neurite_rep1, CAD_FMR1KO_RGG_neurite_rep2, CAD_FMR1KO_RGG_neurite_rep3, 
                                    CAD_FMR1KO_RGG_soma_rep1, CAD_FMR1KO_RGG_soma_rep2, CAD_FMR1KO_RGG_soma_rep3, 
                                    CAD_LoxP_GFP_neurite_rep1, CAD_LoxP_GFP_neurite_rep2, CAD_LoxP_GFP_neurite_rep3, 
                                    CAD_LoxP_GFP_soma_rep1, CAD_LoxP_GFP_soma_rep2, CAD_LoxP_GFP_soma_rep3)) %>%
  filter(., minexp >= 100) %>%
  mutate(., ratio = log2(CAD_FMR1KO_FMR1_neurite_rep1 / CAD_FMR1KO_FMR1_soma_rep1)) %>%
  dplyr::select(., c('ensembl_gene_id', 'ratio'))

t <- inner_join(t.tpms, t.normcounts, by = 'ensembl_gene_id', suffix = c('.tpms', '.normcounts'))

enter image description here

tximport deseq2 • 1.1k views

ADD COMMENT • link updated 4.9 years ago by Michael Love 41k • written 4.9 years ago by taliaferrojm ▴ 20

score 2 · Accepted Answer · 2019-05-17

2

Entering edit mode

Michael Love 41k

@mikelove

Last seen 2 hours ago

United States

The TPM values are constrained such that the sum is constant. The normalized counts however do not have such a constraint. Instead the median ratio normalization attempts to center the distribution of log ratios on 0 (minimizing differential expression). Depending on the goals, robust normalization may make sense for your task.

ADD COMMENT • link 4.9 years ago Michael Love 41k