Question: Correlation of gene expression values using tpms and normalized counts
0
gravatar for taliaferrojm
7 months ago by
taliaferrojm0 wrote:

Hi Mike,

I have two RNA seq samples. I quantified them both using salmon. I then want log2 fold change in expression at the gene level (forget p values and replicates for now). I compiled gene-level expression values using tximport. However, if I take the ratio of tpm values, I get a different result than if I take the ratio of DEseq-calculated normalized counts. The two ratios are perfectly correlated, but the tpm ratios are offset from 0 in a way that doesn’t make biological sense to me (see attached plot). Is there some kind of normalization that happens for tpm values that is causing this?

I'm sure I'm missing something here that is very obvious. Thanks for your help.

library(tidyverse)
library(DESeq2)
library(tximport)
library(biomaRt)


#Get tx/gene relationships
mart <- biomaRt::useMart("ENSEMBL_MART_ENSEMBL", dataset = "mmusculus_gene_ensembl", host='www.ensembl.org')
t2g <- biomaRt::getBM(attributes = c('ensembl_transcript_id', 'ensembl_gene_id', 'external_gene_name', 'refseq_mrna'), mart = mart)
t2g <- dplyr::rename(t2g, target_id= ensembl_transcript_id, ext_gene = external_gene_name)
ens2gene <- t2g[,c(2,3)]
colnames(ens2gene)[2] <- 'Gene'
ens2gene <- unique(ens2gene)

#tximport
base_dir <- '~/Documents/MIT/Localization/Fmr1Deletion/Fmr1Rescue/salmon'

sample_id <- c('CAD_FMR1KO_FMR1_neurite_rep1', 'CAD_FMR1KO_FMR1_neurite_rep2', 'CAD_FMR1KO_FMR1_neurite_rep3', 
               'CAD_FMR1KO_FMR1_soma_rep1', 'CAD_FMR1KO_FMR1_soma_rep2', 'CAD_FMR1KO_FMR1_soma_rep3', 
               'CAD_FMR1KO_GFP_neurite_rep1', 'CAD_FMR1KO_GFP_neurite_rep2', 'CAD_FMR1KO_GFP_neurite_rep3', 
               'CAD_FMR1KO_GFP_soma_rep1', 'CAD_FMR1KO_GFP_soma_rep2', 'CAD_FMR1KO_GFP_soma_rep3', 
               'CAD_FMR1KO_RGG_neurite_rep1', 'CAD_FMR1KO_RGG_neurite_rep2', 'CAD_FMR1KO_RGG_neurite_rep3', 
               'CAD_FMR1KO_RGG_soma_rep1', 'CAD_FMR1KO_RGG_soma_rep2', 'CAD_FMR1KO_RGG_soma_rep3', 
               'CAD_LoxP_GFP_neurite_rep1', 'CAD_LoxP_GFP_neurite_rep2', 'CAD_LoxP_GFP_neurite_rep3', 
               'CAD_LoxP_GFP_soma_rep1', 'CAD_LoxP_GFP_soma_rep2', 'CAD_LoxP_GFP_soma_rep3')
salm_dirs <- sapply(sample_id, function(id) file.path(base_dir, id, 'quant.sf'))
tx2gene <- t2g[,c(1,2)]
colnames(tx2gene) <- c('TXNAME', 'GENEID')

txi <- tximport(salm_dirs, type = 'salmon', tx2gene = tx2gene, dropInfReps = TRUE, countsFromAbundance = 'lengthScaledTPM')
tpms <- data.frame(txi$abundance)


#DESeq
samples <- data.frame(row.names = c('CAD_FMR1KO_FMR1_neurite_rep1', 'CAD_FMR1KO_FMR1_neurite_rep2', 'CAD_FMR1KO_FMR1_neurite_rep3', 
                                    'CAD_FMR1KO_FMR1_soma_rep1', 'CAD_FMR1KO_FMR1_soma_rep2', 'CAD_FMR1KO_FMR1_soma_rep3', 
                                    'CAD_FMR1KO_GFP_neurite_rep1', 'CAD_FMR1KO_GFP_neurite_rep2', 'CAD_FMR1KO_GFP_neurite_rep3', 
                                    'CAD_FMR1KO_GFP_soma_rep1', 'CAD_FMR1KO_GFP_soma_rep2', 'CAD_FMR1KO_GFP_soma_rep3', 
                                    'CAD_FMR1KO_RGG_neurite_rep1', 'CAD_FMR1KO_RGG_neurite_rep2', 'CAD_FMR1KO_RGG_neurite_rep3', 
                                    'CAD_FMR1KO_RGG_soma_rep1', 'CAD_FMR1KO_RGG_soma_rep2', 'CAD_FMR1KO_RGG_soma_rep3', 
                                    'CAD_LoxP_GFP_neurite_rep1', 'CAD_LoxP_GFP_neurite_rep2', 'CAD_LoxP_GFP_neurite_rep3', 
                                    'CAD_LoxP_GFP_soma_rep1', 'CAD_LoxP_GFP_soma_rep2', 'CAD_LoxP_GFP_soma_rep3'),
                      compartment = c(rep('Neurite', 3), rep('Soma', 3), rep('Neurite', 3), rep('Soma', 3), rep('Neurite', 3), rep('Soma', 3), rep('Neurite', 3), rep('Soma', 3)),
                      genotype = c(rep('FLrescue', 6), rep('GFPrescue', 6), rep('RGGrescue', 6), rep('WT', 6)))

ddsTxi <- DESeqDataSetFromTximport(txi, colData = samples, design = ~ compartment + genotype + compartment:genotype)
dds <- DESeq(ddsTxi)
normcounts <- counts(dds, normalized = TRUE)


#Compare expression of CAD_FMR1KO_FMR1_neurite_rep1 to CAD_FMR1KO_FMR1_soma_rep1 using both tpms and normalized counts
t.tpms <- tpms %>%
  as.data.frame() %>%
  rownames_to_column(., var = 'ensembl_gene_id') %>%
  rowwise() %>%
  mutate(., minexp = min(CAD_FMR1KO_FMR1_neurite_rep1, CAD_FMR1KO_FMR1_neurite_rep2, CAD_FMR1KO_FMR1_neurite_rep3, 
                                    CAD_FMR1KO_FMR1_soma_rep1, CAD_FMR1KO_FMR1_soma_rep2, CAD_FMR1KO_FMR1_soma_rep3, 
                                    CAD_FMR1KO_GFP_neurite_rep1, CAD_FMR1KO_GFP_neurite_rep2, CAD_FMR1KO_GFP_neurite_rep3, 
                                    CAD_FMR1KO_GFP_soma_rep1, CAD_FMR1KO_GFP_soma_rep2, CAD_FMR1KO_GFP_soma_rep3, 
                                    CAD_FMR1KO_RGG_neurite_rep1, CAD_FMR1KO_RGG_neurite_rep2, CAD_FMR1KO_RGG_neurite_rep3, 
                                    CAD_FMR1KO_RGG_soma_rep1, CAD_FMR1KO_RGG_soma_rep2, CAD_FMR1KO_RGG_soma_rep3, 
                                    CAD_LoxP_GFP_neurite_rep1, CAD_LoxP_GFP_neurite_rep2, CAD_LoxP_GFP_neurite_rep3, 
                                    CAD_LoxP_GFP_soma_rep1, CAD_LoxP_GFP_soma_rep2, CAD_LoxP_GFP_soma_rep3)) %>%
  filter(., minexp >= 10) %>%
  mutate(., ratio = log2(CAD_FMR1KO_FMR1_neurite_rep1 / CAD_FMR1KO_FMR1_soma_rep1)) %>%
  dplyr::select(., c('ensembl_gene_id', 'ratio'))

t.normcounts <- normcounts %>%
  as.data.frame() %>%
  rownames_to_column(., var = 'ensembl_gene_id') %>%
  rowwise() %>%
  mutate(., minexp = min(CAD_FMR1KO_FMR1_neurite_rep1, CAD_FMR1KO_FMR1_neurite_rep2, CAD_FMR1KO_FMR1_neurite_rep3, 
                                    CAD_FMR1KO_FMR1_soma_rep1, CAD_FMR1KO_FMR1_soma_rep2, CAD_FMR1KO_FMR1_soma_rep3, 
                                    CAD_FMR1KO_GFP_neurite_rep1, CAD_FMR1KO_GFP_neurite_rep2, CAD_FMR1KO_GFP_neurite_rep3, 
                                    CAD_FMR1KO_GFP_soma_rep1, CAD_FMR1KO_GFP_soma_rep2, CAD_FMR1KO_GFP_soma_rep3, 
                                    CAD_FMR1KO_RGG_neurite_rep1, CAD_FMR1KO_RGG_neurite_rep2, CAD_FMR1KO_RGG_neurite_rep3, 
                                    CAD_FMR1KO_RGG_soma_rep1, CAD_FMR1KO_RGG_soma_rep2, CAD_FMR1KO_RGG_soma_rep3, 
                                    CAD_LoxP_GFP_neurite_rep1, CAD_LoxP_GFP_neurite_rep2, CAD_LoxP_GFP_neurite_rep3, 
                                    CAD_LoxP_GFP_soma_rep1, CAD_LoxP_GFP_soma_rep2, CAD_LoxP_GFP_soma_rep3)) %>%
  filter(., minexp >= 100) %>%
  mutate(., ratio = log2(CAD_FMR1KO_FMR1_neurite_rep1 / CAD_FMR1KO_FMR1_soma_rep1)) %>%
  dplyr::select(., c('ensembl_gene_id', 'ratio'))

t <- inner_join(t.tpms, t.normcounts, by = 'ensembl_gene_id', suffix = c('.tpms', '.normcounts'))

enter image description here

deseq2 tximport • 222 views
ADD COMMENTlink modified 7 months ago by Michael Love26k • written 7 months ago by taliaferrojm0
Answer: Correlation of gene expression values using tpms and normalized counts
1
gravatar for Michael Love
7 months ago by
Michael Love26k
United States
Michael Love26k wrote:

The TPM values are constrained such that the sum is constant. The normalized counts however do not have such a constraint. Instead the median ratio normalization attempts to center the distribution of log ratios on 0 (minimizing differential expression). Depending on the goals, robust normalization may make sense for your task.

ADD COMMENTlink modified 7 months ago • written 7 months ago by Michael Love26k
Please log in to add an answer.

Help
Access

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.
Powered by Biostar version 16.09
Traffic: 379 users visited in the last hour