Dear all,
I am using Limma for differential expression analysis of proteomics data. I am comparing muscle fiber types from isolated single muscle fibers and although the linear model and the fold changes look alright, the P-values of the protein comparisons look oddly low and the number of signifficant hits is too high (around 1200 out of 1600 proteins are signifficantly different between two groups).
Has anyone experienced something similar while working with Limma? Any suggestions on what could be happening for the P-values to be so weird? In case my n-sizes were too big, any ideas on other tools I could use for differential expression analysis of such a dataset?
Thanks in advance!
Roger
My code:
#My data matrix looks something like this:
data_norm[1:5, 1:5]
#This is my n size:
table(metadata_filtered$Fiber_type)
#First I created my factor vector with 5 levels corresponding to my 5 groups:
vector_factor_fiber_type <- factor(metadata_filtered$Fiber_type, levels = c("Type 1", "Hybrid 1/2A", "Type 2A", "Hybrid 2A/2X", "Type 2X"))
#Then I create a design matrix of 1s and 0s.
matrix_design_fiber_type <- model.matrix(~0+vector_factor_fiber_type)
#Changing the column names to be supported by Limma.
colnames(matrix_design_fiber_type) <- c("Type_1", "Hybrid_1_2A", "Type_2A", "Hybrid_2A_2X", "Type_2X")
#First fit using the design matrix.
fit_fiber_type <- lmFit(object = data_norm,
design = matrix_design_fiber_type)
#I created a contrast matrix with all possible comparisons and defined the levels from my design matrix.
matrix_contrast_fiber_type <- makeContrasts(a = Type_1-Type_2A,
b = Type_2A-Type_2X,
c = Type_1-Type_2X,
d = Type_1-Hybrid_1_2A,
e = Type_1-Hybrid_2A_2X,
f = Hybrid_1_2A-Type_2A,
g = Type_2A-Hybrid_2A_2X,
h = Hybrid_1_2A-Type_2X,
i = Hybrid_2A_2X-Type_2X,
j = Hybrid_1_2A-Hybrid_2A_2X,
levels=matrix_design_fiber_type)
#Second fit using the contrast matrix.
fit2_fiber_type <- contrasts.fit(fit = fit_fiber_type,
contrasts = matrix_contrast_fiber_type)
#Running eBayes on my fit.
fit2_fiber_type <- limma::eBayes(fit2_fiber_type, trend=TRUE, robust=TRUE)
#Using top table to check the top signifficantly different genes from first comparison:
topTable(fit2_fiber_type, coef="a", adjust="BH", number = 20)
#As you see the P-values are completely off and do not make sense, in fact this is the number of signifficantly different proteins using this analysis:
colSums(abs(decideTests(fit2_fiber_type)))
#When I plot the mean of genes between two groups, the linear model seems to be working:
plot(rowMeans(data_norm[, metadata_filtered$Fiber_type == "Type 1"], na.rm = TRUE),
rowMeans(data_norm[, metadata_filtered$Fiber_type == "Type 2A"], na.rm = TRUE))
![Plotting the mean of genes for two groups][1]
sessionInfo( )
Results of running this code:
> #My data matrix looks something like this:
> data_norm[1:5, 1:5]
FOR2_fibNumber90 FOR9_fibNumber118 FOR4_fibNumber31
IGKV3-7 12.30317 NA NA
IGKV3D-11 11.26052 NA NA
ESYT2 11.71254 8.97330 12.14731
ILVBL NA NA 12.68877
FSD2 12.87012 13.35789 12.98480
FOR9_fibNumber200 FOR10_fibNumber77
IGKV3-7 13.883571 12.01044
IGKV3D-11 14.443226 12.86361
ESYT2 11.411228 11.72629
ILVBL 9.452942 12.40145
FSD2 13.630399 13.09411
>
> #This is my n size:
> table(metadata_filtered$Fiber_type)
Hybrid 1/2A Hybrid 2A/2X Type 1 Type 2A Type 2X
78 90 371 469 16
>
> #First I created my factor vector with 5 levels corresponding to my 5 groups:
> vector_factor_fiber_type <- factor(metadata_filtered$Fiber_type, levels = c("Type 1", "Hybrid 1/2A", "Type 2A", "Hybrid 2A/2X", "Type 2X"))
>
> #Then I create a design matrix of 1s and 0s.
> matrix_design_fiber_type <- model.matrix(~0+vector_factor_fiber_type)
>
> #Changing the column names to be supported by Limma.
> colnames(matrix_design_fiber_type) <- c("Type_1", "Hybrid_1_2A", "Type_2A", "Hybrid_2A_2X", "Type_2X")
>
> #First fit using the design matrix.
> fit_fiber_type <- lmFit(object = data_norm,
+ design = matrix_design_fiber_type)
>
> #I created a contrast matrix with all possible comparisons and defined the levels from my design matrix.
> matrix_contrast_fiber_type <- makeContrasts(a = Type_1-Type_2A,
+ b = Type_2A-Type_2X,
+ c = Type_1-Type_2X,
+ d = Type_1-Hybrid_1_2A,
+ e = Type_1-Hybrid_2A_2X,
+ f = Hybrid_1_2A-Type_2A,
+ g = Type_2A-Hybrid_2A_2X,
+ h = Hybrid_1_2A-Type_2X,
+ i = Hybrid_2A_2X-Type_2X,
+ j = Hybrid_1_2A-Hybrid_2A_2X,
+ levels=matrix_design_fiber_type)
>
> #Second fit using the contrast matrix.
> fit2_fiber_type <- contrasts.fit(fit = fit_fiber_type,
+ contrasts = matrix_contrast_fiber_type)
>
> #Running eBayes on my fit.
> fit2_fiber_type <- limma::eBayes(fit2_fiber_type, trend=TRUE, robust=TRUE)
>
> #Using top table to check the top signifficantly different genes from first comparison:
> topTable(fit2_fiber_type, coef="a", adjust="BH", number = 20)
logFC AveExpr t P.Value adj.P.Val B
TPM3 4.410328 17.94829 91.22853 0.000000e+00 0.000000e+00 1124.0425
MYH7 3.317546 21.17579 82.59094 0.000000e+00 0.000000e+00 1034.4054
ATP2A2 4.428647 15.21101 82.35313 0.000000e+00 0.000000e+00 1022.7663
TNNT1 5.624232 16.99821 80.73556 0.000000e+00 0.000000e+00 994.8489
MYH2 -5.568047 20.21235 -77.10580 0.000000e+00 0.000000e+00 972.7754
MYBPC2 -4.446611 17.59789 -67.77104 0.000000e+00 0.000000e+00 838.7414
TNNC1 5.071321 16.58750 64.21913 0.000000e+00 0.000000e+00 799.1449
MYL3 5.657620 19.26718 62.66715 0.000000e+00 0.000000e+00 797.8367
MYLPF -4.523424 18.82686 -62.23455 0.000000e+00 0.000000e+00 791.8598
TNNI1 5.717911 17.93332 61.36695 0.000000e+00 0.000000e+00 774.4269
TNNC2 -4.392147 17.32716 -61.05789 0.000000e+00 0.000000e+00 753.4938
TNNT3 -3.418994 19.83965 -55.74547 2.100587e-312 2.912814e-310 705.6579
TNNI2 -4.616737 17.80521 -56.14805 6.617481e-310 8.470376e-308 699.9382
MYL2 4.551580 18.79123 50.33117 5.560550e-279 6.609111e-277 628.7131
DPYSL3 1.917872 13.39997 50.09703 9.315113e-272 1.033357e-269 612.1222
AK1 -1.317138 19.23314 -48.97063 1.293570e-270 1.345313e-268 609.4496
ATP2A1 -1.630960 19.29170 -46.62393 1.590511e-255 1.556830e-253 574.7092
MYL6B 3.653388 14.78789 46.08069 8.817276e-252 8.151082e-250 566.0914
PHKA1 -1.069066 14.57353 -44.66545 1.015248e-242 8.891437e-241 545.2288
PHKB -1.144642 14.16812 -43.21865 4.667290e-233 3.883185e-231 522.9846
>
> #As you see the P-values are completely off and do not make sense, in fact this is the number of signifficantly different proteins using this analysis:
> colSums(abs(decideTests(fit2_fiber_type)))
a b c d e f g h i j
1210 439 683 701 1122 150 835 383 11 662
>
> #When I plot the mean of genes between two groups, the linear model seems to be working:
> plot(rowMeans(data_norm[, metadata_filtered$Fiber_type == "Type 1"], na.rm = TRUE),
+ rowMeans(data_norm[, metadata_filtered$Fiber_type == "Type 2A"], na.rm = TRUE))
>
>
> sessionInfo()
R version 4.1.2 (2021-11-01)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 19044)
Matrix products: default
locale:
[1] LC_COLLATE=English_United Kingdom.1252
[2] LC_CTYPE=English_United Kingdom.1252
[3] LC_MONETARY=English_United Kingdom.1252
[4] LC_NUMERIC=C
[5] LC_TIME=English_United Kingdom.1252
attached base packages:
[1] grid stats4 stats graphics grDevices utils datasets
[8] methods base
other attached packages:
[1] RobNorm_0.1.0 VIM_6.1.1 colorspace_2.0-3
[4] clusterProfiler_4.2.2 data.table_1.14.2 forcats_0.5.1
[7] stringr_1.4.0 dplyr_1.0.8 purrr_0.3.4
[10] readr_2.1.2 tidyr_1.2.0 tibble_3.1.6
[13] tidyverse_1.3.1 GO.db_3.14.0 org.Hs.eg.db_3.14.0
[16] AnnotationDbi_1.56.2 IRanges_2.28.0 DT_0.21
[19] plotly_4.10.0 MSnbase_2.20.4 ProtGenerics_1.26.0
[22] S4Vectors_0.32.3 mzR_2.28.0 Rcpp_1.0.8
[25] Biobase_2.54.0 BiocGenerics_0.40.0 PhosR_1.4.0
[28] proBatch_1.10.0 factoextra_1.0.7 ggplot2_3.3.5
[31] umap_0.2.7.0 limma_3.50.1
#More packages loaded but no more space allowed when posting, please ask in case of doubt