The compare result is different in score and Conc in DiffBind
0
Entering edit mode
@shangguandong1996-21805
Last seen 26 days ago

Hi, Dr Stark

I have a question about the score argument in dba.count function, I know the score will not be used in the final diff result, it is only be used in plot. But I find there are many peaks that whose realtive size relationship of treat and control is different between Conc(which is showed in final dba.report) and score. Let's make a example

library(DiffBind)
library(BiocParallel)

library(dplyr)

tamoxifen <- dba(sampleSheet="tamoxifen.csv")
tamoxifen <- dba.count(tamoxifen, summits=250)


# get compare(Conc) result ------------------------------------------------------

compare_function <- function(group1, group2){
  dba_contrast <- dba.contrast(tamoxifen, 
                               group1 = tamoxifen$masks[[group1]],
                               group2 = tamoxifen$masks[[group2]],
                               name1 = group1,
                               name2 = group2)

  dba_analyze <- dba.analyze(dba_contrast)

  dba_report_all <- dba.report(dba_analyze, 
                               th = 1, 
                               precision = 0)
  dba_report_all$feature_id <- paste0("peak", names(dba_report_all))

  return(dba_report_all)
}


compare_1 <- compare_function(group1 = "BT474", 
                              group2 = "MCF7")


# get score result --------------------------------------------------------

peakScore_list <- tamoxifen$peaks
names(peakScore_list) <- tamoxifen$samples$SampleID

peakScore <- lapply(peakScore_list, function(x) {x$Score})
peakScore_mt <- do.call(cbind, peakScore)
rownames(peakScore_mt) <- paste0("peak", 1:dim(peakScore_mt)[1])

peakScore_mt[, c("BT4741", "MCF71")] + 
  peakScore_mt[, c("BT4742", "MCF72")] -> mergeData



# tidy --------------------------------------------------------------------

mergeData %>% 
  as_tibble(rownames = "feature_id") %>% 
  mutate(scoreComapre = (BT4741 > MCF71)) -> scoreCompare

compare_1 %>% 
  as_tibble() %>% 
  select(Conc_BT474, Conc_MCF7, Fold, feature_id) %>% 
  mutate(concCompare = (Fold > 0)) -> concCompare

inner_join(scoreCompare, concCompare) %>% 
  filter(scoreComapre != concCompare) %>% 
  pull(feature_id) -> paradox_peak

> length(paradox_peak)
[1] 258

And there are some apparent contradictory relative relationships. like

# peak1080 in BT474 is stronger than MCF7 
# however, in Conc compare, Conc_BT474 is smaller than Conc_MCF7
> inner_join(scoreCompare, concCompare) %>% 
+   filter(scoreComapre != concCompare) %>% 
+   arrange(Fold)
Joining, by = "feature_id"
# A tibble: 258 x 8
   feature_id BT4741 MCF71 scoreComapre Conc_BT474 Conc_MCF7  Fold concCompare
   <chr>       <dbl> <dbl> <lgl>             <dbl>     <dbl> <dbl> <lgl>      
 1 peak1080     5.31  4.33 TRUE             0.0686      5.08 -5.01 FALSE      
 2 peak213      8.12  4.33 TRUE             0.659       4.19 -3.53 FALSE      
 3 peak1061     9.42  4.33 TRUE             0.880       3.77 -2.89 FALSE      
 4 peak1369    14.8  11.0  TRUE             1.53        4.15 -2.62 FALSE      
 5 peak1530    13.4  13.3  TRUE             1.39        4.00 -2.61 FALSE      
 6 peak1081    21.6   4.33 TRUE             2.07        4.59 -2.52 FALSE      
 7 peak710     10.5   4.33 TRUE             1.07        3.44 -2.38 FALSE      
 8 peak902     21.2   4.33 TRUE             2.07        4.36 -2.30 FALSE      
 9 peak869      9.64  4.33 TRUE             0.887       3.11 -2.22 FALSE      
10 peak588     10.9   4.33 TRUE             1.08        3.29 -2.22 FALSE      
# … with 248 more rows
> sessionInfo()
R version 3.6.1 (2019-07-05)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: CentOS Linux 7 (Core)

Matrix products: default
BLAS:   /opt/sysoft/R-3.6.1/lib64/R/lib/libRblas.so
LAPACK: /opt/sysoft/R-3.6.1/lib64/R/lib/libRlapack.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=en_US.UTF-8       
 [4] LC_COLLATE=en_US.UTF-8     LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                  LC_ADDRESS=C              
[10] LC_TELEPHONE=C             LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] parallel  stats4    stats     graphics  grDevices utils     datasets  methods  
[9] base     

other attached packages:
 [1] dplyr_1.0.2                 DiffBind_2.14.0            
 [3] SummarizedExperiment_1.16.0 DelayedArray_0.12.0        
 [5] BiocParallel_1.19.6         matrixStats_0.55.0         
 [7] Biobase_2.46.0              GenomicRanges_1.38.0       
 [9] GenomeInfoDb_1.22.0         IRanges_2.20.0             
[11] S4Vectors_0.24.0            BiocGenerics_0.32.0        

loaded via a namespace (and not attached):
  [1] amap_0.8-18              colorspace_1.4-1         rjson_0.2.20            
  [4] hwriter_1.3.2            ellipsis_0.3.0           htmlTable_1.13.3        
  [7] XVector_0.26.0           base64enc_0.1-3          rstudioapi_0.10         
 [10] ggrepel_0.8.1            bit64_0.9-7              fansi_0.4.0             
 [13] AnnotationDbi_1.48.0     splines_3.6.1            geneplotter_1.64.0      
 [16] knitr_1.26               Formula_1.2-3            Rsamtools_2.2.3         
 [19] packrat_0.5.0            annotate_1.64.0          cluster_2.1.0           
 [22] GO.db_3.10.0             dbplyr_1.4.2             png_0.1-7               
 [25] pheatmap_1.0.12          graph_1.64.0             compiler_3.6.1          
 [28] httr_1.4.1               GOstats_2.52.0           backports_1.1.5         
 [31] assertthat_0.2.1         Matrix_1.2-18            cli_2.0.0               
 [34] limma_3.42.0             htmltools_0.4.0          acepack_1.4.1           
 [37] prettyunits_1.0.2        tools_3.6.1              gtable_0.3.0            
 [40] glue_1.4.1               GenomeInfoDbData_1.2.2   Category_2.52.0         
 [43] systemPipeR_1.20.0       batchtools_0.9.11        rappdirs_0.3.1          
 [46] ShortRead_1.44.0         Rcpp_1.0.3               vctrs_0.3.2             
 [49] Biostrings_2.54.0        gdata_2.18.0             rtracklayer_1.46.0      
 [52] xfun_0.19                stringr_1.4.0            lifecycle_0.2.0         
 [55] gtools_3.8.1             XML_3.98-1.20            edgeR_3.28.0            
 [58] zlibbioc_1.32.0          scales_1.1.0             BSgenome_1.54.0         
 [61] VariantAnnotation_1.32.0 hms_0.5.2                RBGL_1.62.0             
 [64] RColorBrewer_1.1-2       yaml_2.2.0               curl_4.3                
 [67] gridExtra_2.3            memoise_1.1.0            ggplot2_3.3.2           
 [70] rpart_4.1-15             biomaRt_2.42.0           latticeExtra_0.6-29     
 [73] stringi_1.4.3            RSQLite_2.1.5            genefilter_1.68.0       
 [76] checkmate_1.9.4          GenomicFeatures_1.38.0   caTools_1.17.1.3        
 [79] rlang_0.4.7              pkgconfig_2.0.3          bitops_1.0-6            
 [82] lattice_0.20-38          purrr_0.3.3              htmlwidgets_1.5.1       
 [85] GenomicAlignments_1.22.0 bit_1.1-14               tidyselect_1.1.0        
 [88] GSEABase_1.48.0          AnnotationForge_1.28.0   magrittr_1.5            
 [91] DESeq2_1.26.0            R6_2.4.1                 gplots_3.0.1.1          
 [94] generics_0.0.2           Hmisc_4.3-0              base64url_1.4           
 [97] DBI_1.1.0                foreign_0.8-73           pillar_1.4.3            
[100] withr_2.1.2              nnet_7.3-12              survival_3.1-8          
[103] RCurl_1.95-4.12          tibble_2.1.3             crayon_1.3.4            
[106] utf8_1.1.4               KernSmooth_2.23-16       BiocFileCache_1.10.0    
[109] jpeg_0.1-8.1             progress_1.2.2           locfit_1.5-9.1          
[112] grid_3.6.1               data.table_1.12.6        blob_1.2.0              
[115] Rgraphviz_2.30.0         digest_0.6.23            xtable_1.8-4            
[118] brew_1.0-6               openssl_1.4.1            munsell_0.5.0           
[121] askpass_1.1

I am sorry that I may not clearly post my quesion. :)

Best wishes

Guandong Shang

DiffBind • 273 views
ADD COMMENTlink
1
Entering edit mode
Rory Stark ♦ 3.5k
@rory-stark-5741
Last seen 1 day ago
CRUK, Cambridge, UK

There are a number of issues with the way your code does this comparison:

  • The contrast being run compares the two BT474 samples against all seven MCF7 replicates; however the comparison only looks at the first two MCF7 replicates, giving incorrect read totals . I changed this in my tests to do a 2:2 contrast.
  • By default, when you run dba.analyze(), it will apply blacklists and greylists. This also changes the default normalization to not subtract control reads. As a result, when you go back the the original tamoxifen object to check the scores, they are incorrect (you need to access the intermediate dba_analyze object).
  • Typos in the code (Comapre).

I can now run my version of your code and find no anomalies (length(paradox_peak) == 0). However I did identify a bug in how dba.report() calculated count concentrations when there are a lot of zeroes. I am checking in a fix that will appear as DiffBind_3.0.12.

ADD COMMENTlink
0
Entering edit mode

Thanks for reply, Dr Stark.

I am wondering whether you can post your modified code here including the compare and extracting score. I am sorry that I may be not very familiy with new DiffBind :)

Best wishes

Guandong Shang

ADD REPLYlink
0
Entering edit mode

And I am wondering whether you can give me some suggestion about DiffBind v2.14.0. The reason I still want to use DiffBind version before 3.0 is because the R studio in linux do not allow swtich different version R, and I also want to matain the consistency of my previous code.

I found that there are still some paradox_peaks in my own data(my data only have 2 replicate for each type), and it seems that there are no blacklists and greylists in v2.14.0. So I do not know how to eliminate these paradox_peaks.

Thanks for your DiffBind and reply, which is helpful.

Best wishes Guandong Shang

ADD REPLYlink
0
Entering edit mode

I'm not really in a position to support older versions in detail. I do know that the sites in the report should be correct in terms of p-values and FDR; if you want to re-compute the concentrations and fold changes, you can extract the binding matrix and access the scores for each peak in each sample of interest.

Here's my modified version of your code:

library(DiffBind)
library(dplyr)

tamoxifen <- dba(sampleSheet="tamoxifen.csv")
tamoxifen <- dba.count(tamoxifen, summits=250)
tamoxifen <- dba.blacklist(tamoxifen)
tamoxifen$masks$MCF712 <- c(F,F,T,T,F,F,F,F,F,F,F)

group1 = "BT474" 
group2 = "MCF712"
dba_contrast <- dba.contrast(tamoxifen, 
                             group1 = tamoxifen$masks[[group1]],
                             group2 = tamoxifen$masks[[group2]],
                             name1 = group1,
                             name2 = group2)

dba_analyze <- dba.analyze(dba_contrast)

dba_report_all <- dba.report(dba_analyze, 
                             th = 1, 
                             precision = 0,
                             bCounts=TRUE)
dba_report_all$feature_id <- paste0("peak", names(dba_report_all))

compare_1 <- dba_report_all

peakScore_list <- dba_analyze$peaks
names(peakScore_list) <- dba_analyze$samples$SampleID

peakScore <- lapply(peakScore_list, function(x) {x$Score})
peakScore_mt <- do.call(cbind, peakScore)
rownames(peakScore_mt) <- paste0("peak", 1:dim(peakScore_mt)[1])

peakScore_mt[, c("BT4741", "MCF71")] + 
    peakScore_mt[, c("BT4742", "MCF72")] -> mergeData

mergeData %>% 
    as_tibble(rownames = "feature_id") %>% 
    mutate(scoreCompare = (BT4741 > MCF71)) -> scoreCompare

compare_1 %>% 
    as_tibble() %>% 
    select(Conc_BT474, Conc_MCF712, Fold, feature_id) %>% 
    mutate(concCompare = (Fold > 0)) -> concCompare

inner_join(scoreCompare, concCompare) %>% 
    filter(scoreCompare != concCompare) %>% 
    pull(feature_id) -> paradox_peak

length(paradox_peak)

inner_join(scoreCompare, concCompare) %>% 
    filter(scoreCompare != concCompare) %>% 
    arrange(Fold)

report1 <- compare_1[order(as.integer(names(compare_1))),]
ADD REPLYlink
0
Entering edit mode

Thanks for your detailed reply, Dr Stark.

I try to extract raw count of binding matrix and using counts function in DESeq2 to get normCount. But it seems that the count is not very agree with the Conc in final dba.report. Can you give some adivce about my code ?

# DiffBind v2.14.0
peakCount_list <- dba_count$peaks
names(peakCount_list) <- dba_count$samples$SampleID

# get the rawCount of binding
lapply(peakCount_list, function(x) {
  count <- x$Reads - x$cReads
  count[count < 0] = 0

  return(count)
}) %>% 
  do.call(cbind, .) -> rawCount_forDESeq2

# rownames
rownames(rawCount_forDESeq2) <- paste0("peak_", 1:nrow(rawCount_forDESeq2))

# modify colnames
colnames(rawCount_forDESeq2) <- gsub("-", "_", colnames(rawCount_forDESeq2))

# DESeq2  -----------------------------------------------------------------

(data_time <- gsub("_R[12]", "", colnames(rawCount_forDESeq2)))
coldata <- data.frame(row.names = colnames(rawCount_forDESeq2), 
                      time = data_time, 
                      stringsAsFactors = T)

# DESeq2 set ------------------------------------------------------------------
# dds set
dds <- DESeqDataSetFromMatrix(countData = rawCount_forDESeq2, 
                              colData = coldata, 
                              design = ~ time)

dds <- DESeq(dds, 
             fitType='local')

normCount <- counts(dds, normalized = TRUE)
ADD REPLYlink
0
Entering edit mode

If you get the normalized counts using:

dba.peakset(dba_counts, bRetrieve=TRUE)

this should give you the counts used to calculate the concentrations.

ADD REPLYlink
0
Entering edit mode

Thanks Dr Stark, I will try :)

ADD REPLYlink
0
Entering edit mode
@shangguandong1996-21805
Last seen 26 days ago

I know in newer DiffBind version, the default of score argument is different, So I post a newer DiffBind version result.

library(DiffBind)
library(BiocParallel)

library(dplyr)

tamoxifen <- dba(sampleSheet="tamoxifen.csv")
tamoxifen <- dba.count(tamoxifen, summits=250)

compare_function <- function(group1, group2){
  dba_contrast <- dba.contrast(tamoxifen, 
                               group1 = tamoxifen$masks[[group1]],
                               group2 = tamoxifen$masks[[group2]],
                               name1 = group1,
                               name2 = group2)

  dba_analyze <- dba.analyze(dba_contrast)

  dba_report_all <- dba.report(dba_analyze, 
                               th = 1, 
                               precision = 0)
  dba_report_all$feature_id <- paste0("peak", names(dba_report_all))

  return(dba_report_all)
}

compare_1 <- compare_function(group1 = "BT474", 
                              group2 = "MCF7")


# get score result --------------------------------------------------------

peakScore_list <- tamoxifen$peaks
names(peakScore_list) <- tamoxifen$samples$SampleID

peakScore <- lapply(peakScore_list, function(x) {x$Score})
peakScore_mt <- do.call(cbind, peakScore)
rownames(peakScore_mt) <- paste0("peak", 1:dim(peakScore_mt)[1])

peakScore_mt[, c("BT4741", "MCF71")] + 
  peakScore_mt[, c("BT4742", "MCF72")] -> mergeData



# tidy --------------------------------------------------------------------

mergeData %>% 
  as_tibble(rownames = "feature_id") %>% 
  mutate(scoreComapre = (BT4741 > MCF71)) -> scoreCompare

compare_1 %>% 
  as_tibble() %>% 
  select(Conc_BT474, Conc_MCF7, Fold, feature_id) %>% 
  mutate(concCompare = (Fold > 0)) -> concCompare

inner_join(scoreCompare, concCompare) %>% 
  filter(scoreComapre != concCompare) %>% 
  pull(feature_id) -> paradox_peak

> length(paradox_peak)
[1] 1277

> inner_join(scoreCompare, concCompare) %>% 
+   filter(scoreComapre != concCompare) %>% 
+   arrange(Fold)
Joining, by = "feature_id"
# A tibble: 1,277 x 8
   feature_id BT4741  MCF71 scoreComapre Conc_BT474 Conc_MCF7  Fold
   <chr>       <dbl>  <dbl> <lgl>             <dbl>     <dbl> <dbl>
 1 peak188    1.10e1   4.38 TRUE              0.887      6.16 -5.28
 2 peak2401   3.67e1  16.0  TRUE              1.66       6.69 -5.03
 3 peak588    9.80e0   4.65 TRUE              1.25       6.23 -4.98
 4 peak316    1.47e1   4.38 TRUE              3.96       8.83 -4.87
 5 peak1005   2.43e0   0    TRUE              1.24       5.94 -4.70
 6 peak1716   1.05e3 616.   TRUE              1.25       5.86 -4.62
 7 peak484    1.59e1   9.30 TRUE              4.42       8.91 -4.50
 8 peak2713   5.87e1  11.2  TRUE              3.50       7.96 -4.46
 9 peak2400   7.41e0   0    TRUE              0.652      5.03 -4.38
10 peak1904   2.69e1   8.89 TRUE              2.40       6.70 -4.31
# ... with 1,267 more rows, and 1 more variable: concCompare <lgl>
ADD COMMENTlink

Login before adding your answer.

Similar Posts
Loading Similar Posts
Traffic: 217 users visited in the last hour
Help About
FAQ
Access RSS
API
Stats

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.

Powered by the version 2.3.4