Entering edit mode
Hello, I am new to bioinformatic analyses and I am trying to analyse the TCGA dataset to plot a survival curve based on the expression of a gene of interest (say TP53). I have written the following code to analyse the TCGA data, but I am unable to proceed further in my attempt to plot a KM plot for survival for my gene, in this case TP53. Following is the code I have written so far. I would be grateful if someone can help me proceed with my analysis.
Thank you for your valuable time and advice. Manav
Code should be placed in three backticks as shown below
# Load Packages
library(TCGAbiolinks)
library(SummarizedExperiment)
library(EDASeq)
library(edgeR)
library(survminer)
# setwd()
# Build a query to retrieve gene expression data of required samples
query <- GDCquery(project = "TCGA-PAAD",
data.category = "Transcriptome Profiling",
data.type = "Gene Expression Quantification",
workflow.type = "STAR - Counts")
samplesDown <- getResults(query,cols=c("cases"))
# Selection of tumor samples "TP"
dataTP <- TCGAquery_SampleTypes(barcode = samplesDown,typesample = "TP")
# Selection of normal samples "NT"
dataNT <- TCGAquery_SampleTypes(barcode = samplesDown,typesample = "NT")
query.selected.samples <- GDCquery(project = "TCGA-PAAD",
data.category = "Transcriptome Profiling",
data.type = "Gene Expression Quantification",
workflow.type = "STAR - Counts",
barcode = c(dataTP, dataNT))
# Download data
GDCdownload(query = query.selected.samples)
# Prepare expression matrix with geneID in the rows and samples (barcode) in the columns
dataPrep <- GDCprepare(query = query.selected.samples,
save = TRUE,
save.filename = "TCGA_PAAD.rda")
paad_matrix <- assay(dataPrep, 'fpkm_unstrand')
# For gene expression if you need to see a boxplot correlation and AAIC plot to define outliers you can run
dataPrep <- TCGAanalyze_Preprocessing(object = dataPrep,
cor.cut = 0.6)
# Normalization of genes
dataNorm <- TCGAanalyze_Normalization(tabDF = dataPrep,
geneInfo = geneInfoHT,
method = "gcContent")
boxplot(dataPrep, outline = FALSE)
boxplot(dataNorm, outline = FALSE)
# Quantile filter of genes
dataFilt <- TCGAanalyze_Filtering(tabDF = dataNorm,
method = "quantile",
qnt.cut = 0.25)
# Differential Expression Analysis (DEA)
dataDEGs_0.01 <- TCGAanalyze_DEA(mat1 = dataFilt[,dataTP],
mat2 = dataFilt[,dataNT],
Cond1type = "Normal",
Cond2type = "Tumor",
fdr.cut = 0.01,
logFC.cut = 1,
method = "glmLRT")
dataDEGs_0.05 <- TCGAanalyze_DEA(mat1 = dataFilt[,dataTP],
mat2 = dataFilt[,dataNT],
Cond1type = "Normal",
Cond2type = "Tumor",
fdr.cut = 0.05,
logFC.cut = 1,
method = "glmLRT")
# DEGs table with expression values in normal and tumor samples
dataDEGsFiltLevel_0.01 <- TCGAanalyze_LevelTab(FC_FDR_table_mRNA = dataDEGs_0.01,
typeCond1 = "Tumor",
typeCond2 = "Normal",
TableCond1 = dataFilt[,dataNT],
TableCond2 = dataFilt[,dataTP])
dataDEGsFiltLevel_0.05 <- TCGAanalyze_LevelTab(FC_FDR_table_mRNA = dataDEGs_0.05,
typeCond1 = "Tumor",
typeCond2 = "Normal",
TableCond1 = dataFilt[,dataNT],
TableCond2 = dataFilt[,dataTP])
sessionInfo( )
R version 4.2.1 (2022-06-23 ucrt)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 19045)
Matrix products: default
locale:
[1] LC_COLLATE=English_United States.utf8 LC_CTYPE=English_United States.utf8
[3] LC_MONETARY=English_United States.utf8 LC_NUMERIC=C
[5] LC_TIME=English_United States.utf8
attached base packages:
[1] stats4 stats graphics grDevices utils datasets methods base
other attached packages:
[1] edgeR_3.38.4 limma_3.52.4 EDASeq_2.30.0
[4] ShortRead_1.54.0 GenomicAlignments_1.32.1 Rsamtools_2.12.0
[7] Biostrings_2.64.1 XVector_0.36.0 BiocParallel_1.30.4
[10] SummarizedExperiment_1.26.1 Biobase_2.56.0 GenomicRanges_1.48.0
[13] GenomeInfoDb_1.32.4 IRanges_2.30.1 S4Vectors_0.34.0
[16] BiocGenerics_0.42.0 MatrixGenerics_1.8.1 matrixStats_0.63.0
[19] survminer_0.4.9 ggpubr_0.6.0 ggplot2_3.4.1
[22] survival_3.3-1 TCGAbiolinks_2.24.3
loaded via a namespace (and not attached):
[1] colorspace_2.1-0 rjson_0.2.21 ggsignif_0.6.4
[4] deldir_1.0-6 hwriter_1.3.2.1 ellipsis_0.3.2
[7] rstudioapi_0.14 bit64_4.0.5 AnnotationDbi_1.58.0
[10] fansi_1.0.4 xml2_1.3.3 codetools_0.2-18
[13] splines_4.2.1 R.methodsS3_1.8.2 cachem_1.0.6
[16] knitr_1.42 jsonlite_1.8.4 broom_1.0.3
[19] km.ci_0.5-6 dbplyr_2.3.1 png_0.1-8
[22] R.oo_1.25.0 readr_2.1.4 compiler_4.2.1
[25] httr_1.4.5 backports_1.4.1 Matrix_1.5-3
[28] fastmap_1.1.0 cli_3.6.0 prettyunits_1.1.1
[31] tools_4.2.1 gtable_0.3.1 glue_1.6.2
[34] GenomeInfoDbData_1.2.8 dplyr_1.1.0 rappdirs_0.3.3
[37] Rcpp_1.0.10 carData_3.0-5 vctrs_0.5.2
[40] rtracklayer_1.56.1 xfun_0.37 stringr_1.5.0
[43] rvest_1.0.3 lifecycle_1.0.3 restfulr_0.0.15
[46] rstatix_0.7.2 XML_3.99-0.13 zlibbioc_1.42.0
[49] zoo_1.8-11 scales_1.2.1 aroma.light_3.26.0
[52] hms_1.1.2 parallel_4.2.1 RColorBrewer_1.1-3
[55] yaml_2.3.7 curl_5.0.0 memoise_2.0.1
[58] gridExtra_2.3 KMsurv_0.1-5 downloader_0.4
[61] biomaRt_2.52.0 latticeExtra_0.6-30 stringi_1.7.12
[64] RSQLite_2.3.0 BiocIO_1.6.0 GenomicFeatures_1.48.4
[67] filelock_1.0.2 rlang_1.0.6 pkgconfig_2.0.3
[70] bitops_1.0-7 TCGAbiolinksGUI.data_1.16.0 lattice_0.20-45
[73] purrr_1.0.1 bit_4.0.5 tidyselect_1.2.0
[76] plyr_1.8.8 magrittr_2.0.3 R6_2.5.1
[79] generics_0.1.3 DelayedArray_0.22.0 DBI_1.1.3
[82] pillar_1.8.1 withr_2.5.0 KEGGREST_1.36.3
[85] abind_1.4-5 RCurl_1.98-1.10 tibble_3.1.8
[88] crayon_1.5.2 car_3.1-1 survMisc_0.5.6
[91] interp_1.1-3 utf8_1.2.3 BiocFileCache_2.4.0
[94] tzdb_0.3.0 jpeg_0.1-10 progress_1.2.2
[97] locfit_1.5-9.7 grid_4.2.1 data.table_1.14.8
[100] blob_1.2.3 digest_0.6.31 xtable_1.8-4
[103] tidyr_1.3.0 R.utils_2.12.2 munsell_0.5.0