Hi,
I am trying to analyse and RNASeq data set using DeSeq2. I would like to have the annotated genes in heat maps and also for making a HTML report using ReportingTools, but I am not sure how to pass an already annotated matrix to DESeqDataSetFromMatrix() or how to annotate the results after I call results() on the DeSeq object.
Here are my code and the output:
conddata<-read.csv("Pheno_data_clean.csv", row.names = 1)
conddata
count_matrix<-read.table('Summary/Raw_data_for_genes.csv', sep=',', header=TRUE, row.names = 1)
rawsamples<-c("JK1","JK2","JK3","neg1","neg2","neg3","pos1","pos2","pos3")
count_matrix<-count_matrix[,rawsamples]
annot_matrix<-count_matrixt[,c('EntrezGene','JK1', 'JK2', 'JK3', 'neg1', 'neg2', 'neg3', 'pos1', 'pos2', 'pos3' )]
data_counts<-DESeqDataSetFromMatrix(countData = count_matrix,
colData = conddata,
design = ~Treatment)
data_counts$EntrezGene<-annot_matrix$EntrezGene
data_counts$Treatment <- relevel(data_counts$Treatment, ref="Control")
head(data_counts)
#Perform differential expression analysis
library("AnnotationDbi")
library("org.Ce.eg.db")
data_counts<- DESeq(data_counts)
res<-results(data_counts)
columns(org.Ce.eg.db)
res$symbol <- mapIds(org.Ce.eg.db,
keys=row.names(res),
column="SYMBOL",
keytype="ENSEMBLTRANS",
multiVals="first")
res$entrez <- mapIds(org.Ce.eg.db,
keys=row.names(res),
column="ENTREZID",
keytype="ENSEMBL",
multiVals="first")
rownames(data_counts)
> conddata<-read.csv("Pheno_data_clean.csv", row.names = 1)
> conddata
Strain Treatment
JK1 JK1 PDXJK
JK2 JK2 PDXJK
JK3 JK3 PDXJK
neg1 neg1 Control
neg2 neg2 Control
neg3 neg3 Control
pos1 pos1 UV
pos2 pos2 UV
pos3 pos3 UV
> count_matrix<-read.table('Summary/Raw_data_for_genes.csv', sep=',', header=TRUE, row.names = 1)
> rawsamples<-c("JK1","JK2","JK3","neg1","neg2","neg3","pos1","pos2","pos3")
> count_matrix<-count_matrix[,rawsamples]
> annot_matrix<-count_matrixt[,c('EntrezGene','JK1', 'JK2', 'JK3', 'neg1', 'neg2', 'neg3', 'pos1', 'pos2', 'pos3' )]
> data_counts<-DESeqDataSetFromMatrix(countData = count_matrix,
+ colData = conddata,
+ design = ~Treatment)
> data_counts$Treatment <- relevel(data_counts$Treatment, ref="Control")
> library("AnnotationDbi")
> library("org.Ce.eg.db")
> data_counts<- DESeq(data_counts)
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
> res<-results(data_counts)
> columns(org.Ce.eg.db)
[1] "ACCNUM" "ALIAS" "ENSEMBL" "ENSEMBLPROT"
[5] "ENSEMBLTRANS" "ENTREZID" "ENZYME" "EVIDENCE"
[9] "EVIDENCEALL" "GENENAME" "GO" "GOALL"
[13] "ONTOLOGY" "ONTOLOGYALL" "PATH" "PMID"
[17] "REFSEQ" "SYMBOL" "UNIGENE" "UNIPROT"
[21] "WORMBASE"
> res$symbol <- mapIds(org.Ce.eg.db,
+ keys=row.names(res),
+ column="SYMBOL",
+ keytype="ENSEMBLTRANS",
+ multiVals="first")
Error in .testForValidKeys(x, keys, keytype, fks) :
None of the keys entered are valid keys for 'ENSEMBLTRANS'. Please use the keys method to see a listing of valid arguments. #I guess my keys are in the wrong format, if I call row names() on data_counts I get info such as: transcript:F31C3.9
> res$entrez <- mapIds(org.Ce.eg.db,
+ keys=row.names(res),
+ column="ENTREZID",
+ keytype="ENSEMBL",
+ multiVals="first")
Error in .testForValidKeys(x, keys, keytype, fks) :
None of the keys entered are valid keys for 'ENSEMBL'. Please use the keys method to see a listing of valid arguments.
> biocLite("org.Ce.eg.db")
BioC_mirror: https://bioconductor.org
Using Bioconductor 3.5 (BiocInstaller 1.26.0), R 3.4.1 (2017-06-30).
Installing package(s) 'org.Ce.eg.db'
installing the source package 'org.Ce.eg.db'
trying URL 'https://bioconductor.org/packages/3.5/data/annotation/src/contrib/org.Ce.eg.db_3.4.1.tar.gz'
Content type 'application/x-gzip' length 19307427 bytes (18.4 MB)
==================================================
downloaded 18.4 MB
* installing *source* package ‘org.Ce.eg.db’ ...
** R
** inst
** preparing package for lazy loading
** help
*** installing help indices
** building package indices
** testing if installed package can be loaded
* DONE (org.Ce.eg.db)
The downloaded source packages are in
'/private/var/folders/d4/yy86_b6109d2pm8pvznv6qrh0000gn/T/RtmpJ6dj0i/downloaded_packages'
> library("org.Ce.eg.db")
> desReport<-HTMLReport(shortName='RNASeqAnalysis', title = 'RNASeq_VitB6', reportDirectory = "./reports")
> publish(res, desReport, name="df", countTable=data_counts, pvalueCutoff=0.05, conditions=Treatment, annotation.db="org.Ce.eg", expName="deseq", reportDir="./reports", .modifyDF=makeDESeqDF)
Error: Ids do not appear to be Entrez Ids for the specified species. #I understand that the error here is because res does not have Entrez IDs, but I am really not sure how to pass them
> rownames(data_counts)
[1] "transcript:F31E3.5.1" "transcript:F31C3.9"
[3] "transcript:Y82E9BR.3.3" "transcript:K02B2.5"
[5] "transcript:F02A9.3.1" "transcript:F54D7.7"
R version 3.4.1 (2017-06-30)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS Sierra 10.12.5
Matrix products: default
BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
Thank you very much, let me know if there is anything I missed. I just need to know how to have the annotations in all the heat maps/graphs/reports.
Best,
Alex

Hi,
Thank you for your reply and for correcting the tags. I was aware that my keys were in a different format (or so I think)- e.g. "transcript:F31C3.9" instead of something like "B0240b". My problem is not being able to add Entrez Gene IDs to the DeSeq object.
The initial raw count table I pass to the DESeqDataSetFromMatrix comes from Ballgown and has the transcript ID as the ID column (the first one), so this is what is passed into a DeSeq object along with my 9 samples.
In short, I need to publish a report of differential gene expression of the annotated counts and I am having troubles with the annotating and the publishing. I'm very new to this method (and fairly new-ish to R, sorry if I'm missing some obvious things).
Thanks a lot!
Here is a more explicit example.
> ids <- c("transcript:F31E3.5.1", "transcript:F31C3.9", "transcript:Y82E9BR.3.3", "transcript:K02B2.5", "transcript:F02A9.3.1", "transcript:F54D7.7") > ids <- gsub("transcript:", "", ids) > ids [1] "F31E3.5.1" "F31C3.9" "Y82E9BR.3.3" "K02B2.5" "F02A9.3.1" [6] "F54D7.7" > select(org.Ce.eg.db, ids, "ENTREZID", "ENSEMBLTRANS") Error in .testForValidKeys(x, keys, keytype, fks) : None of the keys entered are valid keys for 'ENSEMBLTRANS'. Please use the keys method to see a listing of valid arguments. > ids <- sapply(strsplit(ids, "\\."), "[", 1) > ids [1] "F31E3" "F31C3" "Y82E9BR" "K02B2" "F02A9" "F54D7" > select(org.Ce.eg.db, ids, "ENTREZID", "ENSEMBLTRANS") 'select()' returned 1:many mapping between keys and columns ENSEMBLTRANS ENTREZID 1 F31E3 3565545 2 F31E3 175975 3 F31E3 175976 4 F31E3 175974 5 F31E3 185154 6 F31C3 173374 7 F31C3 173375 8 F31C3 173376 9 F31C3 185146 10 Y82E9BR 175302 11 Y82E9BR 190766 12 Y82E9BR 175299 13 Y82E9BR 190767 14 Y82E9BR 175296 15 Y82E9BR 190768 16 Y82E9BR 190769 17 Y82E9BR 190770 18 Y82E9BR 190771 19 K02B2 177365 20 K02B2 177363 21 K02B2 177362 22 K02B2 186865 23 F02A9 176284 24 F02A9 176286 25 F02A9 184071 26 F02A9 3564852 27 F54D7 172096 28 F54D7 172097One problem here is that there are multiple Entrez Gene IDs for each of the Ensembl transcript IDs. This is sort of the nature of the beast, however. NCBI and EBI don't always agree on things, so mapping IDs between the two groups can be, um, painful.