Search
Question: Annotate gene counts
0
gravatar for Alexandra
3 months ago by
Alexandra0
Alexandra0 wrote:

 

Hi,

I am trying to analyse and RNASeq data set using DeSeq2. I would like to have the annotated genes in heat maps and also for making a HTML report using ReportingTools, but I am not sure how to pass an already annotated matrix to DESeqDataSetFromMatrix() or how to annotate the results after I call results() on the DeSeq object. 

Here are my code and the output:

conddata<-read.csv("Pheno_data_clean.csv", row.names = 1)
conddata

count_matrix<-read.table('Summary/Raw_data_for_genes.csv', sep=',', header=TRUE, row.names = 1)

rawsamples<-c("JK1","JK2","JK3","neg1","neg2","neg3","pos1","pos2","pos3")

count_matrix<-count_matrix[,rawsamples]
annot_matrix<-count_matrixt[,c('EntrezGene','JK1', 'JK2', 'JK3', 'neg1', 'neg2', 'neg3', 'pos1', 'pos2', 'pos3' )]

data_counts<-DESeqDataSetFromMatrix(countData = count_matrix,
                                    colData = conddata,
                                    design = ~Treatment)

data_counts$EntrezGene<-annot_matrix$EntrezGene 

data_counts$Treatment <- relevel(data_counts$Treatment, ref="Control")

head(data_counts)

#Perform differential expression analysis
library("AnnotationDbi")
library("org.Ce.eg.db")
data_counts<- DESeq(data_counts)
res<-results(data_counts)
columns(org.Ce.eg.db)


res$symbol <- mapIds(org.Ce.eg.db, 
                     keys=row.names(res), 
                     column="SYMBOL", 
                     keytype="ENSEMBLTRANS",
                     multiVals="first")
res$entrez <- mapIds(org.Ce.eg.db, 
                     keys=row.names(res), 
                     column="ENTREZID", 
                     keytype="ENSEMBL",
                     multiVals="first")
rownames(data_counts)

> conddata<-read.csv("Pheno_data_clean.csv", row.names = 1)
> conddata
     Strain Treatment
JK1     JK1     PDXJK
JK2     JK2     PDXJK
JK3     JK3     PDXJK
neg1   neg1   Control
neg2   neg2   Control
neg3   neg3   Control
pos1   pos1        UV
pos2   pos2        UV
pos3   pos3        UV
> count_matrix<-read.table('Summary/Raw_data_for_genes.csv', sep=',', header=TRUE, row.names = 1)
> rawsamples<-c("JK1","JK2","JK3","neg1","neg2","neg3","pos1","pos2","pos3")
> count_matrix<-count_matrix[,rawsamples]
> annot_matrix<-count_matrixt[,c('EntrezGene','JK1', 'JK2', 'JK3', 'neg1', 'neg2', 'neg3', 'pos1', 'pos2', 'pos3' )]
> data_counts<-DESeqDataSetFromMatrix(countData = count_matrix,
+                                     colData = conddata,
+                                     design = ~Treatment)
> data_counts$Treatment <- relevel(data_counts$Treatment, ref="Control")
> library("AnnotationDbi")
> library("org.Ce.eg.db")
> data_counts<- DESeq(data_counts)
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
> res<-results(data_counts)
> columns(org.Ce.eg.db)
 [1] "ACCNUM"       "ALIAS"        "ENSEMBL"      "ENSEMBLPROT" 
 [5] "ENSEMBLTRANS" "ENTREZID"     "ENZYME"       "EVIDENCE"    
 [9] "EVIDENCEALL"  "GENENAME"     "GO"           "GOALL"       
[13] "ONTOLOGY"     "ONTOLOGYALL"  "PATH"         "PMID"        
[17] "REFSEQ"       "SYMBOL"       "UNIGENE"      "UNIPROT"     
[21] "WORMBASE"    
> res$symbol <- mapIds(org.Ce.eg.db, 
+                      keys=row.names(res), 
+                      column="SYMBOL", 
+                      keytype="ENSEMBLTRANS",
+                      multiVals="first")
Error in .testForValidKeys(x, keys, keytype, fks) : 
  None of the keys entered are valid keys for 'ENSEMBLTRANS'. Please use the keys method to see a listing of valid arguments.  #I guess my keys are in the wrong format, if I call row names() on data_counts I get info such as: transcript:F31C3.9
> res$entrez <- mapIds(org.Ce.eg.db, 
+                      keys=row.names(res), 
+                      column="ENTREZID", 
+                      keytype="ENSEMBL",
+                      multiVals="first")
Error in .testForValidKeys(x, keys, keytype, fks) : 
  None of the keys entered are valid keys for 'ENSEMBL'. Please use the keys method to see a listing of valid arguments.
> biocLite("org.Ce.eg.db")
BioC_mirror: https://bioconductor.org
Using Bioconductor 3.5 (BiocInstaller 1.26.0), R 3.4.1 (2017-06-30).
Installing package(s) 'org.Ce.eg.db'
installing the source package 'org.Ce.eg.db'

trying URL 'https://bioconductor.org/packages/3.5/data/annotation/src/contrib/org.Ce.eg.db_3.4.1.tar.gz'
Content type 'application/x-gzip' length 19307427 bytes (18.4 MB)
==================================================
downloaded 18.4 MB

* installing *source* package ‘org.Ce.eg.db’ ...
** R
** inst
** preparing package for lazy loading
** help
*** installing help indices
** building package indices
** testing if installed package can be loaded
* DONE (org.Ce.eg.db)

The downloaded source packages are in
    '/private/var/folders/d4/yy86_b6109d2pm8pvznv6qrh0000gn/T/RtmpJ6dj0i/downloaded_packages'
> library("org.Ce.eg.db")
> desReport<-HTMLReport(shortName='RNASeqAnalysis', title = 'RNASeq_VitB6', reportDirectory = "./reports")
> publish(res, desReport, name="df", countTable=data_counts, pvalueCutoff=0.05, conditions=Treatment, annotation.db="org.Ce.eg", expName="deseq", reportDir="./reports", .modifyDF=makeDESeqDF)
Error: Ids do not appear to be Entrez Ids for the specified species.  #I understand that the error here is because res does not have Entrez IDs, but I am really not sure how to pass them
> rownames(data_counts)
   [1] "transcript:F31E3.5.1"     "transcript:F31C3.9"      
   [3] "transcript:Y82E9BR.3.3"   "transcript:K02B2.5"      
   [5] "transcript:F02A9.3.1"     "transcript:F54D7.7"    

 

R version 3.4.1 (2017-06-30)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS Sierra 10.12.5

Matrix products: default
BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib

Thank you very much, let me know if there is anything I missed. I just need to know how to have the annotations in all the heat maps/graphs/reports.

Best,

Alex

ADD COMMENTlink modified 3 months ago by Michael Love14k • written 3 months ago by Alexandra0
1
gravatar for Michael Love
3 months ago by
Michael Love14k
United States
Michael Love14k wrote:

Hi, 

This isn't a question about the DESeq2 package but about using the annotation packages in Bioconductor. All the code about DESeq2 isn't necessary to dive into the problem. So I'm going to remove the "DESeq2" tag and add "AnnotationDbi", which is the package you are using to look up annotations. You can see that package vignette for instructions on using annotations in Bioconductor.

Note if you take a look at the first error you got, it is informative:

None of the keys entered are valid keys for 'ENSEMBLTRANS'. 
Please use the keys method to see a listing of valid arguments.

You can compare what you provided as keys, with what the annotation package has as keys, and see where to go from there.

head(row.names(res))

head(keys(org.Ce.eg.db, keytype="ENSEMBLTRANS"))

 

ADD COMMENTlink written 3 months ago by Michael Love14k

Hi,

Thank you for your reply and for correcting the tags. I was aware that my keys were in a different format (or so I think)- e.g. "transcript:F31C3.9" instead of something like "B0240b". My problem is not being able to add Entrez Gene IDs to the DeSeq object.

The initial raw count table I pass to the DESeqDataSetFromMatrix comes from Ballgown and has the transcript ID as the ID column (the first one), so this is what is passed into a DeSeq object along with my 9 samples. 

In short, I need to publish a report of differential gene expression of the annotated counts and I am having troubles with the annotating and the publishing. I'm very new to this method (and fairly new-ish to R, sorry if I'm missing some obvious things).

Thanks a lot!

ADD REPLYlink written 3 months ago by Alexandra0

Here is a more explicit example.

> ids <- c("transcript:F31E3.5.1",  "transcript:F31C3.9", "transcript:Y82E9BR.3.3",   "transcript:K02B2.5",  "transcript:F02A9.3.1",     "transcript:F54D7.7")
> ids <- gsub("transcript:", "", ids)
> ids
[1] "F31E3.5.1"   "F31C3.9"     "Y82E9BR.3.3" "K02B2.5"     "F02A9.3.1"  
[6] "F54D7.7"    
> select(org.Ce.eg.db, ids, "ENTREZID", "ENSEMBLTRANS")
Error in .testForValidKeys(x, keys, keytype, fks) :
  None of the keys entered are valid keys for 'ENSEMBLTRANS'. Please use the keys method to see a listing of valid arguments.
> ids <- sapply(strsplit(ids, "\\."), "[", 1)
> ids
[1] "F31E3"   "F31C3"   "Y82E9BR" "K02B2"   "F02A9"   "F54D7"  
> select(org.Ce.eg.db, ids, "ENTREZID", "ENSEMBLTRANS")
'select()' returned 1:many mapping between keys and columns
   ENSEMBLTRANS ENTREZID
1         F31E3  3565545
2         F31E3   175975
3         F31E3   175976
4         F31E3   175974
5         F31E3   185154
6         F31C3   173374
7         F31C3   173375
8         F31C3   173376
9         F31C3   185146
10      Y82E9BR   175302
11      Y82E9BR   190766
12      Y82E9BR   175299
13      Y82E9BR   190767
14      Y82E9BR   175296
15      Y82E9BR   190768
16      Y82E9BR   190769
17      Y82E9BR   190770
18      Y82E9BR   190771
19        K02B2   177365
20        K02B2   177363
21        K02B2   177362
22        K02B2   186865
23        F02A9   176284
24        F02A9   176286
25        F02A9   184071
26        F02A9  3564852
27        F54D7   172096
28        F54D7   172097

One problem here is that there are multiple Entrez Gene IDs for each of the Ensembl transcript IDs. This is sort of the nature of the beast, however. NCBI and EBI don't always agree on things, so mapping IDs between the two groups can be, um, painful.

ADD REPLYlink written 3 months ago by James W. MacDonald45k
Please log in to add an answer.

Help
Access

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.
Powered by Biostar version 2.2.0
Traffic: 160 users visited in the last hour