Question

Annotating rice Affymetrix data from different GSE series through biomaRt

0

Entering edit mode

pratikshasharma47 • 0

@pratikshasharma47-20320

Last seen 5.0 years ago

I have merged 5 datasets from the same platform (GPL2025). How do I annotate the merged data sets for gene expression analysis?

annotation go • 729 views

ADD COMMENT • link updated 5.1 years ago by James W. MacDonald 65k • written 5.1 years ago by pratikshasharma47 • 0

score 1 · Answer 1 · 2019-03-27

> library(biomaRt)
> mart <- useEnsembl("plants_mart", "osativa_eg_gene",host = "plants.ensembl.org")
> z <- getGEO("GSE3053")[[1]]
<stuff happens>
> z
ExpressionSet (storageMode: lockedEnvironment)
assayData: 57381 features, 11 samples 
  element names: exprs 
protocolData: none
phenoData
  sampleNames: GSM67052 GSM67053 ... GSM67062 (11 total)
  varLabels: title geo_accession ... data_row_count (31 total)
  varMetadata: labelDescription
featureData
  featureNames: AFFX-BioB-3_at AFFX-BioB-5_at ... RPTR-Os-XXU09476-1_at
    (57381 total)
  fvarLabels: ID GB_ACC ... Gene Ontology Molecular Function (16 total)
  fvarMetadata: Column Description labelDescription
experimentData: use 'experimentData(object)'
  pubMedIds: 16183841 
Annotation: GPL2025 

## get data from biomaRt
> annot <- getBM(c("affy_rice","ensembl_gene_id","entrezgene","external_gene_name"), "affy_rice",featureNames(z), mart)

## re-order to match ExpressionSet

> annot2 <- data.frame(PROBEID = featureNames(z), annot[match(featureNames(z), annot[,1]),])
> annot2[290:300,]
               PROBEID          affy_rice ensembl_gene_id entrezgene
365   Os.10159.1.S1_at   Os.10159.1.S1_at    Os03g0844100    4334756
109   Os.10160.1.S1_at   Os.10160.1.S1_at    Os05g0514300    4339307
62    Os.10162.1.S1_at   Os.10162.1.S1_at    Os01g0384800    4325431
26    Os.10164.1.S1_at   Os.10164.1.S1_at    Os09g0527900    4347644
283   Os.10166.1.S1_at   Os.10166.1.S1_at    Os04g0494100    4336265
138   Os.10167.1.S1_at   Os.10167.1.S1_at    Os02g0779500         NA
140 Os.10167.1.S1_s_at Os.10167.1.S1_s_at    Os02g0779500         NA
139 Os.10167.1.S1_x_at Os.10167.1.S1_x_at    Os02g0779500         NA
28    Os.10168.1.S1_at   Os.10168.1.S1_at    Os12g0438000    4352128
208   Os.10169.1.S1_at   Os.10169.1.S1_at    Os06g0647400    4341667
213   Os.10171.1.S1_at   Os.10171.1.S1_at    Os03g0103100    4331301
                                                              external_gene_name
365                                         RECEPTOR-LIKE CYTOPLASMIC KINASE 123
109               tubby-like protein 10, tubby-like protein 9, F-box protein 267
62                                                                              
26  B-box-containing protein 29, DOUBLE B-BOX zinc finger gene 1, DOUBLE B-BOX 1
283                                                                  CHITINASE 5
138                                                                             
140                                                                             
139                                                                             
28                                                                              
208       Lysosomal Pro-x Carboxypeptidase 2, LYSOSOMAL PRO-X CARBOXYPEPTIDASE 2
213                                    hybrid proline- or glycine-rich protein 3
## extract existing fData and swap new in
> fd <- fData(z)
> fData(z) <- annot2

And do note that there are useful data in the fData slot to begin with:

> fd[290:300,8]
 [1] "AK067164.1" "AK061747.1" "AK119688.1" "AK122172.1" "AB096140.1"
 [6] "AK063877.1" "AK063877.1" "AK063877.1" "AK071511.1" "AK068457.1"
[11] "AY466108.1"

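These accession numbers may be useful, because they can be used as keys (keytype "ACCNUM") to look up Entrez Gene IDs, symbols and gene names in an OrgDb package from AnnotationHub: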

> library(AnnotationHub)
> hub <- AnnotationHub()
> query(hub, c("sativa","OrgDb"))

            title                                               
  AH66184 | org.Camelina_sativa.eg.sqlite                       
  AH66238 | org.Lactuca_sativa.eg.sqlite                        
  AH66306 | org.Oryza_sativa_(japonica_cultivar-group).eg.sqlite
  AH66307 | org.Oryza_sativa_Japonica_Group.eg.sqlite           
  AH66308 | org.Oryza_sativa_subsp._japonica.eg.sqlite          
> zz <- hub[["AH66307"]]
> ids <- fd[,8]
> annot3 <- select(zz, ids, c("ENTREZID","SYMBOL","GENENAME"), "ACCNUM")
'select()' returned many:1 mapping between keys and columns
> annot3[290:300,]
        ACCNUM ENTREZID     SYMBOL                            GENENAME
290 AK067164.1  4334756 LOC4334756 PTI1-like tyrosine-protein kinase 1
291 AK061747.1  4339307 LOC4339307          tubby-like F-box protein 9
292 AK119688.1  4325431 LOC4325431          uncharacterized LOC4325431
293 AK122172.1  4347644 LOC4347644        B-box zinc finger protein 18
294 AB096140.1  4336265 LOC4336265                    chitinase 5-like
295 AK063877.1     <NA>       <NA>                                <NA>
296 AK063877.1     <NA>       <NA>                                <NA>
297 AK063877.1     <NA>       <NA>                                <NA>
298 AK071511.1  4352128 LOC4352128              probable histone H2A.7
299 AK068457.1  4341667 LOC4341667    lysosomal Pro-X carboxypeptidase
300 AY466108.1  4331301 LOC4331301   cortical cell-delineating protein

And do note that putting the annotation into the fData slot ensures that limma will use those data when generating topTable output.